From db7b7396ff1cc98832396a57e8d3e76e0eebd5fa Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 28 Nov 2024 00:16:55 +0800
Subject: [PATCH 01/73] Make KEYS able to visit expired keys in import-source
 state (#1326)

After #1185, a client in import-source state can visit expired keys in
both read commands and write commands. This commit updates the
keyIsExpired function to handle the import-source state as well, so
that KEYS can also visit expired keys.

This is not particularly important, but it keeps the behavior
consistent. This commit also does some cleanup around the test and
verifies that the client can indeed visit the expired key.

Signed-off-by: Binbin
---
 src/db.c              | 10 ++++++++--
 src/networking.c      |  2 +-
 tests/unit/expire.tcl | 23 ++++++++++++++---------
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/db.c b/src/db.c
index d3ef19027d..3f6452c44c 100644
--- a/src/db.c
+++ b/src/db.c
@@ -390,7 +390,7 @@ robj *dbRandomKey(serverDb *db) {
         if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) {
             /* If the DB is composed only of keys with an expire set,
              * it could happen that all the keys are already logically
-             * expired in the repilca, so the function cannot stop because
+             * expired in the replica, so the function cannot stop because
              * expireIfNeeded() is false, nor it can stop because
              * dictGetFairRandomKey() returns NULL (there are keys to return).
              * To prevent the infinite loop we do some tries, but if there
@@ -1808,7 +1808,13 @@ int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) {
 /* Check if the key is expired. */
 int keyIsExpired(serverDb *db, robj *key) {
     int dict_index = getKVStoreIndexForKey(key->ptr);
-    return keyIsExpiredWithDictIndex(db, key, dict_index);
+    if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return 0;
+
+    /* See expireIfNeededWithDictIndex for more details. */
+    if (server.primary_host == NULL && server.import_mode) {
+        if (server.current_client && server.current_client->flag.import_source) return 0;
+    }
+    return 1;
 }
 
 keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) {
diff --git a/src/networking.c b/src/networking.c
index 01aaa48148..97479967f6 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -3617,7 +3617,7 @@ void clientCommand(client *c) {
             "NO-TOUCH (ON|OFF)",
             "    Will not touch LRU/LFU stats when this mode is on.",
             "IMPORT-SOURCE (ON|OFF)",
-            "    Mark this connection as an import source if server.import_mode is true.",
+            "    Mark this connection as an import source if import-mode is enabled.",
             "    Sync tools can set their connections into 'import-source' state to visit",
             "    expired keys.",
             NULL};
diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl
index fba425f62d..941acfad38 100644
--- a/tests/unit/expire.tcl
+++ b/tests/unit/expire.tcl
@@ -841,7 +841,7 @@ start_server {tags {"expire"}} {
         r set foo1 bar PX 1
         r set foo2 bar PX 1
-        after 100
+        after 10
 
         assert_equal [r dbsize] {2}
 
@@ -879,22 +879,27 @@ start_server {tags {"expire"}} {
         assert_equal [r debug set-active-expire 1] {OK}
     } {} {needs:debug}
 
-    test {RANDOMKEY can return expired key in import mode} {
+    test {Client can visit expired key in import-source state} {
         r flushall
 
         r config set import-mode yes
-        assert_equal [r client import-source on] {OK}
 
-        r set foo1 bar PX 1
+        r set foo1 1 PX 1
         after 10
 
-        set client [valkey [srv "host"] [srv "port"] 0 $::tls]
-        if {!$::singledb} {
-            $client select 9
-        }
-        assert_equal [$client ttl foo1] {-2}
+        # Normal clients cannot visit expired key.
+        assert_equal [r get foo1] {}
+        assert_equal [r ttl foo1] {-2}
+        assert_equal [r dbsize] 1
+
+        # Client can visit expired key when in import-source state.
+        assert_equal [r client import-source on] {OK}
+        assert_equal [r ttl foo1] {0}
+        assert_equal [r get foo1] {1}
+        assert_equal [r incr foo1] {2}
         assert_equal [r randomkey] {foo1}
+        assert_equal [r scan 0 match * count 10000] {0 foo1}
+        assert_equal [r keys *] {foo1}
 
         assert_equal [r client import-source off] {OK}
         r config set import-mode no

From a939cb88ee0c0512c003106be483b7c6968b3e7f Mon Sep 17 00:00:00 2001
From: Binbin
Date: Thu, 28 Nov 2024 14:10:48 +0800
Subject: [PATCH 02/73] Handle keyIsExpiredWithDictIndex to make it check for
 import mode (#1368)

In #1326 we made KEYS able to visit expired keys in import-source state
by updating keyIsExpired to check for import mode. But after #1205, we
now use keyIsExpiredWithDictIndex to optimize away the redundant
dict_index lookup, and keyIsExpiredWithDictIndex does not handle this
logic.

In this commit, we update keyIsExpiredWithDictIndex to check for import
mode as well, so that KEYS can visit expired keys.

Signed-off-by: Binbin
---
 src/db.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/db.c b/src/db.c
index 3f6452c44c..3c3ccb4899 100644
--- a/src/db.c
+++ b/src/db.c
@@ -1789,7 +1789,7 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) {
     decrRefCount(argv[1]);
 }
 
-int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) {
+static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) {
     /* Don't expire anything while loading. It will be done later. */
     if (server.loading) return 0;
 
@@ -1806,9 +1806,8 @@ int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) {
 }
 
 /* Check if the key is expired. */
-int keyIsExpired(serverDb *db, robj *key) {
-    int dict_index = getKVStoreIndexForKey(key->ptr);
-    return keyIsExpiredWithDictIndex(db, key, dict_index);
+int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) {
+    if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0;
 
     /* See expireIfNeededWithDictIndex for more details. */
     if (server.primary_host == NULL && server.import_mode) {
@@ -1817,9 +1816,15 @@
     return 1;
 }
 
+/* Check if the key is expired. */
+int keyIsExpired(serverDb *db, robj *key) {
+    int dict_index = getKVStoreIndexForKey(key->ptr);
+    return keyIsExpiredWithDictIndex(db, key, dict_index);
+}
+
 keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) {
     if (server.lazy_expire_disabled) return KEY_VALID;
-    if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return KEY_VALID;
+    if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID;
 
     /* If we are running in the context of a replica, instead of
      * evicting the expired key from the database, we return ASAP:

From fd58f8d0585a3e558fbb837c2302ef51dc8d1810 Mon Sep 17 00:00:00 2001
From: zvi-code <54795925+zvi-code@users.noreply.github.com>
Date: Thu, 28 Nov 2024 17:27:00 +0200
Subject: [PATCH 03/73] Disable lazy free in defrag test to fix 32bit daily
 failure (#1370)

Signed-off-by: Zvi Schneider
Co-authored-by: Zvi Schneider
---
 tests/unit/memefficiency.tcl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index d5a6a6efe2..67329f03f1 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -720,11 +720,11 @@ run_solo {defrag} {
         }
     }
 
-    start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} {
+    start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} {
         test_active_defrag "cluster"
     }
 
-    start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} {
+    start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} {
         test_active_defrag "standalone"
     }
 } ;# run_solo

From 4695d118dd6126b9b4f3e3415198df398e8bbb79 Mon Sep 17 00:00:00 2001
From: zhenwei pi
Date: Fri, 29 Nov 2024 18:13:34 +0800
Subject: [PATCH 04/73] RDMA builtin support (#1209)

There are several patches in this PR:

* Abstract set/rewrite config bind option: `bind` is a special config;
  `socket` and `tls` use the same one, while RDMA uses a similar style
  but a different option. Add a small abstraction to make it flexible
  for both `socket` and `RDMA` (and even for QUIC in the future).
* Introduce closeListener for connection type: closing a socket is a
  simple syscall, but RDMA has more complex logic. Introduce a
  connection-type-specific close listener method.
* RDMA: Use valkey.conf style instead of module parameters: use the
  `--rdma-bind` and `--rdma-port` style instead of module parameters.
  The module style configs `rdma.bind` and `rdma.port` are removed.
* RDMA: Support builtin: support `make BUILD_RDMA=yes`. The module
  style is still kept for now (see the usage sketch below).
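For illustration, a minimal usage sketch of the builtin mode. The
address is an example and the commands mirror the README changes in
this patch; the client invocation is illustrative, not part of the
change:

```
# Build with RDMA support compiled into the server, then listen on RDMA:
make BUILD_RDMA=yes
./src/valkey-server --protected-mode no \
    --rdma-bind 192.168.122.100 --rdma-port 6379

# The listener now follows the normal config conventions, so it can be
# reconfigured at runtime:
./src/valkey-cli -h 192.168.122.100 config set rdma-port 6380
```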
Signed-off-by: zhenwei pi --- README.md | 26 +++- cmake/Modules/SourceFiles.cmake | 1 + cmake/Modules/ValkeySetup.cmake | 29 +++-- src/CMakeLists.txt | 2 +- src/Makefile | 30 ++--- src/config.c | 114 +++++++++++++--- src/connection.c | 3 + src/connection.h | 10 ++ src/rdma.c | 222 ++++++++------------------------ src/server.c | 28 ++-- src/server.h | 13 +- src/socket.c | 14 ++ src/tls.c | 5 + src/unix.c | 5 + tests/rdma/run.py | 2 +- tests/unit/introspection.tcl | 4 + valkey.conf | 48 +++++++ 17 files changed, 314 insertions(+), 242 deletions(-) diff --git a/README.md b/README.md index a32ac255df..c447cc8d47 100644 --- a/README.md +++ b/README.md @@ -37,8 +37,13 @@ To build TLS as Valkey module: Note that sentinel mode does not support TLS module. To build with experimental RDMA support you'll need RDMA development libraries -(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). For now, Valkey only -supports RDMA as connection module mode. Run: +(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). + +To build RDMA support as Valkey built-in: + + % make BUILD_RDMA=yes + +To build RDMA as Valkey module: % make BUILD_RDMA=module @@ -203,20 +208,27 @@ Note that Valkey Over RDMA is an experimental feature. It may be changed or removed in any minor or major version. Currently, it is only supported on Linux. -To manually run a Valkey server with RDMA mode: +* RDMA built-in mode: + ``` + ./src/valkey-server --protected-mode no \ + --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` - % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 +* RDMA module mode: + ``` + ./src/valkey-server --protected-mode no \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` It's possible to change bind address/port of RDMA by runtime command: - 192.168.122.100:6379> CONFIG SET rdma.port 6380 + 192.168.122.100:6379> CONFIG SET rdma-port 6380 It's also possible to have both RDMA and TCP available, and there is no conflict of TCP(6379) and RDMA(6379), Ex: % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 \ --port 6379 Note that the network card (192.168.122.100 of this example) should support diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index 873229d6f0..c34ae644a2 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -88,6 +88,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/tracking.c ${CMAKE_SOURCE_DIR}/src/socket.c ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/rdma.c ${CMAKE_SOURCE_DIR}/src/sha256.c ${CMAKE_SOURCE_DIR}/src/timeout.c ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake index 4fafd07910..8a4d4da1c9 100644 --- a/cmake/Modules/ValkeySetup.cmake +++ b/cmake/Modules/ValkeySetup.cmake @@ -208,25 +208,30 @@ if (BUILD_RDMA) # RDMA support (Linux only) if (LINUX AND NOT APPLE) valkey_parse_build_option(${BUILD_RDMA} USE_RDMA) + find_package(PkgConfig REQUIRED) + # Locate librdmacm & libibverbs, fail if we can't find them + valkey_pkg_config(librdmacm RDMACM_LIBS) + valkey_pkg_config(libibverbs IBVERBS_LIBS) + message(STATUS "${RDMACM_LIBS};${IBVERBS_LIBS}") + list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") + if (USE_RDMA EQUAL 2) # Module message(STATUS "Building RDMA as module") 
add_valkey_server_compiler_options("-DUSE_RDMA=2") - - # Locate librdmacm & libibverbs, fail if we can't find them - valkey_pkg_config(librdmacm RDMACM_LIBS) - valkey_pkg_config(libibverbs IBVERBS_LIBS) - - list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") - set(BUILD_RDMA_MODULE 1) - elseif (USE_RDMA EQUAL 1) - # RDMA can only be built as a module. So disable it - message(WARNING "BUILD_RDMA can be one of: [NO | 0 | MODULE], but '${BUILD_RDMA}' was provided") - message(STATUS "RDMA build is disabled") - set(USE_RDMA 0) + set(BUILD_RDMA_MODULE 2) + elseif (USE_RDMA EQUAL 1) # Builtin + message(STATUS "Building RDMA as builtin") + add_valkey_server_compiler_options("-DUSE_RDMA=1") + add_valkey_server_compiler_options("-DBUILD_RDMA_MODULE=0") + list(APPEND SERVER_LIBS "${RDMA_LIBS}") endif () else () message(WARNING "RDMA is only supported on Linux platforms") endif () +else () + # By default, RDMA is disabled + message(STATUS "RDMA is disabled") + set(USE_RDMA 0) endif () set(BUILDING_ARM64 0) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 51e1b5a2e6..b87dff3db0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,7 +55,7 @@ if (BUILD_RDMA_MODULE) set(MODULE_NAME "valkey-rdma") message(STATUS "Building RDMA module") add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") - target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE -DUSE_RDMA=1) + target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE=2 -DUSE_RDMA=1) target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") # remove the "lib" prefix from the module set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") diff --git a/src/Makefile b/src/Makefile index 0cbf5763cb..3b4ad0a2ef 100644 --- a/src/Makefile +++ b/src/Makefile @@ -325,26 +325,26 @@ ifeq ($(BUILD_TLS),module) TLS_MODULE_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_MODULE) endif -BUILD_RDMA:=no -RDMA_MODULE= -RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so -RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) -ifeq ($(BUILD_RDMA),module) - FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) - RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) +RDMA_LIBS= +RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) 
ifeq ($(RDMA_PKGCONFIG),0) RDMA_LIBS=$(shell $(PKG_CONFIG) --libs librdmacm libibverbs) else RDMA_LIBS=-lrdmacm -libverbs endif - RDMA_MODULE=$(RDMA_MODULE_NAME) - RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE $(RDMA_LIBS) -else -ifeq ($(BUILD_RDMA),no) - # disable RDMA, do nothing -else - $(error "RDMA is only supported as module (BUILD_RDMA=module), or disabled (BUILD_RDMA=no)") + +ifeq ($(BUILD_RDMA),yes) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE=$(BUILD_NO) + FINAL_LIBS += $(RDMA_LIBS) endif + +RDMA_MODULE= +RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so +RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) +ifeq ($(BUILD_RDMA),module) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) + RDMA_MODULE=$(RDMA_MODULE_NAME) + RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) -DBUILD_RDMA_MODULE=$(BUILD_MODULE) $(RDMA_LIBS) endif ifndef V @@ -411,7 +411,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/config.c b/src/config.c index c4009adefa..7f0901c50a 100644 --- a/src/config.c +++ 
b/src/config.c @@ -1536,10 +1536,27 @@ void rewriteConfigOOMScoreAdjValuesOption(standardConfig *config, const char *na } /* Rewrite the bind option. */ -void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { +static void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state, char **bindaddr, int bindaddr_count) { UNUSED(config); int force = 1; sds line, addresses; + + /* Rewrite as bind ... */ + if (bindaddr_count > 0) + addresses = sdsjoin(bindaddr, bindaddr_count, " "); + else + addresses = sdsnew("\"\""); + line = sdsnew(name); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, addresses); + sdsfree(addresses); + + rewriteConfigRewriteLine(state, name, line, force); +} + +/* Rewrite the bind option. */ +static void rewriteConfigSocketBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); int is_default = 0; /* Compare server.bindaddr with CONFIG_DEFAULT_BINDADDR */ @@ -1559,17 +1576,7 @@ void rewriteConfigBindOption(standardConfig *config, const char *name, struct re return; } - /* Rewrite as bind ... */ - if (server.bindaddr_count > 0) - addresses = sdsjoin(server.bindaddr, server.bindaddr_count, " "); - else - addresses = sdsnew("\"\""); - line = sdsnew(name); - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, addresses); - sdsfree(addresses); - - rewriteConfigRewriteLine(state, name, line, force); + rewriteConfigBindOption(config, name, state, server.bindaddr, server.bindaddr_count); } /* Rewrite the loadmodule option. */ @@ -2637,7 +2644,7 @@ static int applyBind(const char **err) { tcp_listener->ct = connectionByType(CONN_TYPE_SOCKET); if (changeListener(tcp_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - if (tls_listener) closeListener(tls_listener); /* failed with TLS together */ + if (tls_listener) connCloseListener(tls_listener); /* failed with TLS together */ return 0; } @@ -2649,7 +2656,7 @@ static int applyBind(const char **err) { tls_listener->ct = connectionByType(CONN_TYPE_TLS); if (changeListener(tls_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - closeListener(tcp_listener); /* failed with TCP together */ + connCloseListener(tcp_listener); /* failed with TCP together */ return 0; } } @@ -2922,8 +2929,9 @@ static sds getConfigNotifyKeyspaceEventsOption(standardConfig *config) { return keyspaceEventsFlagsToString(server.notify_keyspace_events); } -static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err) { +static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err, char **bindaddr, int *bindaddr_count) { UNUSED(config); + int orig_bindaddr_count = *bindaddr_count; int j; if (argc > CONFIG_BINDADDR_MAX) { @@ -2935,11 +2943,73 @@ static int setConfigBindOption(standardConfig *config, sds *argv, int argc, cons if (argc == 1 && sdslen(argv[0]) == 0) argc = 0; /* Free old bind addresses */ - for (j = 0; j < server.bindaddr_count; j++) { - zfree(server.bindaddr[j]); + for (j = 0; j < orig_bindaddr_count; j++) zfree(bindaddr[j]); + for (j = 0; j < argc; j++) bindaddr[j] = zstrdup(argv[j]); + *bindaddr_count = argc; + + return 1; +} + +static int setConfigSocketBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.bindaddr, &server.bindaddr_count); +} + +static int 
setConfigRdmaBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.rdma_ctx_config.bindaddr, &server.rdma_ctx_config.bindaddr_count); +} + +static sds getConfigRdmaBindOption(standardConfig *config) { + UNUSED(config); + return sdsjoin(server.rdma_ctx_config.bindaddr, server.rdma_ctx_config.bindaddr_count, " "); +} + +static void rewriteConfigRdmaBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); + + if (server.rdma_ctx_config.bindaddr_count) { + rewriteConfigBindOption(config, name, state, server.rdma_ctx_config.bindaddr, + server.rdma_ctx_config.bindaddr_count); + } +} + +static int applyRdmaBind(const char **err) { + connListener *rdma_listener = listenerByType(CONN_TYPE_RDMA); + + if (!rdma_listener) { + *err = "No RDMA building support."; + return 0; + } + + rdma_listener->bindaddr = server.rdma_ctx_config.bindaddr; + rdma_listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + rdma_listener->port = server.rdma_ctx_config.port; + rdma_listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(rdma_listener) == C_ERR) { + *err = "Failed to bind to specified addresses for RDMA."; + return 0; + } + + return 1; +} + +static int updateRdmaPort(const char **err) { + connListener *listener = listenerByType(CONN_TYPE_RDMA); + + if (listener == NULL) { + *err = "No RDMA building support."; + return 0; + } + + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(listener) == C_ERR) { + *err = "Unable to listen on this port for RDMA. 
Check server logs."; + return 0; } - for (j = 0; j < argc; j++) server.bindaddr[j] = zstrdup(argv[j]); - server.bindaddr_count = argc; return 1; } @@ -3237,6 +3307,9 @@ standardConfig static_configs[] = { createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod), createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.rdma_ctx_config.port, 0, INTEGER_CONFIG, NULL, updateRdmaPort), + createIntConfig("rdma-rx-size", NULL, IMMUTABLE_CONFIG, 64 * 1024, 16 * 1024 * 1024, server.rdma_ctx_config.rx_size, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-completion-vector", NULL, IMMUTABLE_CONFIG, -1, 1024, server.rdma_ctx_config.completion_vector, -1, INTEGER_CONFIG, NULL, NULL), /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), @@ -3316,7 +3389,8 @@ standardConfig static_configs[] = { createSpecialConfig("client-output-buffer-limit", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigClientOutputBufferLimitOption, getConfigClientOutputBufferLimitOption, rewriteConfigClientOutputBufferLimitOption, NULL), createSpecialConfig("oom-score-adj-values", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigOOMScoreAdjValuesOption, getConfigOOMScoreAdjValuesOption, rewriteConfigOOMScoreAdjValuesOption, updateOOMScoreAdj), createSpecialConfig("notify-keyspace-events", NULL, MODIFIABLE_CONFIG, setConfigNotifyKeyspaceEventsOption, getConfigNotifyKeyspaceEventsOption, rewriteConfigNotifyKeyspaceEventsOption, NULL), - createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigBindOption, getConfigBindOption, rewriteConfigBindOption, applyBind), + createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigSocketBindOption, getConfigBindOption, rewriteConfigSocketBindOption, applyBind), + createSpecialConfig("rdma-bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigRdmaBindOption, getConfigRdmaBindOption, rewriteConfigRdmaBindOption, applyRdmaBind), createSpecialConfig("replicaof", "slaveof", IMMUTABLE_CONFIG | MULTI_ARG_CONFIG, setConfigReplicaOfOption, getConfigReplicaOfOption, rewriteConfigReplicaOfOption, NULL), createSpecialConfig("latency-tracking-info-percentiles", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigLatencyTrackingInfoPercentilesOutputOption, getConfigLatencyTrackingInfoPercentilesOutputOption, rewriteConfigLatencyTrackingInfoPercentilesOutputOption, NULL), diff --git a/src/connection.c b/src/connection.c index f0c1c2d364..8807541d77 100644 --- a/src/connection.c +++ b/src/connection.c @@ -66,6 +66,9 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); + /* may fail if without BUILD_RDMA=yes */ + RegisterConnectionTypeRdma(); + return C_OK; } diff --git a/src/connection.h b/src/connection.h index 0762441732..8a2775ee34 100644 --- a/src/connection.h +++ b/src/connection.h @@ -60,6 +60,7 @@ typedef enum { #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" +#define CONN_TYPE_RDMA "rdma" #define CONN_TYPE_MAX 8 /* 8 is 
enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -79,6 +80,7 @@ typedef struct ConnectionType { int (*addr)(connection *conn, char *ip, size_t ip_len, int *port, int remote); int (*is_local)(connection *conn); int (*listen)(connListener *listener); + void (*closeListener)(connListener *listener); /* create/shutdown/close connection */ connection *(*conn_create)(void); @@ -442,6 +444,13 @@ static inline int connListen(connListener *listener) { return listener->ct->listen(listener); } +/* Close a listened listener */ +static inline void connCloseListener(connListener *listener) { + if (listener->count) { + listener->ct->closeListener(listener); + } +} + /* Get accept_handler of a connection type */ static inline aeFileProc *connAcceptHandler(ConnectionType *ct) { if (ct) return ct->accept_handler; @@ -454,6 +463,7 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); +int RegisterConnectionTypeRdma(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/rdma.c b/src/rdma.c index 7cdcb24913..de7ea396a1 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -10,9 +10,10 @@ #define VALKEYMODULE_CORE_MODULE #include "server.h" - -#if defined USE_RDMA && defined __linux__ /* currently RDMA is only supported on Linux */ #include "connection.h" + +#if defined __linux__ /* currently RDMA is only supported on Linux */ +#if (USE_RDMA == 1 /* BUILD_YES */) || ((USE_RDMA == 2 /* BUILD_MODULE */) && (BUILD_RDMA_MODULE == 2)) #include "connhelpers.h" #include @@ -128,12 +129,10 @@ typedef struct rdma_listener { static list *pending_list; static rdma_listener *rdma_listeners; +static serverRdmaContextConfig *rdma_config; static ConnectionType CT_RDMA; -static int valkey_rdma_rx_size = VALKEY_RDMA_DEFAULT_RX_SIZE; -static int valkey_rdma_comp_vector = -1; /* -1 means a random one */ - static void serverRdmaError(char *err, const char *fmt, ...) 
{ va_list ap; @@ -272,7 +271,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) { /* setup recv buf & MR */ access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE; - length = valkey_rdma_rx_size; + length = rdma_config->rx_size; ctx->rx.addr = page_aligned_zalloc(length); ctx->rx.length = length; ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access); @@ -295,6 +294,7 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { struct ibv_comp_channel *comp_channel = NULL; struct ibv_cq *cq = NULL; struct ibv_pd *pd = NULL; + int comp_vector = rdma_config->completion_vector; if (ibv_query_device(cm_id->verbs, &device_attr)) { serverLog(LL_WARNING, "RDMA: ibv ibv query device failed"); @@ -317,8 +317,13 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { ctx->comp_channel = comp_channel; + /* negative number means a random one */ + if (comp_vector < 0) { + comp_vector = abs((int)random()); + } + cq = ibv_create_cq(cm_id->verbs, VALKEY_RDMA_MAX_WQE * 2, NULL, comp_channel, - valkey_rdma_comp_vector % cm_id->verbs->num_comp_vectors); + comp_vector % cm_id->verbs->num_comp_vectors); if (!cq) { serverLog(LL_WARNING, "RDMA: ibv create cq failed"); return C_ERR; @@ -1610,9 +1615,28 @@ int connRdmaListen(connListener *listener) { rdma_listener++; } + rdma_config = listener->priv; return C_OK; } +static void connRdmaCloseListener(connListener *listener) { + /* Close old servers */ + for (int i = 0; i < listener->count; i++) { + if (listener->fd[i] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); + listener->fd[i] = -1; + struct rdma_listener *rdma_listener = &rdma_listeners[i]; + rdma_destroy_id(rdma_listener->cm_id); + rdma_destroy_event_channel(rdma_listener->cm_channel); + } + + listener->count = 0; + zfree(rdma_listeners); + rdma_listeners = NULL; + rdma_config = NULL; +} + static int connRdmaAddr(connection *conn, char *ip, size_t ip_len, int *port, int remote) { rdma_connection *rdma_conn = (rdma_connection *)conn; struct rdma_cm_id *cm_id = rdma_conn->cm_id; @@ -1740,6 +1764,7 @@ static ConnectionType CT_RDMA = { //.cluster_accept_handler = NULL, .is_local = connRdmaIsLocal, .listen = connRdmaListen, + .closeListener = connRdmaCloseListener, .addr = connRdmaAddr, /* create/close connection */ @@ -1769,17 +1794,6 @@ static ConnectionType CT_RDMA = { .process_pending_data = rdmaProcessPendingData, }; -static struct connListener *rdmaListener(void) { - static struct connListener *listener = NULL; - - if (listener) return listener; - - listener = listenerByType(CONN_TYPE_RDMA); - serverAssert(listener != NULL); - - return listener; -} - ConnectionType *connectionTypeRdma(void) { static ConnectionType *ct_rdma = NULL; @@ -1791,133 +1805,28 @@ ConnectionType *connectionTypeRdma(void) { return ct_rdma; } -/* rdma listener has different create/close logic from TCP, we can't re-use 'int changeListener(connListener *listener)' - * directly */ -static int rdmaChangeListener(void) { - struct connListener *listener = rdmaListener(); - - /* Close old servers */ - for (int i = 0; i < listener->count; i++) { - if (listener->fd[i] == -1) continue; - - aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); - listener->fd[i] = -1; - struct rdma_listener *rdma_listener = &rdma_listeners[i]; - rdma_destroy_id(rdma_listener->cm_id); - rdma_destroy_event_channel(rdma_listener->cm_channel); - } - - listener->count = 0; - zfree(rdma_listeners); - rdma_listeners = NULL; - - 
closeListener(listener); - - /* Just close the server if port disabled */ - if (listener->port == 0) { - if (server.set_proc_title) serverSetProcTitle(NULL); - return VALKEYMODULE_OK; - } - - /* Re-create listener */ - if (connListen(listener) != C_OK) { - return VALKEYMODULE_ERR; - } - - /* Create event handlers */ - if (createSocketAcceptHandler(listener, listener->ct->accept_handler) != C_OK) { - serverPanic("Unrecoverable error creating %s accept handler.", listener->ct->get_type(NULL)); - } - - if (server.set_proc_title) serverSetProcTitle(NULL); - - return VALKEYMODULE_OK; -} - -#ifdef BUILD_RDMA_MODULE - -#include "release.h" - -static long long rdmaGetPort(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - struct connListener *listener = rdmaListener(); - - return listener->port; -} - -static int rdmaSetPort(const char *name, long long val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(privdata); - UNUSED(err); - struct connListener *listener = rdmaListener(); - listener->port = val; - - return VALKEYMODULE_OK; -} - -static ValkeyModuleString *rdma_bind; - -static void rdmaBuildBind(void *ctx) { - struct connListener *listener = rdmaListener(); - - if (rdma_bind) ValkeyModule_FreeString(NULL, rdma_bind); - - sds rdma_bind_str = sdsjoin(listener->bindaddr, listener->bindaddr_count, " "); - rdma_bind = ValkeyModule_CreateString(ctx, rdma_bind_str, sdslen(rdma_bind_str)); +int RegisterConnectionTypeRdma(void) { + return connTypeRegister(&CT_RDMA); } -static ValkeyModuleString *rdmaGetBind(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); +#else - return rdma_bind; +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s not builtin", CONN_TYPE_RDMA); + return C_ERR; } -static int rdmaSetBind(const char *name, ValkeyModuleString *val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(err); - struct connListener *listener = rdmaListener(); - const char *bind = ValkeyModule_StringPtrLen(val, NULL); - int nexts; - sds *exts = sdssplitlen(bind, strlen(bind), " ", 1, &nexts); - - if (nexts > CONFIG_BINDADDR_MAX) { - serverLog(LL_WARNING, "RDMA: Unsupported bind ( > %d)", CONFIG_BINDADDR_MAX); - return VALKEYMODULE_ERR; - } - - /* Free old bind addresses */ - for (int j = 0; j < listener->bindaddr_count; j++) { - zfree(listener->bindaddr[j]); - } - - for (int j = 0; j < nexts; j++) listener->bindaddr[j] = zstrdup(exts[j]); - listener->bindaddr_count = nexts; - - sdsfreesplitres(exts, nexts); - rdmaBuildBind(privdata); - - return VALKEYMODULE_OK; -} +#endif -static int rdmaApplyListener(ValkeyModuleCtx *ctx, void *privdata, ValkeyModuleString **err) { - UNUSED(ctx); - UNUSED(privdata); - UNUSED(err); +#if BUILD_RDMA_MODULE == 2 /* BUILD_MODULE */ - return rdmaChangeListener(); -} +#include "release.h" -static void rdmaListenerAddConfig(void *ctx) { - serverAssert(ValkeyModule_RegisterNumericConfig(ctx, "port", 0, VALKEYMODULE_CONFIG_DEFAULT, 0, 65535, rdmaGetPort, - rdmaSetPort, rdmaApplyListener, NULL) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_RegisterStringConfig(ctx, "bind", "", VALKEYMODULE_CONFIG_DEFAULT, rdmaGetBind, - rdmaSetBind, rdmaApplyListener, ctx) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_LoadConfigs(ctx) == VALKEYMODULE_OK); -} int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + UNUSED(argc); + /* Connection modules MUST be part of the same build as valkey. 
*/ if (strcmp(REDIS_BUILD_ID_RAW, serverBuildIdRaw())) { serverLog(LL_NOTICE, "Connection type %s was not built together with the valkey-server used.", CONN_TYPE_RDMA); @@ -1936,40 +1845,6 @@ int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { if (connTypeRegister(&CT_RDMA) != C_OK) return VALKEYMODULE_ERR; - rdmaListenerAddConfig(ctx); - - struct connListener *listener = rdmaListener(); - listener->ct = connectionTypeRdma(); - listener->bindaddr = zcalloc_num(CONFIG_BINDADDR_MAX, sizeof(listener->bindaddr[0])); - - for (int i = 0; i < argc; i++) { - robj *str = (robj *)argv[i]; - int nexts; - sds *exts = sdssplitlen(str->ptr, strlen(str->ptr), "=", 1, &nexts); - if (nexts != 2) { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - if (!strcasecmp(exts[0], "bind")) { - listener->bindaddr[listener->bindaddr_count++] = zstrdup(exts[1]); - } else if (!strcasecmp(exts[0], "port")) { - listener->port = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "rx-size")) { - valkey_rdma_rx_size = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "comp-vector")) { - valkey_rdma_comp_vector = atoi(exts[1]); - } else { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - sdsfreesplitres(exts, nexts); - } - - rdmaBuildBind(ctx); - if (valkey_rdma_comp_vector == -1) valkey_rdma_comp_vector = abs((int)random()); - return VALKEYMODULE_OK; } @@ -1981,4 +1856,11 @@ int ValkeyModule_OnUnload(void *arg) { #endif /* BUILD_RDMA_MODULE */ -#endif /* USE_RDMA && __linux__ */ +#else /* __linux__ */ + +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s is supported on Linux only", CONN_TYPE_RDMA); + return C_ERR; +} + +#endif /* __linux__ */ diff --git a/src/server.c b/src/server.c index a83ef9096c..df57659715 100644 --- a/src/server.c +++ b/src/server.c @@ -2482,19 +2482,6 @@ void checkTcpBacklogSettings(void) { #endif } -void closeListener(connListener *sfd) { - int j; - - for (j = 0; j < sfd->count; j++) { - if (sfd->fd[j] == -1) continue; - - aeDeleteFileEvent(server.el, sfd->fd[j], AE_READABLE); - close(sfd->fd[j]); - } - - sfd->count = 0; -} - /* Create an event handler for accepting new connections in TCP or TLS domain sockets. 
* This works atomically for all socket fds */ int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler) { @@ -2558,7 +2545,7 @@ int listenToPort(connListener *sfd) { continue; /* Rollback successful listens before exiting */ - closeListener(sfd); + connCloseListener(sfd); return C_ERR; } if (server.socket_mark_id > 0) anetSetSockMarkId(NULL, sfd->fd[sfd->count], server.socket_mark_id); @@ -2899,6 +2886,17 @@ void initListeners(void) { listener->priv = &server.unix_ctx_config; /* Unix socket specified */ } + if (server.rdma_ctx_config.port != 0) { + conn_index = connectionIndexByType(CONN_TYPE_RDMA); + if (conn_index < 0) serverPanic("Failed finding connection listener of %s", CONN_TYPE_RDMA); + listener = &server.listeners[conn_index]; + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + listener->priv = &server.rdma_ctx_config; + } + /* create all the configured listener, and add handler to start to accept */ int listen_fds = 0; for (int j = 0; j < CONN_TYPE_MAX; j++) { @@ -6297,7 +6295,7 @@ connListener *listenerByType(const char *typename) { /* Close original listener, re-create a new listener from the updated bind address & port */ int changeListener(connListener *listener) { /* Close old servers */ - closeListener(listener); + connCloseListener(listener); /* Just close the server if port disabled */ if (listener->port == 0) { diff --git a/src/server.h b/src/server.h index 70bd3868c3..b9e8be9479 100644 --- a/src/server.h +++ b/src/server.h @@ -1614,6 +1614,17 @@ typedef struct serverUnixContextConfig { unsigned int perm; /* UNIX socket permission (see mode_t) */ } serverUnixContextConfig; +/*----------------------------------------------------------------------------- + * RDMA Context Configuration + *----------------------------------------------------------------------------*/ +typedef struct serverRdmaContextConfig { + char *bindaddr[CONFIG_BINDADDR_MAX]; + int bindaddr_count; + int port; + int rx_size; + int completion_vector; +} serverRdmaContextConfig; + /*----------------------------------------------------------------------------- * AOF manifest definition *----------------------------------------------------------------------------*/ @@ -2229,6 +2240,7 @@ struct valkeyServer { int tls_auth_clients; serverTLSContextConfig tls_ctx_config; serverUnixContextConfig unix_ctx_config; + serverRdmaContextConfig rdma_ctx_config; /* cpu affinity */ char *server_cpulist; /* cpu affinity list of server main/io thread. */ char *bio_cpulist; /* cpu affinity list of bio thread. 
*/ @@ -3293,7 +3305,6 @@ void setupSignalHandlers(void); int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler); connListener *listenerByType(const char *typename); int changeListener(connListener *listener); -void closeListener(connListener *listener); struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name); struct serverCommand *lookupCommand(robj **argv, int argc); struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s); diff --git a/src/socket.c b/src/socket.c index 7344d66ad8..d89e6c8767 100644 --- a/src/socket.c +++ b/src/socket.c @@ -339,6 +339,19 @@ static int connSocketListen(connListener *listener) { return listenToPort(listener); } +static void connSocketCloseListener(connListener *listener) { + int j; + + for (j = 0; j < listener->count; j++) { + if (listener->fd[j] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[j], AE_READABLE); + close(listener->fd[j]); + } + + listener->count = 0; +} + static int connSocketBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { int fd = anetTcpNonBlockConnect(NULL, addr, port); if (fd == -1) { @@ -395,6 +408,7 @@ static ConnectionType CT_Socket = { .addr = connSocketAddr, .is_local = connSocketIsLocal, .listen = connSocketListen, + .closeListener = connSocketCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateSocket, diff --git a/src/tls.c b/src/tls.c index d1dd567354..48b75553de 100644 --- a/src/tls.c +++ b/src/tls.c @@ -805,6 +805,10 @@ static int connTLSListen(connListener *listener) { return listenToPort(listener); } +static void connTLSCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static void connTLSShutdown(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; @@ -1147,6 +1151,7 @@ static ConnectionType CT_TLS = { .addr = connTLSAddr, .is_local = connTLSIsLocal, .listen = connTLSListen, + .closeListener = connTLSCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateTLS, diff --git a/src/unix.c b/src/unix.c index 35778779f9..86df05bd52 100644 --- a/src/unix.c +++ b/src/unix.c @@ -74,6 +74,10 @@ static int connUnixListen(connListener *listener) { return C_OK; } +static void connUnixCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static connection *connCreateUnix(void) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Unix; @@ -174,6 +178,7 @@ static ConnectionType CT_Unix = { .addr = connUnixAddr, .is_local = connUnixIsLocal, .listen = connUnixListen, + .closeListener = connUnixCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateUnix, diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 0724c27adc..09168f368a 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -63,7 +63,7 @@ def test_rdma(ipaddr): rdmapath = valkeydir + "/src/valkey-rdma.so" svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", - "--loadmodule", rdmapath, "port=6379", "bind=" + ipaddr] + "--loadmodule", rdmapath, "--rdma-port", "6379", "--rdma-bind", ipaddr] svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE) try: diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 352f5f183e..d79bb1c7da 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -558,6 +558,10 @@ start_server 
{tags {"introspection"}} { req-res-logfile client-default-resp dual-channel-replication-enabled + rdma-completion-vector + rdma-rx-size + rdma-bind + rdma-port } if {!$::tls} { diff --git a/valkey.conf b/valkey.conf index bf82b01874..8d3e11c515 100644 --- a/valkey.conf +++ b/valkey.conf @@ -300,6 +300,54 @@ tcp-keepalive 300 # # tls-session-cache-timeout 60 +################################### RDMA ###################################### + +# Valkey Over RDMA is experimental, it may be changed or be removed in any minor or major version. +# By default, RDMA is disabled. To enable it, the "rdma-port" configuration +# directive can be used to define RDMA-listening ports. +# +# rdma-port 6379 +# rdma-bind 192.168.1.100 + +# The RDMA receive transfer buffer is 1M by default. It can be set between 64K and 16M. +# Note that page size aligned size is preferred. +# +# rdma-rx-size 1048576 + +# The RDMA completion queue will use the completion vector to signal completion events +# via hardware interrupts. A large number of hardware interrupts can affect CPU performance. +# It is possible to tune the performance using rdma-completion-vector. +# +# Example 1. a) Pin hardware interrupt vectors [0, 3] to CPU [0, 3]. +# b) Set CPU affinity for valkey to CPU [4, X]. +# c) Any valkey server uses a random RDMA completion vector [-1]. +# All valkey servers will not affect each other and will be isolated from kernel interrupts. +# +# SYS SYS SYS SYS VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | +# INTR0 INTR1 INTR2 INTR3 +# +# Example 2. a) 1:1 pin hardware interrupt vectors [0, X] to CPU [0, X]. +# b) Set CPU affinity for valkey [M] to CPU [M]. +# c) Valkey server [M] uses RDMA completion vector [M]. +# A single CPU [M] handles hardware interrupts, the RDMA completion vector [M], +# and the valkey server [M] within its context only. +# This avoids overhead and function calls across multiple CPUs, fully isolating +# each valkey server from one another. +# +# VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | | | | +# INTR0 INTR1 INTR2 INTR3 INTR4 INTR5 INTRX +# +# Use 0 and positive numbers to specify the RDMA completion vector, or specify -1 to allow +# the server to use a random vector for a new connection. The default vector is -1. +# +# rdma-completion-vector 0 + ################################# GENERAL ##################################### # By default the server does not run as a daemon. Use 'yes' if you need it. From c8ceb2ee255c899b0cb05b69f0511fc7dcf4ddca Mon Sep 17 00:00:00 2001 From: Stav Ben-Tov <90314138+stav-bentov@users.noreply.github.com> Date: Sun, 1 Dec 2024 13:24:18 +0200 Subject: [PATCH 05/73] Use zfree_with_size for client buffer (#1376) Replace occurrences of 'zfree' with 'zfree_with_size' to improve performance. 'zfree_with_size' function avoids calling 'zmalloc_size' to retrieve buffer size and uses previuos calculation of size for calling 'zfree_with_size'. This results in faster memory deallocation and reduces overhead. Signed-off-by: stav bentov Co-authored-by: stav bentov --- src/networking.c | 2 +- src/server.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/networking.c b/src/networking.c index 97479967f6..bbd684a3e5 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1760,7 +1760,7 @@ void freeClient(client *c) { /* Free data structures. 
     listRelease(c->reply);
     c->reply = NULL;
-    zfree(c->buf);
+    zfree_with_size(c->buf, c->buf_usable_size);
     c->buf = NULL;
     freeReplicaReferencedReplBuffer(c);
     freeClientArgv(c);
diff --git a/src/server.c b/src/server.c
index df57659715..ef9f523145 100644
--- a/src/server.c
+++ b/src/server.c
@@ -889,9 +889,10 @@ int clientsCronResizeOutputBuffer(client *c, mstime_t now_ms) {
 
     if (new_buffer_size) {
         oldbuf = c->buf;
+        size_t oldbuf_size = c->buf_usable_size;
         c->buf = zmalloc_usable(new_buffer_size, &c->buf_usable_size);
         memcpy(c->buf, oldbuf, c->bufpos);
-        zfree(oldbuf);
+        zfree_with_size(oldbuf, oldbuf_size);
     }
     return 0;
 }

From 9c48f567907087637e19bf30a5a137d8b50e0df3 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sun, 1 Dec 2024 21:33:21 +0800
Subject: [PATCH 06/73] Reset repl_down_since to zero only on state change
 (#1149)

We should reset repl_down_since only on state change. In the current
code, if the rdb channel in the dual channel is normal, that is, the
rdb is loaded normally, but the psync channel is abnormal, we would
still set repl_down_since to 0 here. If the primary is down at this
time, the replica may behave abnormally when calculating data_age in
cluster failover: since repl_state != REPL_STATE_CONNECTED, the stale
data_age prevents the replica from initiating an election.

In dualChannelSyncHandleRdbLoadCompletion, if the psync channel is not
established, the function will return. We will set repl_state to
REPL_STATE_CONNECTED and set repl_down_since to 0 in
dualChannelSyncSuccess, that is, in establishPrimaryConnection.

See also 677d10b2a8ff7f13033ccfe56ffcd246dbe70fb6 for more details.

Signed-off-by: Binbin
---
 src/replication.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/replication.c b/src/replication.c
index 260da1cd6e..d17199bfc3 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2405,10 +2405,10 @@ void readSyncBulkPayload(connection *conn) {
     } else {
         replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db);
         server.repl_state = REPL_STATE_CONNECTED;
+        server.repl_down_since = 0;
         /* Send the initial ACK immediately to put this replica in online state. */
         replicationSendAck();
     }
-    server.repl_down_since = 0;
 
     /* Fire the primary link modules event. */
     moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL);

From 7043ef0bbb627b66bcaa75351b1b141c96852df8 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Sun, 1 Dec 2024 15:33:43 +0200
Subject: [PATCH 07/73] Split dual-channel COB overrun tests to separate
 servers (#1374)

1. The test isn't waiting long enough for the output buffer to overrun.
   This problem happens because an error from the previous test bleeds
   into the current test's logs. The simplest fix is to split these
   tests.
2. Increased the replication timeout to ensure the sync fails due to
   output buffer overrun before a timeout occurs (see the sketch below).
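A minimal sketch of the configuration relationship this fix relies on.
The values mirror the test; the commands, ports, and placement are
illustrative only, not part of the change:

```
# On the primary: keep the replica output buffer limit small.
valkey-cli -p 6379 config set client-output-buffer-limit "replica 1100k 0 0"

# On the replica: keep the replication timeout generous, so a slow sync
# trips the buffer limit well before repl-timeout can fire.
valkey-cli -p 6380 config set repl-timeout 60
```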
Fixes #1367

Signed-off-by: naglera
---
 .../integration/dual-channel-replication.tcl | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index 055ed670ab..e417dad6c9 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -775,7 +775,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 
         $replica config set dual-channel-replication-enabled yes
         $replica config set loglevel debug
-        $replica config set repl-timeout 10
+        $replica config set repl-timeout 60
         $primary config set repl-backlog-size 1mb
 
     test "Test dual-channel-replication primary gets cob overrun before established psync" {
@@ -815,6 +815,37 @@
             } else {
                 fail "Primary should abort sync"
             }
+            stop_write_load $load_handle0
+            stop_write_load $load_handle1
+            stop_write_load $load_handle2
+    }
+}
+
+start_server {tags {"dual-channel-replication external:skip"}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+    set loglines [count_log_lines 0]
+
+    $primary config set repl-diskless-sync yes
+    $primary config set dual-channel-replication-enabled yes
+    $primary config set client-output-buffer-limit "replica 1100k 0 0"
+    $primary config set loglevel debug
+    start_server {} {
+        set replica [srv 0 client]
+        set replica_host [srv 0 host]
+        set replica_port [srv 0 port]
+        set replica_log [srv 0 stdout]
+        set replica_pid [srv 0 pid]
+
+        set load_handle0 [start_write_load $primary_host $primary_port 60]
+        set load_handle1 [start_write_load $primary_host $primary_port 60]
+        set load_handle2 [start_write_load $primary_host $primary_port 60]
+
+        $replica config set dual-channel-replication-enabled yes
+        $replica config set loglevel debug
+        $replica config set repl-timeout 60
+        $primary config set repl-backlog-size 1mb
 
         $replica debug pause-after-fork 1
         $primary debug populate 1000 primary 100000

From 90475af59429583182402ee3b408d7bcb36d56cd Mon Sep 17 00:00:00 2001
From: Vadym Khoptynets <1099644+poiuj@users.noreply.github.com>
Date: Sun, 1 Dec 2024 17:12:27 +0200
Subject: [PATCH 08/73] Free strings during BGSAVE/BGAOFRW to reduce
 copy-on-write (#905)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Motivation**

Copy-on-write (COW) amplification refers to the issue where writing to
a small object leads to the entire page being cloned, resulting in
inefficient memory usage. This issue arises during the BGSAVE process,
which can be particularly problematic on instances with limited memory.
If the BGSAVE process could release unneeded memory, it could reduce
memory consumption. To address this, the BGSAVE process calls the
`madvise` function to signal the operating system to reclaim the
buffer. However, this approach does not work for buffers smaller than a
page (usually 4KiB). Even after multiple such calls, where a full page
may be free, the operating system will not reclaim it. To solve this
issue, we can call `zfree` directly. This allows the allocator
(jemalloc) to handle the bookkeeping and release pages when buffers are
no longer needed. This approach reduces copy-on-write events.

**Benchmarks**

To understand how the usage of `zfree` affects BGSAVE and memory
consumption, I ran 45 benchmarks that compare my clone with the vanilla
version. The benchmark has the following steps:
1. Start a new Valkey process
2. Fill the DB with data sequentially
3. Run a warmup to randomize the memory layout
4. Introduce fragmentation by deleting part of the keys
5. In parallel:
   1. Trigger BGSAVE
   2. Start 80/20 get/set load

I varied the following parameters to understand their influence:
1. Number of keys: 3M, 6M, and 12M.
2. Data size. While keys themselves are of fixed length ~30 bytes, the
   value size is 120, 250, 500, 1000, and 2000 bytes.
3. Fragmentation. I delete 5%, 10%, and 15% of the original key range.

I'm attaching a graph of BGSAVE process memory consumption. Instead of
all benchmarks, I show the most representative runs IMO.

(graph: 3m-fixed)

For 2000-byte values, peak memory usage is ~53% compared to vanilla;
the peak happens at 57% BGSAVE progress. For 500-byte values the peak
is ~80% compared to vanilla and happens at ~80% progress. For 120-byte
values the difference is under 5%, and the patched version could even
use more memory.

![500b-fixed](https://github.com/user-attachments/assets/b09451d3-4bce-4f33-b3db-2b5df2178ed2)

For 12M keys, the peak is ~85% of the vanilla's and happens at the ~70%
mark. For 6M keys, the peak is ~87% of the vanilla's and happens at the
~77% mark. For 3M keys, the peak is ~87% of the vanilla's and happens
at the ~80% mark.

**Changes**

The PR contains 2 changes:
1. Static buffer for RDB compression. RDB compression leads to COW
   events even without any write load if we use `zfree`. It happens
   because the compression function allocates a new buffer for each
   object. Together with freeing objects with `zfree`, it leads to
   reuse of the memory shared with the main process. To deal with this
   problem, we use a pre-allocated constant 8K buffer for compression.
   If the object size is too big for this buffer, then we fall back to
   the ad hoc allocation behavior.
2. Freeing string objects instead of dismissing them. A call to `zfree`
   is more expensive than a direct call to `madvise`. But with #453
   strings use the fast path – `zfree_with_size`. As a possible next
   step we can optimize `zfree` for other data types as well.

---------

Signed-off-by: Vadym Khoptynets
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Viktor Söderqvist
---
 src/object.c |  9 +++++++--
 src/rdb.c    | 19 ++++++++++++-------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/object.c b/src/object.c
index 8c1cf64892..035198ad89 100644
--- a/src/object.c
+++ b/src/object.c
@@ -398,9 +398,14 @@ void decrRefCount(robj *o) {
     }
 }
 
-/* See dismissObject() */
+/* See dismissObject(). sds is an exception, because the allocation
+ * size is known. Instead of dismissing it with madvise(MADV_DONTNEED)
+ * we free it via the allocator, which has minimal overhead when the
+ * size is known. This has the advantage that it allows the allocator
+ * to accumulate free buffers to free whole pages, while madvise is a
+ * nop if the buffer is less than a page. */
 void dismissSds(sds s) {
-    dismissMemory(sdsAllocPtr(s), sdsAllocSize(s));
+    sdsfree(s);
 }
 
 /* See dismissObject() */
diff --git a/src/rdb.c b/src/rdb.c
index 1c200e54f5..ca904f7f98 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -49,6 +49,9 @@
 #include
 #include
 
+/* Size of the static buffer used for rdbcompression */
+#define LZF_STATIC_BUFFER_SIZE (8 * 1024)
+
 /* This macro is called when the internal RDB structure is corrupt */
 #define rdbReportCorruptRDB(...) rdbReportError(1, __LINE__, __VA_ARGS__)
rdbReportError(1, __LINE__, __VA_ARGS__) /* This macro is called when RDB read failed (possibly a short read) */ @@ -388,18 +391,20 @@ ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, size_t origina ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { size_t comprlen, outlen; void *out; + static void *buffer = NULL; /* We require at least four bytes compression for this to be worth it */ if (len <= 4) return 0; outlen = len - 4; - if ((out = zmalloc(outlen + 1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { - zfree(out); - return 0; + if (outlen < LZF_STATIC_BUFFER_SIZE) { + if (!buffer) buffer = zmalloc(LZF_STATIC_BUFFER_SIZE); + out = buffer; + } else { + if ((out = zmalloc(outlen + 1)) == NULL) return 0; } - ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); - zfree(out); + comprlen = lzf_compress(s, len, out, outlen); + ssize_t nwritten = comprlen ? rdbSaveLzfBlob(rdb, out, comprlen, len) : 0; + if (out != buffer) zfree(out); return nwritten; } From fbbfe5d3d3833c74d86c324ca9ffee8b97856724 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 2 Dec 2024 15:55:24 +0800 Subject: [PATCH 09/73] Print logs when the cluster state changes to fail or the fail reason changes (#1188) This log allows us to easily distinguish between full coverage and minority partition when the cluster fails. Sometimes it is not easy to see the minority partition in healthy shards (both primary and replicas). We decided not to add a cluster_fail_reason field to cluster info, given that there are only two reasons and both are well known; if we end up adding more down the road, we can add the field in the future. Signed-off-by: Binbin --- src/cluster.h | 6 ++++++ src/cluster_legacy.c | 39 +++++++++++++++++++++++++++++++++++-- src/cluster_legacy.h | 1 + tests/unit/cluster/info.tcl | 23 ++++++++++++++++++++++ 4 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index 65eadf4c65..142f2d70b3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -12,6 +12,12 @@ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ +/* Reason why the cluster state changes to fail. When adding new reasons, + * make sure to update clusterLogFailReason. */ +#define CLUSTER_FAIL_NONE 0 +#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1 +#define CLUSTER_FAIL_MINORITY_PARTITION 2 + /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request.
*/ diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index e4b25e265d..6ea8eb2e67 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1082,6 +1082,7 @@ void clusterInit(void) { server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; + server.cluster->fail_reason = CLUSTER_FAIL_NONE; server.cluster->size = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); @@ -4493,7 +4494,7 @@ void clusterLogCantFailover(int reason) { case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break; case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break; case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break; - default: msg = "Unknown reason code."; break; + default: serverPanic("Unknown cant failover reason code."); } lastlog_time = time(NULL); serverLog(LL_NOTICE, "Currently unable to failover: %s", msg); @@ -5362,6 +5363,23 @@ void clusterCloseAllSlots(void) { * Cluster state evaluation function * -------------------------------------------------------------------------- */ +void clusterLogFailReason(int reason) { + if (reason == CLUSTER_FAIL_NONE) return; + + char *msg; + switch (reason) { + case CLUSTER_FAIL_NOT_FULL_COVERAGE: + msg = "At least one hash slot is not served by any available node. " + "Please check the 'cluster-require-full-coverage' configuration."; + break; + case CLUSTER_FAIL_MINORITY_PARTITION: + msg = "I am part of a minority partition."; + break; + default: serverPanic("Unknown fail reason code."); + } + serverLog(LL_WARNING, "Cluster is currently down: %s", msg); +} + /* The following are defines that are only used in the evaluation function * and are based on heuristics. Actually the main point about the rejoin and * writable delay is that they should be a few orders of magnitude larger @@ -5371,7 +5389,7 @@ void clusterCloseAllSlots(void) { #define CLUSTER_WRITABLE_DELAY 2000 void clusterUpdateState(void) { - int j, new_state; + int j, new_state, new_reason; int reachable_primaries = 0; static mstime_t among_minority_time; static mstime_t first_call_time = 0; @@ -5392,12 +5410,14 @@ void clusterUpdateState(void) { /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ new_state = CLUSTER_OK; + new_reason = CLUSTER_FAIL_NONE; /* Check if all the slots are covered. */ if (server.cluster_require_full_coverage) { for (j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE; break; } } @@ -5432,6 +5452,7 @@ void clusterUpdateState(void) { if (reachable_primaries < needed_quorum) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_MINORITY_PARTITION; among_minority_time = mstime(); } } @@ -5455,7 +5476,21 @@ void clusterUpdateState(void) { serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s", new_state == CLUSTER_OK ? "ok" : "fail"); server.cluster->state = new_state; + + /* Cluster state changes from ok to fail, print a log. */ + if (new_state == CLUSTER_FAIL) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } } + + /* Cluster state is still fail, but the reason has changed, print a log. 
*/ + if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } + + if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE; } /* This function is called after the node startup in order to verify that data diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 39148c748d..5595402a4d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -370,6 +370,7 @@ struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int fail_reason; /* Why the cluster state changes to fail. */ int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl index 0d7b249899..f882378172 100644 --- a/tests/unit/cluster/info.tcl +++ b/tests/unit/cluster/info.tcl @@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" { } } ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} { + test "fail reason changed" { + # Kill one primary, so the cluster fails with not-full-coverage. + pause_process [srv 0 pid] + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + verify_log_message -1 "*At least one hash slot is not served by any available node*" 0 + verify_log_message -2 "*At least one hash slot is not served by any available node*" 0 + + # Kill one more primary, so the cluster fails with minority-partition. + pause_process [srv -1 pid] + wait_for_log_messages -2 {"*minority partition*"} 0 1000 50 + + resume_process [srv 0 pid] + resume_process [srv -1 pid] + wait_for_cluster_state ok + } +} From 3df609ef06f71c37a45049ec1df9611b9f763d55 Mon Sep 17 00:00:00 2001 From: Nugine Date: Tue, 3 Dec 2024 02:40:38 +0800 Subject: [PATCH 10/73] Optimize PFCOUNT, PFMERGE command by SIMD acceleration (#1293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR optimizes the performance of HyperLogLog commands (PFCOUNT, PFMERGE) by adding AVX2 fast paths. Two AVX2 functions are added for conversion between the raw representation and the dense representation. They are 15 ~ 30 times faster than the scalar implementation. Note that the sparse representation is not accelerated. AVX2 fast paths are enabled when the CPU supports AVX2 (checked at runtime) and the hyperloglog configuration is the default (HLL_REGISTERS == 16384 && HLL_BITS == 6). A `PFDEBUG SIMD (ON|OFF)` subcommand is added for unit tests. A new TCL unit test checks that the results produced by the non-AVX2 and AVX2 implementations are exactly equal. When merging 3 dense hll structures, the benchmark shows a 12x speedup compared to the scalar version. ``` pfcount key1 key2 key3 pfmerge keyall key1 key2 key3 ``` ``` ====================================================================================================== Type Ops/sec Avg.
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------ PFCOUNT-scalar 5665.56 35.29839 32.25500 63.99900 67.58300 608.60 PFCOUNT-avx2 72377.83 2.75834 2.67100 5.34300 6.81500 7774.96 ------------------------------------------------------------------------------------------------------ PFMERGE-scalar 9851.29 20.28806 20.09500 36.86300 39.16700 615.71 PFMERGE-avx2 125621.89 1.59126 1.55100 3.11900 4.70300 15702.74 ------------------------------------------------------------------------------------------------------ scalar: valkey:unstable 2df56d87c0ebe802f38e8922bb2ea1e4ca9cfa76 avx2: Nugine:hll-simd 8f9adc34021080d96e60bd0abe06b043f3ed0275 CPU: 13th Gen Intel® Core™ i9-13900H × 20 Memory: 32.0 GiB OS: Ubuntu 22.04.5 LTS ``` Experiment repo: https://github.com/Nugine/redis-hyperloglog Benchmark script: https://github.com/Nugine/redis-hyperloglog/blob/main/scripts/memtier.sh Algorithm: https://github.com/Nugine/redis-hyperloglog/blob/main/cpp/bench.cpp --------- Signed-off-by: Xuyang Wang --- src/config.h | 13 ++ src/hyperloglog.c | 303 +++++++++++++++++++++++++++++++++++-- tests/unit/hyperloglog.tcl | 40 +++++ 3 files changed, 345 insertions(+), 11 deletions(-) diff --git a/src/config.h b/src/config.h index 3b79c5c681..a2e9f353dc 100644 --- a/src/config.h +++ b/src/config.h @@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist); #define valkey_prefetch(addr) ((void)(addr)) #endif +/* Check if we can compile AVX2 code */ +#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined(HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 563c5e7941..9a48c821ab 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -35,6 +35,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -208,6 +212,13 @@ struct hllhdr { static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected"; +#ifdef HAVE_AVX2 +static int simd_enabled = 1; +#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2")) +#else +#define HLL_USE_AVX2 0 +#endif + /* =========================== Low level bit macros ========================= */ /* Macros to access the dense representation. @@ -1064,6 +1075,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. 
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + /* Shuffle indices for unpacking bytes of dense registers + * From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * To: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. + * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
*/ +void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1077,12 +1218,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val, hdr->registers, i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1114,6 +1250,121 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. + * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + /* Shuffle indices for packing bytes of dense registers + * From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * To: {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. + * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. 
+ */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. */ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1363,12 +1614,17 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. 
*/ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; + if (use_dense) { hdr = o->ptr; - switch (hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; - case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; + case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of @@ -1494,6 +1750,7 @@ void pfselftestCommand(client *c) { * PFDEBUG DECODE * PFDEBUG ENCODING * PFDEBUG TODENSE + * PFDEBUG SIMD (ON|OFF) */ void pfdebugCommand(client *c) { char *cmd = c->argv[1]->ptr; @@ -1501,6 +1758,30 @@ void pfdebugCommand(client *c) { robj *o; int j; + if (!strcasecmp(cmd, "simd")) { + if (c->argc != 3) goto arityerr; + + if (!strcasecmp(c->argv[2]->ptr, "on")) { +#ifdef HAVE_AVX2 + simd_enabled = 1; +#endif + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { +#ifdef HAVE_AVX2 + simd_enabled = 0; +#endif + } else { + addReplyError(c, "Argument must be ON or OFF"); + } + + if (HLL_USE_AVX2) { + addReplyStatus(c, "enabled"); + } else { + addReplyStatus(c, "disabled"); + } + + return; + } + o = lookupKeyWrite(c->db, c->argv[2]); if (o == NULL) { addReplyError(c, "The specified key does not exist"); diff --git a/tests/unit/hyperloglog.tcl b/tests/unit/hyperloglog.tcl index c1b3b3a79f..765d5e0bdd 100644 --- a/tests/unit/hyperloglog.tcl +++ b/tests/unit/hyperloglog.tcl @@ -222,6 +222,46 @@ start_server {tags {"hll"}} { assert_equal 3 [r pfcount destkey] } + test {PFMERGE results with simd} { + r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t} + for {set x 1} {$x < 2000} {incr x} { + r pfadd hll1{t} [expr rand()] + } + for {set x 1} {$x < 4000} {incr x} { + r pfadd hll2{t} [expr rand()] + } + for {set x 1} {$x < 8000} {incr x} { + r pfadd hll3{t} [expr rand()] + } + assert {[r pfcount hll1{t}] > 0} + assert {[r pfcount hll2{t}] > 0} + assert {[r pfcount hll3{t}] > 0} + + r pfdebug simd off + set scalar [r pfcount hll1{t} hll2{t} hll3{t}] + r pfdebug simd on + set simd [r pfcount hll1{t} hll2{t} hll3{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + r pfdebug simd off + r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t} + r pfdebug simd on + r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t} + + set scalar [r pfcount hllscalar{t}] + set simd [r pfcount hllsimd{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + set scalar [r get hllscalar{t}] + set simd [r get hllsimd{t}] + assert_equal $scalar $simd + + } {} {needs:pfdebug} + test {PFCOUNT multiple-keys merge returns cardinality of union #1} { r del hll1{t} hll2{t} hll3{t} for {set x 1} {$x < 10000} {incr x} { From 397201c48f4cb7fd052fd98c66385eaab1981e1c Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 3 Dec 2024 08:42:29 -0800 Subject: [PATCH 11/73] Refactor of ActiveDefrag to reduce latencies (#1242) Refer to: https://github.com/valkey-io/valkey/issues/1141 This update refactors the defrag code to: * Make the overall code more readable and maintainable * Reduce latencies incurred during defrag processing With this update, the defrag cycle time is reduced to 500us, with more frequent cycles. This results in much more predictable latencies, with a dramatic reduction in tail latencies. (See https://github.com/valkey-io/valkey/issues/1141 for more complete details.) 
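As a minimal, self-contained sketch (not part of the patch) of the time-bounded, resumable stage pattern this refactor introduces: `doneStatus`, the 0-endtime reset convention, and the ~500us budget mirror the diff below, while `demo_stage`, `clock_us()`, and the item loop are hypothetical stand-ins (the server uses `getMonotonicUs()` and real per-stage targets). Note that the clock is checked only every 128 items, matching the patch's practice of amortizing time checks across iterations.

```
#include <stdint.h>
#include <stdio.h>
#include <time.h>

typedef enum { DEFRAG_NOT_DONE = 0, DEFRAG_DONE = 1 } doneStatus;

/* Monotonic microseconds; stand-in for the server's getMonotonicUs(). */
static uint64_t clock_us(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000 + (uint64_t)ts.tv_nsec / 1000;
}

/* A stage defrags items until finished or until 'endtime' is reached.
 * Progress lives in a static cursor, so the next invocation resumes
 * where this one left off. A 0 endtime means "reset and prepare". */
static doneStatus demo_stage(uint64_t endtime, void *target) {
    static size_t cursor = 0; /* persists across invocations */
    size_t nitems = *(size_t *)target;
    if (endtime == 0) {
        cursor = 0;
        return DEFRAG_NOT_DONE; /* init only, no real work */
    }
    while (cursor < nitems) {
        /* ... defrag item 'cursor' here ... */
        cursor++;
        /* Check the clock every 128 items, not on each one. */
        if ((cursor & 127) == 0 && clock_us() > endtime) return DEFRAG_NOT_DONE;
    }
    return DEFRAG_DONE;
}

int main(void) {
    size_t nitems = 1000000;
    demo_stage(0, &nitems); /* reset stage state */
    int cycles = 1;
    /* Each cycle gets a ~500us budget, like active-defrag-cycle-us. */
    while (demo_stage(clock_us() + 500, &nitems) == DEFRAG_NOT_DONE) cycles++;
    printf("finished after %d bounded cycles\n", cycles);
    return 0;
}
```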
This update is focused mostly on the high-level processing, and does NOT address lower-level functions which aren't currently time-bound (e.g. `activeDefragSdsDict()` and `moduleDefragGlobals()`). These are out of scope for this update and left for a future update. I fixed `kvstoreDictLUTDefrag` because it was using up to 7ms on a CME single shard. See the original GitHub issue for performance details. --------- Signed-off-by: Jim Brunner Signed-off-by: Madelyn Olson Co-authored-by: Madelyn Olson --- src/ae.c | 2 +- src/config.c | 5 +- src/defrag.c | 1078 +++++++++++++++++++------------ src/dict.c | 4 +- src/dict.h | 2 +- src/kvstore.c | 23 +- src/kvstore.h | 4 +- src/server.c | 29 +- src/server.h | 11 +- tests/unit/memefficiency.tcl | 25 +- tests/unit/moduleapi/defrag.tcl | 1 - valkey.conf | 18 +- 12 files changed, 731 insertions(+), 471 deletions(-) diff --git a/src/ae.c b/src/ae.c index 9bf8619902..643ff17070 100644 --- a/src/ae.c +++ b/src/ae.c @@ -85,7 +85,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; - eventLoop->timeEventNextId = 0; + eventLoop->timeEventNextId = 1; eventLoop->stop = 0; eventLoop->maxfd = -1; eventLoop->beforesleep = NULL; diff --git a/src/config.c b/src/config.c index 7f0901c50a..5a07c2c0f0 100644 --- a/src/config.c +++ b/src/config.c @@ -3278,10 +3278,11 @@ standardConfig static_configs[] = { createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), createIntConfig("tcp-keepalive", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tcpkeepalive, 300, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-migration-barrier", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_migration_barrier, 1, INTEGER_CONFIG, NULL, NULL), - createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ - createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ + createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ + createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ createIntConfig("active-defrag-threshold-lower", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_lower, 10, INTEGER_CONFIG, NULL, NULL), /* Default: don't defrag when fragmentation is below 10% */ createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: maximum defrag force at 100% fragmentation */ createIntConfig("active-defrag-cycle-us", NULL, MODIFIABLE_CONFIG, 0, 100000, server.active_defrag_cycle_us, 500, INTEGER_CONFIG, NULL, updateDefragConfiguration), createIntConfig("lfu-log-factor", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_log_factor, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("lfu-decay-time", NULL, MODIFIABLE_CONFIG, 0,
INT_MAX, server.lfu_decay_time, 1, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-priority", "slave-priority", MODIFIABLE_CONFIG, 0, INT_MAX, server.replica_priority, 100, INTEGER_CONFIG, NULL, NULL), diff --git a/src/defrag.c b/src/defrag.c index b49a175f7c..d0c7632f17 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -38,23 +38,126 @@ #ifdef HAVE_DEFRAG -typedef struct defragCtx { - void *privdata; +typedef enum { DEFRAG_NOT_DONE = 0, + DEFRAG_DONE = 1 } doneStatus; + + +/* + * Defragmentation is performed in stages. Each stage is serviced by a stage function + * (defragStageFn). The stage function is passed a target (void*) to defrag. The contents of that + * target are unique to the particular stage - and may even be NULL for some stage functions. The + * same stage function can be used multiple times (for different stages) each having a different + * target. + * + * The stage function is required to maintain an internal static state. This allows the stage + * function to continue when invoked in an iterative manner. When invoked with a 0 endtime, the + * stage function is required to clear its internal state and prepare to begin a new stage. It + * should return false (more work to do) as it should NOT perform any real "work" during init. + * + * Parameters: + * endtime - This is the monotonic time that the function should end and return. This ensures + * a bounded latency due to defrag. When endtime is 0, the internal state should be + * cleared, preparing to begin the stage with a new target. + * target - This is the "thing" that should be defragged. Its type is dependent on the + * type of the stage function. This might be a dict, a kvstore, a DB, or other. + * privdata - A pointer to arbitrary private data which is unique to the stage function. + * + * Returns: + * - DEFRAG_DONE if the stage is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*defragStageFn)(monotime endtime, void *target, void *privdata); + +typedef struct { + defragStageFn stage_fn; // The function to be invoked for the stage + void *target; // The target that the function will defrag + void *privdata; // Private data, unique to the stage function +} StageDescriptor; + +/* Globals needed for the main defrag processing logic. + * Doesn't include variables specific to a stage or type of data. */ +struct DefragContext { + monotime start_cycle; // Time of beginning of defrag cycle + long long start_defrag_hits; // server.stat_active_defrag_hits captured at beginning of cycle + list *remaining_stages; // List of stages which remain to be processed + StageDescriptor *current_stage; // The stage that's currently being processed + + long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) + monotime timeproc_end_time; // Ending time of previous timerproc execution + long timeproc_overage_us; // A correction value if over/under target CPU percent +}; +static struct DefragContext defrag; + + +/* There are a number of stages which process a kvstore. To simplify this, a stage helper function + * `defragStageKvstoreHelper()` is defined. This function aids in iterating over the kvstore. It + * uses these definitions. + */ +/* State of the kvstore helper. The private data (privdata) passed to the kvstore helper MUST BEGIN + * with a kvstoreIterState (or be passed as NULL).
*/ +#define KVS_SLOT_DEFRAG_LUT -2 +#define KVS_SLOT_UNASSIGNED -1 +typedef struct { + kvstore *kvs; int slot; - void *aux; -} defragCtx; + unsigned long cursor; +} kvstoreIterState; +/* The kvstore helper uses this function to perform tasks before continuing the iteration. For the + * main dictionary, large items are set aside and processed by this function before continuing with + * iteration over the kvstore. + * endtime - This is the monotonic time that the function should end and return. + * privdata - Private data for functions invoked by the helper. If provided in the call to + * `defragStageKvstoreHelper()`, the `kvstoreIterState` portion (at the beginning) + * will be updated with the current kvstore iteration status. + * + * Returns: + * - DEFRAG_DONE if the pre-continue work is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdata); + + +// Private data for main dictionary keys +typedef struct { + kvstoreIterState kvstate; + serverDb *db; + dictEntry *saved_expire_de; +} defragKeysCtx; +static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); + +// Private data for pubsub kvstores +typedef dict *(*getClientChannelsFn)(client *); +typedef struct { + getClientChannelsFn fn; +} getClientChannelsFnWrapper; -typedef struct defragPubSubCtx { - kvstore *pubsub_channels; - dict *(*clientPubSubChannels)(client *); +typedef struct { + kvstoreIterState kvstate; + getClientChannelsFn getPubSubChannels; } defragPubSubCtx; +static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); -/* Defrag helper for generic allocations. - * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released - * and should NOT be accessed. */ -void *activeDefragAlloc(void *ptr) { + +/* When scanning a main kvstore, large elements are queued for later handling rather than + * causing a large latency spike while processing a hash table bucket. This list is only used + * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being + * defragged. + * Note that this is a list of key names. It's possible that the key may be deleted or modified + * before "later" and we will search by key name to find the entry when we defrag the item later. + */ +static list *defrag_later; +static unsigned long defrag_later_cursor; + + +/* this method was added to jemalloc in order to help us understand which + * pointers are worthwhile moving and which aren't */ +int je_get_defrag_hint(void *ptr); + +/* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. + * It is the responsibility of the caller to free the old block if a non-NULL value (new block) + * is returned. (Returns NULL if no relocation was needed.) + */ +static void *activeDefragAllocWithoutFree(void *ptr, size_t *allocation_size) { size_t size; void *newptr; if (!allocatorShouldDefrag(ptr)) { @@ -67,28 +170,43 @@ void *activeDefragAlloc(void *ptr) { size = zmalloc_size(ptr); newptr = allocatorDefragAlloc(size); memcpy(newptr, ptr, size); - allocatorDefragFree(ptr, size); + if (allocation_size) *allocation_size = size; + server.stat_active_defrag_hits++; return newptr; } +/* Defrag helper for generic allocations. + * + * Returns NULL in case the allocation wasn't moved. 
+ * When it returns a non-null value, the old pointer was already released + * and should NOT be accessed. */ +void *activeDefragAlloc(void *ptr) { + size_t allocation_size; + void *newptr = activeDefragAllocWithoutFree(ptr, &allocation_size); + if (newptr) allocatorDefragFree(ptr, allocation_size); + return newptr; +} + /* This method captures the expiry db dict entry which refers to data stored in keys db dict entry. */ -void defragEntryStartCbForKeys(void *ctx, void *oldptr) { - defragCtx *defragctx = (defragCtx *)ctx; - serverDb *db = defragctx->privdata; +static void defragEntryStartCbForKeys(void *ctx, void *oldptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + serverDb *db = defragctx->db; sds oldsds = (sds)dictGetKey((dictEntry *)oldptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; if (kvstoreDictSize(db->expires, slot)) { dictEntry *expire_de = kvstoreDictFind(db->expires, slot, oldsds); - defragctx->aux = expire_de; + defragctx->saved_expire_de = expire_de; + } else { + defragctx->saved_expire_de = NULL; } } /* This method updates the key of expiry db dict entry. The key might be no longer valid * as it could have been cleaned up during the defrag-realloc of the main dictionary. */ -void defragEntryFinishCbForKeys(void *ctx, void *newptr) { - defragCtx *defragctx = (defragCtx *)ctx; - dictEntry *expire_de = (dictEntry *)defragctx->aux; +static void defragEntryFinishCbForKeys(void *ctx, void *newptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + dictEntry *expire_de = defragctx->saved_expire_de; /* Item doesn't have TTL associated to it. */ if (!expire_de) return; /* No reallocation happened. */ @@ -96,18 +214,18 @@ void defragEntryFinishCbForKeys(void *ctx, void *newptr) { expire_de = NULL; return; } - serverDb *db = defragctx->privdata; + serverDb *db = defragctx->db; sds newsds = (sds)dictGetKey((dictEntry *)newptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; kvstoreDictSetKey(db->expires, slot, expire_de, newsds); } -/*Defrag helper for sds strings +/* Defrag helper for sds strings * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -sds activeDefragSds(sds sdsptr) { +static sds activeDefragSds(sds sdsptr) { void *ptr = sdsAllocPtr(sdsptr); void *newptr = activeDefragAlloc(ptr); if (newptr) { @@ -118,60 +236,48 @@ sds activeDefragSds(sds sdsptr) { return NULL; } -/* Defrag helper for robj and/or string objects with expected refcount. - * - * Like activeDefragStringOb, but it requires the caller to pass in the expected - * reference count. In some cases, the caller needs to update a robj whose - * reference count is not 1, in these cases, the caller must explicitly pass - * in the reference count, otherwise defragmentation will not be performed. - * Note that the caller is responsible for updating any other references to the robj. */ -robj *activeDefragStringObEx(robj *ob, int expected_refcount) { - robj *ret = NULL; - if (ob->refcount != expected_refcount) return NULL; - - /* try to defrag robj (only if not an EMBSTR type (handled below). */ - if (ob->type != OBJ_STRING || ob->encoding != OBJ_ENCODING_EMBSTR) { - if ((ret = activeDefragAlloc(ob))) { - ob = ret; - } +/* Performs defrag on a string-type (or generic) robj, but does not free the old robj. 
This is the + * caller's responsibility. This is necessary for string objects with multiple references. In this + * case the caller can fix the references before freeing the original object. + */ +static robj *activeDefragStringObWithoutFree(robj *ob, size_t *allocation_size) { + if (ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_RAW) { + // Try to defrag the linked sds, regardless of if robj will be moved + sds newsds = activeDefragSds((sds)ob->ptr); + if (newsds) ob->ptr = newsds; } - /* try to defrag string object */ - if (ob->type == OBJ_STRING) { - if (ob->encoding == OBJ_ENCODING_RAW) { - sds newsds = activeDefragSds((sds)ob->ptr); - if (newsds) { - ob->ptr = newsds; - } - } else if (ob->encoding == OBJ_ENCODING_EMBSTR) { - /* The sds is embedded in the object allocation, calculate the - * offset and update the pointer in the new allocation. */ - long ofs = (intptr_t)ob->ptr - (intptr_t)ob; - if ((ret = activeDefragAlloc(ob))) { - ret->ptr = (void *)((intptr_t)ret + ofs); - } - } else if (ob->encoding != OBJ_ENCODING_INT) { - serverPanic("Unknown string encoding"); - } + robj *new_robj = activeDefragAllocWithoutFree(ob, allocation_size); + + if (new_robj && ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_EMBSTR) { + // If the robj is moved, correct the internal pointer + long embstr_offset = (intptr_t)ob->ptr - (intptr_t)ob; + new_robj->ptr = (void *)((intptr_t)new_robj + embstr_offset); } - return ret; + return new_robj; } + /* Defrag helper for robj and/or string objects * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ robj *activeDefragStringOb(robj *ob) { - return activeDefragStringObEx(ob, 1); + size_t allocation_size; + if (ob->refcount != 1) return NULL; // Unsafe to defrag if multiple refs + robj *new_robj = activeDefragStringObWithoutFree(ob, &allocation_size); + if (new_robj) allocatorDefragFree(ob, allocation_size); + return new_robj; } + /* Defrag helper for lua scripts * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -luaScript *activeDefragLuaScript(luaScript *script) { +static luaScript *activeDefragLuaScript(luaScript *script) { luaScript *ret = NULL; /* try to defrag script struct */ @@ -193,7 +299,7 @@ luaScript *activeDefragLuaScript(luaScript *script) { * Returns NULL in case the allocation wasn't moved. * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. 
*/ -dict *dictDefragTables(dict *d) { +static dict *dictDefragTables(dict *d) { dict *ret = NULL; dictEntry **newtable; /* handle the dict struct */ @@ -211,7 +317,7 @@ dict *dictDefragTables(dict *d) { } /* Internal function used by zslDefrag */ -void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { +static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == oldnode) update[i]->level[i].forward = newnode; @@ -233,7 +339,7 @@ void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnod * only need to defrag the skiplist, but not update the obj pointer. * When return value is non-NULL, it is the score reference that must be updated * in the dict record. */ -double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { +static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; int i; sds ele = newele ? newele : oldele; @@ -267,7 +373,7 @@ double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { /* Defrag helper for sorted set. * Defrag a single dict entry key name, and corresponding skiplist struct */ -void activeDefragZsetEntry(zset *zs, dictEntry *de) { +static void activeDefragZsetEntry(zset *zs, dictEntry *de) { sds newsds; double *newscore; sds sdsele = dictGetKey(de); @@ -284,13 +390,13 @@ void activeDefragZsetEntry(zset *zs, dictEntry *de) { #define DEFRAG_SDS_DICT_VAL_VOID_PTR 3 #define DEFRAG_SDS_DICT_VAL_LUA_SCRIPT 4 -void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { +static void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); } /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ -void activeDefragSdsDict(dict *d, int val_type) { +static void activeDefragSdsDict(dict *d, int val_type) { unsigned long cursor = 0; dictDefragFunctions defragfns = { .defragAlloc = activeDefragAlloc, @@ -306,34 +412,7 @@ void activeDefragSdsDict(dict *d, int val_type) { } /* Defrag a list of ptr, sds or robj string values */ -void activeDefragList(list *l, int val_type) { - listNode *ln, *newln; - for (ln = l->head; ln; ln = ln->next) { - if ((newln = activeDefragAlloc(ln))) { - if (newln->prev) - newln->prev->next = newln; - else - l->head = newln; - if (newln->next) - newln->next->prev = newln; - else - l->tail = newln; - ln = newln; - } - if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) { - sds newsds, sdsele = ln->value; - if ((newsds = activeDefragSds(sdsele))) ln->value = newsds; - } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) { - robj *newele, *ele = ln->value; - if ((newele = activeDefragStringOb(ele))) ln->value = newele; - } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) { - void *newptr, *ptr = ln->value; - if ((newptr = activeDefragAlloc(ptr))) ln->value = newptr; - } - } -} - -void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { +static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; unsigned char *newzl; if ((newnode = activeDefragAlloc(node))) { @@ -350,7 +429,7 @@ void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { if ((newzl = activeDefragAlloc(node->entry))) node->entry = newzl; } -void activeDefragQuickListNodes(quicklist *ql) { +static void 
activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head; while (node) { activeDefragQuickListNode(ql, &node); @@ -361,13 +440,18 @@ void activeDefragQuickListNodes(quicklist *ql) { /* when the value has lots of elements, we want to handle it later and not as * part of the main dictionary scan. this is needed in order to prevent latency * spikes when handling large items */ -void defragLater(serverDb *db, dictEntry *kde) { +static void defragLater(dictEntry *kde) { + if (!defrag_later) { + defrag_later = listCreate(); + listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + defrag_later_cursor = 0; + } sds key = sdsdup(dictGetKey(kde)); - listAddNodeTail(db->defrag_later, key); + listAddNodeTail(defrag_later, key); } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { +static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { quicklist *ql = ob->ptr; quicklistNode *node; long iterations = 0; @@ -392,7 +476,7 @@ long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { activeDefragQuickListNode(ql, &node); server.stat_active_defrag_scanned++; if (++iterations > 128 && !bookmark_failed) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { if (!quicklistBookmarkCreate(&ql, "_AD", node)) { bookmark_failed = 1; } else { @@ -413,14 +497,14 @@ typedef struct { zset *zs; } scanLaterZsetData; -void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { +static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { dictEntry *de = (dictEntry *)_de; scanLaterZsetData *data = privdata; activeDefragZsetEntry(data->zs, de); server.stat_active_defrag_scanned++; } -void scanLaterZset(robj *ob, unsigned long *cursor) { +static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; dict *d = zs->dict; @@ -430,13 +514,13 @@ void scanLaterZset(robj *ob, unsigned long *cursor) { } /* Used as scan callback when all the work is done in the dictDefragFunctions. 
*/ -void scanCallbackCountScanned(void *privdata, const dictEntry *de) { +static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); server.stat_active_defrag_scanned++; } -void scanLaterSet(robj *ob, unsigned long *cursor) { +static void scanLaterSet(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -444,7 +528,7 @@ void scanLaterSet(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void scanLaterHash(robj *ob, unsigned long *cursor) { +static void scanLaterHash(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -453,18 +537,18 @@ void scanLaterHash(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void defragQuicklist(serverDb *db, dictEntry *kde) { +static void defragQuicklist(dictEntry *kde) { robj *ob = dictGetVal(kde); quicklist *ql = ob->ptr, *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql; if (ql->len > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragQuickListNodes(ql); } -void defragZsetSkiplist(serverDb *db, dictEntry *kde) { +static void defragZsetSkiplist(dictEntry *kde) { robj *ob = dictGetVal(kde); zset *zs = (zset *)ob->ptr; zset *newzs; @@ -477,7 +561,7 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else { dictIterator *di = dictGetIterator(zs->dict); while ((de = dictNext(di)) != NULL) { @@ -489,26 +573,26 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } -void defragHash(serverDb *db, dictEntry *kde) { +static void defragHash(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* defrag the dict struct and tables */ if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; } -void defragSet(serverDb *db, dictEntry *kde) { +static void defragSet(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); /* defrag the dict struct and tables */ @@ -517,7 +601,7 @@ void defragSet(serverDb *db, dictEntry *kde) { /* Defrag callback for radix tree iterator, called for each node, * used in order to defrag the nodes allocations. 
*/ -int defragRaxNode(raxNode **noderef) { +static int defragRaxNode(raxNode **noderef) { raxNode *newnode = activeDefragAlloc(*noderef); if (newnode) { *noderef = newnode; @@ -527,7 +611,7 @@ int defragRaxNode(raxNode **noderef) { } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) { +static int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, monotime endtime) { static unsigned char last[sizeof(streamID)]; raxIterator ri; long iterations = 0; @@ -563,7 +647,7 @@ int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) if (newdata) raxSetData(ri.node, ri.data = newdata); server.stat_active_defrag_scanned++; if (++iterations > 128) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { serverAssert(ri.key_len == sizeof(last)); memcpy(last, ri.key, ri.key_len); raxStop(&ri); @@ -585,7 +669,7 @@ typedef void *(raxDefragFunction)(raxIterator *ri, void *privdata); * 2) rax nodes * 3) rax entry data (only if defrag_data is specified) * 4) call a callback per element, and allow the callback to return a new pointer for the element */ -void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { +static void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { raxIterator ri; rax *rax; if ((rax = activeDefragAlloc(*raxref))) *raxref = rax; @@ -608,7 +692,7 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; streamNACK *nack = ri->data, *newnack; nack->consumer = ctx->c; /* update nack pointer to consumer */ @@ -622,7 +706,7 @@ void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { return newnack; } -void *defragStreamConsumer(raxIterator *ri, void *privdata) { +static void *defragStreamConsumer(raxIterator *ri, void *privdata) { streamConsumer *c = ri->data; streamCG *cg = privdata; void *newc = activeDefragAlloc(c); @@ -638,7 +722,7 @@ void *defragStreamConsumer(raxIterator *ri, void *privdata) { return newc; /* returns NULL if c was not defragged */ } -void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { streamCG *cg = ri->data; UNUSED(privdata); if (cg->consumers) defragRadixTree(&cg->consumers, 0, defragStreamConsumer, cg); @@ -646,7 +730,7 @@ void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { return NULL; } -void defragStream(serverDb *db, dictEntry *kde) { +static void defragStream(dictEntry *kde) { robj *ob = dictGetVal(kde); serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); stream *s = ob->ptr, *news; @@ -657,7 +741,7 @@ void defragStream(serverDb *db, dictEntry *kde) { if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { rax *newrax = activeDefragAlloc(s->rax); if (newrax) s->rax = newrax; - defragLater(db, kde); + defragLater(kde); } else defragRadixTree(&s->rax, 1, NULL, NULL); @@ -667,25 +751,25 @@ void defragStream(serverDb *db, dictEntry *kde) { /* Defrag a module key. This is either done immediately or scheduled * for later. Returns then number of pointers defragged. 
*/ -void defragModule(serverDb *db, dictEntry *kde) { +static void defragModule(serverDb *db, dictEntry *kde) { robj *obj = dictGetVal(kde); serverAssert(obj->type == OBJ_MODULE); - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(db, kde); + if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(kde); } /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ -void defragKey(defragCtx *ctx, dictEntry *de) { - serverDb *db = ctx->privdata; - int slot = ctx->slot; +static void defragKey(defragKeysCtx *ctx, dictEntry *de) { + serverDb *db = ctx->db; + int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; /* Try to defrag robj and / or string value. */ ob = dictGetVal(de); if ((newob = activeDefragStringOb(ob))) { - kvstoreDictSetVal(db->keys, slot, de, newob); + kvstoreDictSetVal(ctx->kvstate.kvs, slot, de, newob); ob = newob; } @@ -693,7 +777,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { /* Already handled in activeDefragStringOb. */ } else if (ob->type == OBJ_LIST) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { - defragQuicklist(db, de); + defragQuicklist(de); } else if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else { @@ -701,7 +785,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } } else if (ob->type == OBJ_SET) { if (ob->encoding == OBJ_ENCODING_HT) { - defragSet(db, de); + defragSet(de); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; @@ -712,7 +796,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { - defragZsetSkiplist(db, de); + defragZsetSkiplist(de); } else { serverPanic("Unknown sorted set encoding"); } @@ -720,12 +804,12 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { - defragHash(db, de); + defragHash(de); } else { serverPanic("Unknown hash encoding"); } } else if (ob->type == OBJ_STREAM) { - defragStream(db, de); + defragStream(de); } else if (ob->type == OBJ_MODULE) { defragModule(db, de); } else { @@ -734,9 +818,9 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } /* Defrag scan callback for the main db dictionary. */ -void defragScanCallback(void *privdata, const dictEntry *de) { +static void dbKeysScanCallback(void *privdata, const dictEntry *de) { long long hits_before = server.stat_active_defrag_hits; - defragKey((defragCtx *)privdata, (dictEntry *)de); + defragKey((defragKeysCtx *)privdata, (dictEntry *)de); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -750,7 +834,7 @@ void defragScanCallback(void *privdata, const dictEntry *de) { * fragmentation ratio in order to decide if a defrag action should be taken * or not, a false detection can cause the defragmenter to waste a lot of CPU * without the possibility of getting any results. 
*/ -float getAllocatorFragmentation(size_t *out_frag_bytes) { +static float getAllocatorFragmentation(size_t *out_frag_bytes) { size_t resident, active, allocated, frag_smallbins_bytes; zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); @@ -768,18 +852,18 @@ float getAllocatorFragmentation(size_t *out_frag_bytes) { } /* Defrag scan callback for the pubsub dictionary. */ -void defragPubsubScanCallback(void *privdata, const dictEntry *de) { - defragCtx *ctx = privdata; - defragPubSubCtx *pubsub_ctx = ctx->privdata; - kvstore *pubsub_channels = pubsub_ctx->pubsub_channels; +static void defragPubsubScanCallback(void *privdata, const dictEntry *de) { + defragPubSubCtx *ctx = privdata; + kvstore *pubsub_channels = ctx->kvstate.kvs; robj *newchannel, *channel = dictGetKey(de); dict *newclients, *clients = dictGetVal(de); + size_t allocation_size; /* Try to defrag the channel name. */ serverAssert(channel->refcount == (int)dictSize(clients) + 1); - newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1); + newchannel = activeDefragStringObWithoutFree(channel, &allocation_size); if (newchannel) { - kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry *)de, newchannel); + kvstoreDictSetKey(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newchannel); /* The channel name is shared by the client's pubsub(shard) and server's * pubsub(shard), after defraging the channel name, we need to update @@ -788,35 +872,26 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) { dictEntry *clientde; while ((clientde = dictNext(di)) != NULL) { client *c = dictGetKey(clientde); - dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel); + dict *client_channels = ctx->getPubSubChannels(c); + dictEntry *pubsub_channel = dictFind(client_channels, newchannel); serverAssert(pubsub_channel); - dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel); + dictSetKey(ctx->getPubSubChannels(c), pubsub_channel, newchannel); } dictReleaseIterator(di); + // Now that we're done correcting the references, we can safely free the old channel robj + allocatorDefragFree(channel, allocation_size); } /* Try to defrag the dictionary of clients that is stored as the value part. */ if ((newclients = dictDefragTables(clients))) - kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry *)de, newclients); + kvstoreDictSetVal(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newclients); server.stat_active_defrag_scanned++; } -/* We may need to defrag other globals, one small allocation can hold a full allocator run. - * so although small, it is still important to defrag these */ -void defragOtherGlobals(void) { - /* there are many more pointers to defrag (e.g. client argv, output / aof buffers, etc. - * but we assume most of these are short lived, we only need to defrag allocations - * that remain static for a long time */ - activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); - moduleDefragGlobals(); - kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables); - kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables); -} - /* returns 0 more work may or may not be needed (see non-zero cursor), * and 1 if time is up and more work is needed. 
*/ -int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int dbid) { +static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtime, int dbid) { if (de) { robj *ob = dictGetVal(de); if (ob->type == OBJ_LIST) { @@ -830,7 +905,8 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); + long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); + return moduleLateDefrag(dictGetKey(de), ob, cursor, endtimeWallClock, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } @@ -840,299 +916,474 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int return 0; } -/* static variables serving defragLaterStep to continue scanning a key from were we stopped last time. */ -static sds defrag_later_current_key = NULL; -static unsigned long defrag_later_cursor = 0; -/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. */ -int defragLaterStep(serverDb *db, int slot, long long endtime) { +// A kvstoreHelperPreContinueFn +static doneStatus defragLaterStep(monotime endtime, void *privdata) { + defragKeysCtx *ctx = privdata; + unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long key_defragged; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_later_cursor) { - listNode *head = listFirst(db->defrag_later); - - /* Move on to next key */ - if (defrag_later_current_key) { - serverAssert(defrag_later_current_key == head->value); - listDelNode(db->defrag_later, head); - defrag_later_cursor = 0; - defrag_later_current_key = NULL; - } + while (defrag_later && listLength(defrag_later) > 0) { + listNode *head = listFirst(defrag_later); + sds key = head->value; + dictEntry *de = kvstoreDictFind(ctx->kvstate.kvs, ctx->kvstate.slot, key); - /* stop if we reached the last one. */ - head = listFirst(db->defrag_later); - if (!head) return 0; - - /* start a new key */ - defrag_later_current_key = head->value; - defrag_later_cursor = 0; - } - - /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */ - dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key); - key_defragged = server.stat_active_defrag_hits; - do { - int quit = 0; - if (defragLaterItem(de, &defrag_later_cursor, endtime, db->id)) - quit = 1; /* time is up, we didn't finish all the work */ - - /* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields - * (if we have a lot of pointers in one hash bucket, or rehashing), - * check if we reached the time limit. 
*/ - if (quit || (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64)) { - if (quit || ustime() > endtime) { - if (key_defragged != server.stat_active_defrag_hits) - server.stat_active_defrag_key_hits++; - else - server.stat_active_defrag_key_misses++; - return 1; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (defrag_later_cursor); - if (key_defragged != server.stat_active_defrag_hits) + long long key_defragged = server.stat_active_defrag_hits; + bool timeout = (defragLaterItem(de, &defrag_later_cursor, endtime, ctx->db->id) == 1); + if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; - else + } else { server.stat_active_defrag_key_misses++; - } while (1); -} + } -#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) -#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) + if (timeout) break; -/* decide if defrag is needed, and at what CPU effort to invest in it */ -void computeDefragCycles(void) { - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - /* If we're not already running, and below the threshold, exit. */ - if (!server.active_defrag_running) { - if (frag_pct < server.active_defrag_threshold_lower || frag_bytes < server.active_defrag_ignore_bytes) return; + if (defrag_later_cursor == 0) { + // the item is finished, move on + listDelNode(defrag_later, head); + } + + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || + server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() > endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; + } } - /* Calculate the adaptive aggressiveness of the defrag based on the current - * fragmentation and configurations. */ - int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, - server.active_defrag_cycle_min, server.active_defrag_cycle_max); - cpu_pct = LIMIT(cpu_pct, server.active_defrag_cycle_min, server.active_defrag_cycle_max); + return (!defrag_later || listLength(defrag_later) == 0) ? DEFRAG_DONE : DEFRAG_NOT_DONE; +} - /* Normally we allow increasing the aggressiveness during a scan, but don't - * reduce it, since we should not lower the aggressiveness when fragmentation - * drops. But when a configuration is made, we should reconsider it. */ - if (cpu_pct > server.active_defrag_running || server.active_defrag_configuration_changed) { - server.active_defrag_running = cpu_pct; - server.active_defrag_configuration_changed = 0; - serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", frag_pct, frag_bytes, - cpu_pct); + +/* This helper function handles most of the work for iterating over a kvstore. 'privdata', if + * provided, MUST begin with 'kvstoreIterState' and this part is automatically updated by this + * function during the iteration. 
*/ +static doneStatus defragStageKvstoreHelper(monotime endtime, + kvstore *kvs, + dictScanFunction scan_fn, + kvstoreHelperPreContinueFn precontinue_fn, + const dictDefragFunctions *defragfns, + void *privdata) { + static kvstoreIterState state; // STATIC - this persists + if (endtime == 0) { + // Starting the stage, set up the state information for this stage + state.kvs = kvs; + state.slot = KVS_SLOT_DEFRAG_LUT; + state.cursor = 0; + return DEFRAG_NOT_DONE; } -} + serverAssert(kvs == state.kvs); // Shouldn't change during the stage -/* Perform incremental defragmentation work from the serverCron. - * This works in a similar way to activeExpireCycle, in the sense that - * we do incremental work across calls. */ -void activeDefragCycle(void) { - static int slot = -1; - static int current_db = -1; - static int defrag_later_item_in_progress = 0; - static int defrag_stage = 0; - static unsigned long defrag_cursor = 0; - static serverDb *db = NULL; - static long long start_scan, start_stat; unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long start, timelimit, endtime; - mstime_t latency; - int all_stages_finished = 0; - int quit = 0; - if (!server.active_defrag_enabled) { - if (server.active_defrag_running) { - /* if active defrag was disabled mid-run, start from fresh next time. */ - server.active_defrag_running = 0; - server.active_defrag_configuration_changed = 0; - if (db) listEmpty(db->defrag_later); - defrag_later_current_key = NULL; - defrag_later_cursor = 0; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - goto update_metrics; + if (state.slot == KVS_SLOT_DEFRAG_LUT) { + // Before we start scanning the kvstore, handle the main structures + do { + state.cursor = kvstoreDictLUTDefrag(kvs, state.cursor, dictDefragTables); + if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; + } while (state.cursor != 0); + state.slot = KVS_SLOT_UNASSIGNED; + } + + while (true) { + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() >= endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; } - return; + + if (precontinue_fn) { + if (privdata) *(kvstoreIterState *)privdata = state; + if (precontinue_fn(endtime, privdata) == DEFRAG_NOT_DONE) return DEFRAG_NOT_DONE; + } + + if (!state.cursor) { + // If there's no cursor, we're ready to begin a new kvstore slot. + if (state.slot == KVS_SLOT_UNASSIGNED) { + state.slot = kvstoreGetFirstNonEmptyDictIndex(kvs); + } else { + state.slot = kvstoreGetNextNonEmptyDictIndex(kvs, state.slot); + } + + if (state.slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE; + } + + // Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. + if (privdata) *(kvstoreIterState *)privdata = state; + state.cursor = kvstoreDictScanDefrag(kvs, state.slot, state.cursor, + scan_fn, defragfns, privdata); } - if (hasActiveChildProcess()) return; /* Defragging memory while there's a fork will just do damage. */ + return DEFRAG_NOT_DONE; +} + - /* Once a second, check if the fragmentation justfies starting a scan - * or making it more aggressive. 
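Illustration (not part of the diff): the privdata contract above relies on C's
guarantee that a pointer to a struct, suitably converted, points to its first
member, so the helper can publish the updated kvstoreIterState through a plain
cast. A minimal sketch of a conforming stage context, with a hypothetical name
in the spirit of defragKeysCtx:

    typedef struct {
        kvstoreIterState kvstate; /* must be FIRST: the helper updates it via
                                   * "*(kvstoreIterState *)privdata = state" */
        serverDb *db;             /* stage-specific fields follow */
    } exampleStageCtx;            /* hypothetical name */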
*/ - run_with_period(1000) { - computeDefragCycles(); +// Note: target is a DB, (not a KVS like most stages) +static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + serverDb *db = (serverDb *)target; + + static defragKeysCtx ctx; // STATIC - this persists + if (endtime == 0) { + ctx.db = db; + // Don't return yet. Call the helper with endtime==0 below. } + serverAssert(ctx.db == db); - /* Normally it is checked once a second, but when there is a configuration - * change, we want to check it as soon as possible. */ - if (server.active_defrag_configuration_changed) { - computeDefragCycles(); - server.active_defrag_configuration_changed = 0; + /* Note: for DB keys, we use the start/finish callback to fix an expires table entry if + * the main DB entry has been moved. */ + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by dbKeysScanCallback + .defragVal = NULL, // Handled by dbKeysScanCallback + .defragEntryStartCb = defragEntryStartCbForKeys, + .defragEntryFinishCb = defragEntryFinishCbForKeys}; + + return defragStageKvstoreHelper(endtime, db->keys, + dbKeysScanCallback, defragLaterStep, &defragfns, &ctx); +} + + +static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Not needed for expires (just a ref) + .defragVal = NULL, // Not needed for expires (no value) + }; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + scanCallbackCountScanned, NULL, &defragfns, NULL); +} + + +static doneStatus defragStagePubsubKvstore(monotime endtime, void *target, void *privdata) { + // target is server.pubsub_channels or server.pubsubshard_channels + getClientChannelsFnWrapper *fnWrapper = privdata; + + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by defragPubsubScanCallback + .defragVal = NULL, // Not needed for expires (no value) + }; + defragPubSubCtx ctx; + + ctx.getPubSubChannels = fnWrapper->fn; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + defragPubsubScanCallback, NULL, &defragfns, &ctx); +} + + +static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); + return DEFRAG_DONE; +} + + +static doneStatus defragModuleGlobals(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + moduleDefragGlobals(); + return DEFRAG_DONE; +} + + +static bool defragIsRunning(void) { + return (defrag.timeproc_id > 0); +} + + +static void addDefragStage(defragStageFn stage_fn, void *target, void *privdata) { + StageDescriptor *stage = zmalloc(sizeof(StageDescriptor)); + stage->stage_fn = stage_fn; + stage->target = target; + stage->privdata = privdata; + listAddNodeTail(defrag.remaining_stages, stage); +} + + +// Called at the end of a complete defrag cycle, or when defrag is terminated +static void endDefragCycle(bool normal_termination) { + if (normal_termination) { + // For normal termination, we expect... 
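+        //   ...namely that every stage ran to completion and that the
+        //   defrag_later queue was fully drained, as the asserts below verify.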
+ serverAssert(!defrag.current_stage); + serverAssert(listLength(defrag.remaining_stages) == 0); + serverAssert(!defrag_later || listLength(defrag_later) == 0); + } else { + // Defrag is being terminated abnormally + aeDeleteTimeEvent(server.el, defrag.timeproc_id); + + if (defrag.current_stage) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + listSetFreeMethod(defrag.remaining_stages, zfree); } + defrag.timeproc_id = AE_DELETED_EVENT_ID; - if (!server.active_defrag_running) return; + listRelease(defrag.remaining_stages); + defrag.remaining_stages = NULL; - /* See activeExpireCycle for how timelimit is handled. */ - start = ustime(); - timelimit = 1000000 * server.active_defrag_running / server.hz / 100; - if (timelimit <= 0) timelimit = 1; - endtime = start + timelimit; - latencyStartMonitor(latency); + if (defrag_later) { + listRelease(defrag_later); + defrag_later = NULL; + } + defrag_later_cursor = 0; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragEntryStartCb = defragEntryStartCbForKeys, - .defragEntryFinishCb = defragEntryFinishCbForKeys}; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_stage && !defrag_cursor && (slot < 0)) { - /* finish any leftovers from previous db before moving to the next one */ - if (db && defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", + (int)elapsedMs(defrag.start_cycle), (int)(server.stat_active_defrag_hits - defrag.start_defrag_hits), + frag_pct, frag_bytes); - /* Move on to next database, and stop if we reached the last one. */ - if (++current_db >= server.dbnum) { - /* defrag other items not part of the db / keys */ - defragOtherGlobals(); - - long long now = ustime(); - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", - (int)((now - start_scan) / 1000), (int)(server.stat_active_defrag_hits - start_stat), - frag_pct, frag_bytes); - - start_scan = now; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - server.active_defrag_running = 0; - - computeDefragCycles(); /* if another scan is needed, start it right away */ - if (server.active_defrag_running != 0 && ustime() < endtime) continue; - break; - } else if (current_db == 0) { - /* Start a scan from the first database. */ - start_scan = ustime(); - start_stat = server.stat_active_defrag_hits; - } + server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); + server.stat_last_active_defrag_time = 0; + server.active_defrag_cpu_percent = 0; +} + + +/* Must be called at the start of the timeProc as it measures the delay from the end of the previous + * timeProc invocation when performing the computation. 
*/ +static int computeDefragCycleUs(void) { + long dutyCycleUs; - db = &server.db[current_db]; - kvstoreDictLUTDefrag(db->keys, dictDefragTables); - kvstoreDictLUTDefrag(db->expires, dictDefragTables); - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + static int prevCpuPercent = 0; // STATIC - this persists + if (targetCpuPercent != prevCpuPercent) { + /* If the targetCpuPercent changes, the value might be different from when the last wait + * time was computed. In this case, don't consider wait time. (This is really only an + * issue in crazy tests that dramatically increase CPU while defrag is running.) */ + defrag.timeproc_end_time = 0; + prevCpuPercent = targetCpuPercent; + } + + // Given when the last duty cycle ended, compute time needed to achieve the desired percentage. + if (defrag.timeproc_end_time == 0) { + // Either the first call to the timeProc, or we were paused for some reason. + defrag.timeproc_overage_us = 0; + dutyCycleUs = server.active_defrag_cycle_us; + } else { + long waitedUs = getMonotonicUs() - defrag.timeproc_end_time; + /* Given the elapsed wait time between calls, compute the necessary duty time needed to + * achieve the desired CPU percentage. + * With: D = duty time, W = wait time, P = percent + * Solve: D P + * ----- = ----- + * D + W 100 + * Solving for D: + * D = P * W / (100 - P) + * + * Note that dutyCycleUs addresses starvation. If the wait time was long, we will compensate + * with a proportionately long duty-cycle. This won't significantly affect perceived + * latency, because clients are already being impacted by the long cycle time which caused + * the starvation of the timer. */ + dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); + + // Also adjust for any accumulated overage(underage). + dutyCycleUs -= defrag.timeproc_overage_us; + defrag.timeproc_overage_us = 0; + + if (dutyCycleUs < server.active_defrag_cycle_us) { + /* We never reduce our cycle time, that would increase overhead. Instead, we track this + * as part of the overage, and increase wait time between cycles. */ + defrag.timeproc_overage_us = server.active_defrag_cycle_us - dutyCycleUs; + dutyCycleUs = server.active_defrag_cycle_us; } + } + return dutyCycleUs; +} - /* This array of structures holds the parameters for all defragmentation stages. */ - typedef struct defragStage { - kvstore *kvs; - dictScanFunction *scanfn; - void *privdata; - } defragStage; - defragStage defrag_stages[] = { - {db->keys, defragScanCallback, db}, - {db->expires, scanCallbackCountScanned, NULL}, - {server.pubsub_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}}, - {server.pubsubshard_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}}, - }; - do { - int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]); - serverAssert(defrag_stage < num_stages); - defragStage *current_stage = &defrag_stages[defrag_stage]; - - /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */ - if (defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } - if (!defrag_later_item_in_progress) { - /* Continue defragmentation from the previous stage. 
-                 * If slot is -1, it means this stage starts from the first non-empty slot. */
-                if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs);
-                defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor, current_stage->scanfn,
-                                                      &defragfns, &(defragCtx){current_stage->privdata, slot});
-            }
+/* Must be called at the end of the timeProc as it records the timeproc_end_time for use in the next
+ * computeDefragCycleUs computation. */
+static int computeDelayMs(monotime intendedEndtime) {
+    defrag.timeproc_end_time = getMonotonicUs();
+    int overage = defrag.timeproc_end_time - intendedEndtime;
+    defrag.timeproc_overage_us += overage; // track over/under desired CPU
+
+    int targetCpuPercent = server.active_defrag_cpu_percent;
+    serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100);
+
+    // Given the desired duty cycle, what inter-cycle delay do we need to achieve that?
+    // We want to achieve a specific CPU percent. To do that, we can't use a skewed computation.
+    // Example, if we run for 1ms and delay 10ms, that's NOT 10%, because the total cycle time is 11ms.
+    // Instead, if we run for 1ms, our total time should be 10ms. So the delay is only 9ms.
+    long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent;
+    long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us;
+    // Only increase delay by the fraction of the overage that would be non-duty-cycle
+    delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative
+    if (delayUs < 0) delayUs = 0;
+    long delayMs = delayUs / 1000; // round down
+    return delayMs;
+}
 
-            if (!defrag_cursor) {
-                /* Move to the next slot only if regular and large item scanning has been completed. */
-                if (listLength(db->defrag_later) > 0) {
-                    defrag_later_item_in_progress = 1;
-                    continue;
-                }
-                /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. */
-                if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1) defrag_stage++;
-                defrag_later_item_in_progress = 0;
-            }
+/* An independent time proc for defrag. While defrag is running, this is called much more often
+ * than the server cron. Frequent short calls provide low latency impact. */
+static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) {
+    UNUSED(eventLoop);
+    UNUSED(id);
+    UNUSED(clientData);
 
-            /* Check if all defragmentation stages have been processed.
-             * If so, mark as finished and reset the stage counter to move on to next database. */
-            if (defrag_stage == num_stages) {
-                all_stages_finished = 1;
-                defrag_stage = 0;
-            }
+    // This timer shouldn't be registered unless there's work to do.
+    serverAssert(defrag.current_stage || listLength(defrag.remaining_stages) > 0);
 
-            /* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys
-             * (if we have a lot of pointers in one hash bucket or rehashing),
-             * check if we reached the time limit.
-             * But regardless, don't start a new db in this loop, this is because after
-             * the last db we call defragOtherGlobals, which must be done in one cycle */
-            if (all_stages_finished || ++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 ||
-                server.stat_active_defrag_scanned - prev_scanned > 64) {
-                /* Quit if all stages were finished or timeout.
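Illustration (not part of the diff): plugging numbers into the two formulas
above, with a target of P = 25%, active-defrag-cycle-us = 500, a measured wait
of W = 1500us since the previous cycle ended, and no accumulated overage:

    computeDefragCycleUs: D = P * W / (100 - P) = 25 * 1500 / 75 = 500us of duty time
    computeDelayMs:       totalCycleTimeUs = 500 * 100 / 25 = 2000us, delayUs = 1500us (returned as 1ms)

so duty / (duty + delay) = 500 / 2000 = 25%, as intended.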
*/ - if (all_stages_finished || ustime() > endtime) { - quit = 1; - break; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (!all_stages_finished && !quit); - } while (!quit); + if (!server.active_defrag_enabled) { + // Defrag has been disabled while running + endDefragCycle(false); + return AE_NOMORE; + } + + if (hasActiveChildProcess()) { + // If there's a child process, pause the defrag, polling until the child completes. + defrag.timeproc_end_time = 0; // prevent starvation recovery + return 100; + } + + monotime starttime = getMonotonicUs(); + monotime endtime = starttime + computeDefragCycleUs(); + + mstime_t latency; + latencyStartMonitor(latency); + + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } + + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); -update_metrics: - if (server.active_defrag_running > 0) { - if (server.stat_last_active_defrag_time == 0) elapsedStart(&server.stat_last_active_defrag_time); - } else if (server.stat_last_active_defrag_time != 0) { - server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); - server.stat_last_active_defrag_time = 0; + if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + return computeDelayMs(endtime); + } else { + endDefragCycle(true); + return AE_NOMORE; // Ends the timer proc + } +} + + +/* During long running scripts, or while loading, there is a periodic function for handling other + * actions. This interface allows defrag to continue running, avoiding a single long defrag step + * after the long operation completes. */ +void defragWhileBlocked(void) { + if (!defragIsRunning()) return; + + // Save off the timeproc_id. If we have a normal termination, it will be cleared. + long long timeproc_id = defrag.timeproc_id; + + // Simulate a single call of the timer proc + long long reschedule_delay = activeDefragTimeProc(NULL, 0, NULL); + if (reschedule_delay == AE_NOMORE) { + // If it's done, deregister the timer + aeDeleteTimeEvent(server.el, timeproc_id); } + /* Otherwise, just ignore the reschedule_delay, the timer will pop the next time that the + * event loop can process timers again. 
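Illustration (not part of the diff): every stage function follows the same
protocol driven by activeDefragTimeProc above: a first call with endtime == 0
performs initialization and must return DEFRAG_NOT_DONE, and later calls do
bounded work against the deadline. A hypothetical stage, where
more_work_remains() and defrag_one_batch() are invented placeholders:

    static doneStatus defragStageExample(monotime endtime, void *target, void *privdata) {
        UNUSED(target);
        UNUSED(privdata);
        if (endtime == 0) return DEFRAG_NOT_DONE; /* required initialization call */
        while (more_work_remains()) {             /* hypothetical progress check */
            defrag_one_batch();                   /* hypothetical unit of work */
            if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; /* resume next cycle */
        }
        return DEFRAG_DONE;
    }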
*/ +} + + +static void beginDefragCycle(void) { + serverAssert(!defragIsRunning()); + + serverAssert(defrag.remaining_stages == NULL); + defrag.remaining_stages = listCreate(); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + serverDb *db = &server.db[dbid]; + addDefragStage(defragStageDbKeys, db, NULL); + addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + } + + static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; + static getClientChannelsFnWrapper getClientPubSubShardChannelsFn = {getClientPubSubShardChannels}; + addDefragStage(defragStagePubsubKvstore, server.pubsub_channels, &getClientPubSubChannelsFn); + addDefragStage(defragStagePubsubKvstore, server.pubsubshard_channels, &getClientPubSubShardChannelsFn); + + addDefragStage(defragLuaScripts, NULL, NULL); + addDefragStage(defragModuleGlobals, NULL, NULL); + + defrag.current_stage = NULL; + defrag.start_cycle = getMonotonicUs(); + defrag.start_defrag_hits = server.stat_active_defrag_hits; + defrag.timeproc_end_time = 0; + defrag.timeproc_overage_us = 0; + defrag.timeproc_id = aeCreateTimeEvent(server.el, 0, activeDefragTimeProc, NULL, NULL); + + elapsedStart(&server.stat_last_active_defrag_time); +} + + +#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) +#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) + +/* decide if defrag is needed, and at what CPU effort to invest in it */ +static void updateDefragCpuPercent(void) { + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + if (server.active_defrag_cpu_percent == 0) { + if (frag_pct < server.active_defrag_threshold_lower || + frag_bytes < server.active_defrag_ignore_bytes) return; + } + + /* Calculate the adaptive aggressiveness of the defrag based on the current + * fragmentation and configurations. */ + int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, + server.active_defrag_cpu_min, server.active_defrag_cpu_max); + cpu_pct = LIMIT(cpu_pct, server.active_defrag_cpu_min, server.active_defrag_cpu_max); + + /* Normally we allow increasing the aggressiveness during a scan, but don't + * reduce it, since we should not lower the aggressiveness when fragmentation + * drops. But when a configuration is made, we should reconsider it. */ + if (cpu_pct > server.active_defrag_cpu_percent || server.active_defrag_configuration_changed) { + server.active_defrag_configuration_changed = 0; + if (defragIsRunning()) { + serverLog(LL_VERBOSE, "Changing active defrag CPU, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } else { + serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } + server.active_defrag_cpu_percent = cpu_pct; + } +} + + +void monitorActiveDefrag(void) { + if (!server.active_defrag_enabled) return; + + /* Defrag gets paused while a child process is active. So there's no point in starting a new + * cycle or adjusting the CPU percentage for an existing cycle. */ + if (hasActiveChildProcess()) return; + + updateDefragCpuPercent(); + + if (server.active_defrag_cpu_percent > 0 && !defragIsRunning()) beginDefragCycle(); } #else /* HAVE_DEFRAG */ -void activeDefragCycle(void) { +void monitorActiveDefrag(void) { /* Not implemented yet. 
*/ } @@ -1146,4 +1397,7 @@ robj *activeDefragStringOb(robj *ob) { return NULL; } +void defragWhileBlocked(void) { +} + #endif diff --git a/src/dict.c b/src/dict.c index 48c0f815bb..f75369d533 100644 --- a/src/dict.c +++ b/src/dict.c @@ -1321,7 +1321,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { /* Reallocate the dictEntry, key and value allocations in a bucket using the * provided allocation functions in order to defrag them. */ -static void dictDefragBucket(dictEntry **bucketref, dictDefragFunctions *defragfns, void *privdata) { +static void dictDefragBucket(dictEntry **bucketref, const dictDefragFunctions *defragfns, void *privdata) { dictDefragAllocFunction *defragalloc = defragfns->defragAlloc; dictDefragAllocFunction *defragkey = defragfns->defragKey; dictDefragAllocFunction *defragval = defragfns->defragVal; @@ -1499,7 +1499,7 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *pri * where NULL means that no reallocation happened and the old memory is still * valid. */ unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata) { +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata) { int htidx0, htidx1; const dictEntry *de, *next; unsigned long m0, m1; diff --git a/src/dict.h b/src/dict.h index 88ebd7bf99..854d026cdc 100644 --- a/src/dict.h +++ b/src/dict.h @@ -238,7 +238,7 @@ void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size); diff --git a/src/kvstore.c b/src/kvstore.c index 49662f330a..344a8af5cf 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -739,7 +739,7 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata) { dict *d = kvstoreGetDict(kvs, didx); if (!d) return 0; @@ -750,14 +750,27 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, * within dict, it only reallocates the memory used by the dict structure itself using * the provided allocation function. This feature was added for the active defrag feature. * - * The 'defragfn' callback is called with a reference to the dict - * that callback can reallocate. */ -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { + * With 16k dictionaries for cluster mode with 1 shard, this operation may require substantial time + * to execute. A "cursor" is used to perform the operation iteratively. When first called, a + * cursor value of 0 should be provided. The return value is an updated cursor which should be + * provided on the next iteration. The operation is complete when 0 is returned. + * + * The 'defragfn' callback is called with a reference to the dict that callback can reallocate. 
*/ +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn) { + for (int didx = cursor; didx < kvs->num_dicts; didx++) { dict **d = kvstoreGetDictRef(kvs, didx), *newd; if (!*d) continue; + + listNode *rehashing_node = NULL; + if (listLength(kvs->rehashing) > 0) { + rehashing_node = ((kvstoreDictMetadata *)dictMetadata(*d))->rehashing_node; + } + if ((newd = defragfn(*d))) *d = newd; + if (rehashing_node) listNodeValue(rehashing_node) = *d; + return (didx + 1); } + return 0; } uint64_t kvstoreGetHash(kvstore *kvs, const void *key) { diff --git a/src/kvstore.h b/src/kvstore.h index 81a0d9a96e..00ec472e73 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -68,10 +68,10 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata); typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn); +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn); void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing); diff --git a/src/server.c b/src/server.c index ef9f523145..d77f67248c 100644 --- a/src/server.c +++ b/src/server.c @@ -1140,8 +1140,8 @@ void databasesCron(void) { } } - /* Defrag keys gradually. */ - activeDefragCycle(); + /* Start active defrag cycle or adjust defrag CPU if needed. */ + monitorActiveDefrag(); /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. Otherwise rehashing is bad @@ -1611,24 +1611,7 @@ void whileBlockedCron(void) { mstime_t latency; latencyStartMonitor(latency); - /* In some cases we may be called with big intervals, so we may need to do - * extra work here. This is because some of the functions in serverCron rely - * on the fact that it is performed every 10 ms or so. For instance, if - * activeDefragCycle needs to utilize 25% cpu, it will utilize 2.5ms, so we - * need to call it multiple times. */ - long hz_ms = 1000 / server.hz; - while (server.blocked_last_cron < server.mstime) { - /* Defrag keys gradually. */ - activeDefragCycle(); - - server.blocked_last_cron += hz_ms; - - /* Increment cronloop so that run_with_period works. */ - server.cronloops++; - } - - /* Other cron jobs do not need to be done in a loop. No need to check - * server.blocked_last_cron since we have an early exit at the top. */ + defragWhileBlocked(); /* Update memory stats during loading (excluding blocked scripts) */ if (server.loading) cronUpdateMemoryStats(); @@ -2120,7 +2103,7 @@ void initServerConfig(void) { server.aof_flush_postponed_start = 0; server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; - server.active_defrag_running = 0; + server.active_defrag_cpu_percent = 0; server.active_defrag_configuration_changed = 0; server.notify_keyspace_events = 0; server.blocked_clients = 0; @@ -2722,8 +2705,6 @@ void initServer(void) { server.db[j].watched_keys = dictCreate(&keylistDictType); server.db[j].id = j; server.db[j].avg_ttl = 0; - server.db[j].defrag_later = listCreate(); - listSetFreeMethod(server.db[j].defrag_later, (void (*)(void *))sdsfree); } evictionPoolAlloc(); /* Initialize the LRU keys pool. 
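Illustration (not part of the diff): the cursor contract of kvstoreDictLUTDefrag
above is the usual scan-style loop; defragStageKvstoreHelper drives it exactly
this way, with a deadline check between iterations:

    unsigned long cursor = 0;
    do {
        cursor = kvstoreDictLUTDefrag(kvs, cursor, dictDefragTables);
        /* a time-bounded caller checks its deadline here */
    } while (cursor != 0);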
*/ /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which @@ -5704,7 +5685,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "mem_aof_buffer:%zu\r\n", mh->aof_buffer, "mem_allocator:%s\r\n", ZMALLOC_LIB, "mem_overhead_db_hashtable_rehashing:%zu\r\n", mh->overhead_db_hashtable_rehashing, - "active_defrag_running:%d\r\n", server.active_defrag_running, + "active_defrag_running:%d\r\n", server.active_defrag_cpu_percent, "lazyfree_pending_objects:%zu\r\n", lazyfreeGetPendingObjectsCount(), "lazyfreed_objects:%zu\r\n", lazyfreeGetFreedObjectsCount())); freeMemoryOverheadData(mh); diff --git a/src/server.h b/src/server.h index b9e8be9479..0aac1acbd8 100644 --- a/src/server.h +++ b/src/server.h @@ -961,7 +961,6 @@ typedef struct serverDb { int id; /* Database ID */ long long avg_ttl; /* Average TTL, just for stats */ unsigned long expires_cursor; /* Cursor of the active expire cycle. */ - list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. */ } serverDb; /* forward declaration for functions ctx */ @@ -1702,7 +1701,7 @@ struct valkeyServer { int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ int activerehashing; /* Incremental rehash in serverCron() */ - int active_defrag_running; /* Active defragmentation running (holds current scan aggressiveness) */ + int active_defrag_cpu_percent; /* Current desired CPU percentage for active defrag */ char *pidfile; /* PID file path */ int arch_bits; /* 32 or 64 depending on sizeof(long) */ int cronloops; /* Number of times the cron function run */ @@ -1899,8 +1898,9 @@ struct valkeyServer { size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ - int active_defrag_cycle_min; /* minimal effort for defrag in CPU percentage */ - int active_defrag_cycle_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cpu_min; /* minimal effort for defrag in CPU percentage */ + int active_defrag_cpu_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cycle_us; /* standard duration of defrag cycle */ unsigned long active_defrag_max_scan_fields; /* maximum number of fields of set/hash/zset/list to process from within the main dict scan */ size_t client_max_querybuf_len; /* Limit for client query buffer length */ @@ -3353,7 +3353,8 @@ void bytesToHuman(char *s, size_t size, unsigned long long n); void enterExecutionUnit(int update_cached_time, long long us); void exitExecutionUnit(void); void resetServerStats(void); -void activeDefragCycle(void); +void monitorActiveDefrag(void); +void defragWhileBlocked(void); unsigned int getLRUClock(void); unsigned int LRU_CLOCK(void); const char *evictPolicyToString(void); diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 67329f03f1..abd23b1d83 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -40,7 +40,6 @@ run_solo {defrag} { proc test_active_defrag {type} { if {[string match {*jemalloc*} [s mem_allocator]] && [r debug mallctl arenas.page] <= 8192} { test "Active defrag main dictionary: $type" { - r config set hz 100 r config set activedefrag no r config set 
active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -89,6 +88,8 @@ run_solo {defrag} { r config set active-defrag-cycle-min 65 r config set active-defrag-cycle-max 75 + after 1000 ;# Give defrag time to work (might be multiple cycles) + # Wait for the active defrag to stop working. wait_for_condition 2000 100 { [s active_defrag_running] eq 0 @@ -138,12 +139,13 @@ run_solo {defrag} { r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec r debug loadaof + after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no + # measure hits and misses right after aof loading set misses [s active_defrag_misses] set hits [s active_defrag_hits] - after 120 ;# serverCron only updates the info once in 100ms set frag [s allocator_frag_ratio] set max_latency 0 foreach event [r latency latest] { @@ -181,7 +183,6 @@ run_solo {defrag} { r flushdb sync r script flush sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -203,7 +204,7 @@ run_solo {defrag} { $rd read ; # Discard script load replies $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms + after 1000 ;# give defrag some time to work if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -239,6 +240,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag time to work (might be multiple cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -266,7 +269,6 @@ run_solo {defrag} { test "Active defrag big keys: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -361,6 +363,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -407,7 +411,6 @@ run_solo {defrag} { test "Active defrag pubsub: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -430,7 +433,6 @@ run_solo {defrag} { $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -466,6 +468,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -475,6 +479,7 @@ run_solo {defrag} { puts [r memory malloc-stats] fail "defrag didn't stop." } + r config set activedefrag no ;# disable before we accidentally create more frag # test the fragmentation is lower after 120 ;# serverCron only updates the info once in 100ms @@ -507,7 +512,6 @@ run_solo {defrag} { test "Active defrag big list: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -561,6 +565,8 @@ run_solo {defrag} { fail "defrag not started." 
} + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -619,7 +625,6 @@ run_solo {defrag} { start_server {tags {"defrag"} overrides {save ""}} { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -685,6 +690,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl index e169f8de9b..6d8f55bd06 100644 --- a/tests/unit/moduleapi/defrag.tcl +++ b/tests/unit/moduleapi/defrag.tcl @@ -2,7 +2,6 @@ set testmodule [file normalize tests/modules/defragtest.so] start_server {tags {"modules"} overrides {{save ""}}} { r module load $testmodule 10000 - r config set hz 100 r config set active-defrag-ignore-bytes 1 r config set active-defrag-threshold-lower 0 r config set active-defrag-cycle-min 99 diff --git a/valkey.conf b/valkey.conf index 8d3e11c515..b997e8179b 100644 --- a/valkey.conf +++ b/valkey.conf @@ -2381,9 +2381,8 @@ rdb-save-incremental-fsync yes # Fragmentation is a natural process that happens with every allocator (but # less so with Jemalloc, fortunately) and certain workloads. Normally a server # restart is needed in order to lower the fragmentation, or at least to flush -# away all the data and create it again. However thanks to this feature -# implemented by Oran Agra, this process can happen at runtime -# in a "hot" way, while the server is running. +# away all the data and create it again. However thanks to this feature, this +# process can happen at runtime in a "hot" way, while the server is running. # # Basically when the fragmentation is over a certain level (see the # configuration options below) the server will start to create new copies of the @@ -2421,18 +2420,23 @@ rdb-save-incremental-fsync yes # Maximum percentage of fragmentation at which we use maximum effort # active-defrag-threshold-upper 100 -# Minimal effort for defrag in CPU percentage, to be used when the lower -# threshold is reached +# Minimal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the lower threshold is reached. # active-defrag-cycle-min 1 -# Maximal effort for defrag in CPU percentage, to be used when the upper -# threshold is reached +# Maximal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the upper threshold is reached. # active-defrag-cycle-max 25 # Maximum number of set/hash/zset/list fields that will be processed from # the main dictionary scan # active-defrag-max-scan-fields 1000 +# The time spent (in microseconds) of the periodic active defrag process. This +# affects the latency impact of active defrag on client commands. Smaller numbers +# will result in less latency impact at the cost of increased defrag overhead. 
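+# For example (illustrative numbers): with active-defrag-cycle-us at 500 and
+# defrag running at 25% CPU effort, each 500 microsecond burst of defrag work
+# is followed by roughly 1500 microseconds of delay before the next burst.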
+# active-defrag-cycle-us 500
+
 # Jemalloc background thread for purging will be enabled by default
 jemalloc-bg-thread yes
 

From 9f8b174c2eec4be1b6bc15745ac479c95dbd3a6b Mon Sep 17 00:00:00 2001
From: uriyage <78144248+uriyage@users.noreply.github.com>
Date: Tue, 3 Dec 2024 19:20:31 +0200
Subject: [PATCH 12/73] Optimize IO thread offload for modified argv (#1360)

### Improve expire command performance with IO threads

#### Background

In our IO threads architecture, IO threads allocate a client's argv, and
when we later free it after processCommand, we offload the free back to
the IO threads. With jemalloc, it's crucial that the same thread that
allocates memory also frees it.

For some commands we modify the client's argv in the main thread during
command processing (for example, the `SET EX` command is rewritten to use
an absolute time for replication propagation).

#### Current issues
1. When commands are rewritten (e.g., expire commands), we store the
   original argv in `c->original_argv`. However, we're currently:
   - Freeing the new argv (allocated by the main thread) in IO threads
   - Freeing the original argv (allocated by IO threads) in the main thread

2. Currently, `c->original_argv` points to a new array holding the old
   objects, while `c->argv` is the old array holding the new objects,
   making memory free management complicated.

#### Changes
1. Refactored the argv modification handling code to ensure consistency:
   both the array and the objects are now either all new or all old
2. Moved original_argv cleanup to happen in resetClient, after argv
   cleanup
3. Modified the IO threads code to properly handle original argv cleanup
   when the argv is modified.

#### Performance Impact
Benchmark with `SET EX` commands (650 clients, 512 byte value, 8 IO
threads):
- New implementation: **729,548 ops/sec**
- Old implementation: **633,243 ops/sec**

Representing a **~15%** performance improvement due to more efficient
memory handling.

---------

Signed-off-by: Uri Yagelnik
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
---
 src/blocked.c                |   1 -
 src/io_threads.c             |  18 ++---
 src/io_threads.h             |   2 +-
 src/multi.c                  |   4 ++
 src/networking.c             |  90 ++++++++++++++++--------
 src/server.c                 |   4 --
 src/unit/test_files.h        |   4 ++
 src/unit/test_networking.c   | 131 +++++++++++++++++++++++++++++++++++
 tests/unit/introspection.tcl |  26 +++++++
 9 files changed, 235 insertions(+), 45 deletions(-)
 create mode 100644 src/unit/test_networking.c

diff --git a/src/blocked.c b/src/blocked.c
index 8e1974a703..aeec560b3f 100644
--- a/src/blocked.c
+++ b/src/blocked.c
@@ -206,7 +206,6 @@ void unblockClient(client *c, int queue_for_reprocessing) {
     /* Reset the client for a new query, unless the client has pending command to process
      * or in case a shutdown operation was canceled and we are still in the processCommand sequence */
     if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) {
-        freeClientOriginalArgv(c);
         /* Clients that are not blocked on keys are not reprocessed so we must
          * call reqresAppendResponse here (for clients blocked on key,
          * unblockClientOnKey is called, which eventually calls processCommand,
diff --git a/src/io_threads.c b/src/io_threads.c
index f4471b96d0..1ebd748bc2 100644
--- a/src/io_threads.c
+++ b/src/io_threads.c
@@ -441,8 +441,8 @@ void IOThreadFreeArgv(void *data) {
 /* This function attempts to offload the client's argv to an IO thread.
  * Returns C_OK if the client's argv were successfully offloaded to an IO thread,
  * C_ERR otherwise.
*/ -int tryOffloadFreeArgvToIOThreads(client *c) { - if (server.active_io_threads_num <= 1 || c->argc == 0) { +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv) { + if (server.active_io_threads_num <= 1 || argc == 0) { return C_ERR; } @@ -456,11 +456,11 @@ int tryOffloadFreeArgvToIOThreads(client *c) { int last_arg_to_free = -1; /* Prepare the argv */ - for (int j = 0; j < c->argc; j++) { - if (c->argv[j]->refcount > 1) { - decrRefCount(c->argv[j]); + for (int j = 0; j < argc; j++) { + if (argv[j]->refcount > 1) { + decrRefCount(argv[j]); /* Set argv[j] to NULL to avoid double free */ - c->argv[j] = NULL; + argv[j] = NULL; } else { last_arg_to_free = j; } @@ -468,17 +468,17 @@ int tryOffloadFreeArgvToIOThreads(client *c) { /* If no argv to free, free the argv array at the main thread */ if (last_arg_to_free == -1) { - zfree(c->argv); + zfree(argv); return C_OK; } /* We set the refcount of the last arg to free to 0 to indicate that * this is the last argument to free. With this approach, we don't need to * send the argc to the IO thread and we can send just the argv ptr. */ - c->argv[last_arg_to_free]->refcount = 0; + argv[last_arg_to_free]->refcount = 0; /* Must succeed as we checked the free space before. */ - IOJobQueue_push(jq, IOThreadFreeArgv, c->argv); + IOJobQueue_push(jq, IOThreadFreeArgv, argv); return C_OK; } diff --git a/src/io_threads.h b/src/io_threads.h index f9a9cf762f..8818f08588 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -9,7 +9,7 @@ int inMainThread(void); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); -int tryOffloadFreeArgvToIOThreads(client *c); +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); diff --git a/src/multi.c b/src/multi.c index bcffb90912..9e1f019244 100644 --- a/src/multi.c +++ b/src/multi.c @@ -238,6 +238,10 @@ void execCommand(client *c) { c->mstate.commands[j].argv = c->argv; c->mstate.commands[j].argv_len = c->argv_len; c->mstate.commands[j].cmd = c->cmd; + + /* The original argv has already been processed for slowlog and monitor, + * so we can safely free it before proceeding to the next command. */ + freeClientOriginalArgv(c); } // restore old DENY_BLOCKING value diff --git a/src/networking.c b/src/networking.c index bbd684a3e5..debd94ddfc 100644 --- a/src/networking.c +++ b/src/networking.c @@ -1488,14 +1488,19 @@ void freeClientOriginalArgv(client *c) { /* We didn't rewrite this client */ if (!c->original_argv) return; - for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); - zfree(c->original_argv); + if (tryOffloadFreeArgvToIOThreads(c, c->original_argc, c->original_argv) == C_ERR) { + for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); + zfree(c->original_argv); + } + c->original_argv = NULL; c->original_argc = 0; } void freeClientArgv(client *c) { - if (tryOffloadFreeArgvToIOThreads(c) == C_ERR) { + /* If original_argv exists, 'c->argv' was allocated by the main thread, + * so it's more efficient to free it directly here rather than offloading to IO threads */ + if (c->original_argv || tryOffloadFreeArgvToIOThreads(c, c->argc, c->argv) == C_ERR) { for (int j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); } @@ -2545,6 +2550,7 @@ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? 
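Illustration (not part of the diff): the body of IOThreadFreeArgv is not shown
in this hunk; a sketch of how such a consumer might walk the array with no
explicit count, under the conventions set up above (NULL slots were already
freed by the main thread, refcount == 0 marks the final element). The function
name is hypothetical:

    static void freeOffloadedArgv(robj **argv) {
        int last_arg = 0;
        for (int i = 0; !last_arg; i++) {
            robj *o = argv[i];
            if (o == NULL) continue; /* already released by the main thread */
            if (o->refcount == 0) {  /* sentinel: this is the last argument */
                last_arg = 1;
                o->refcount = 1;     /* restore so decrRefCount() frees it normally */
            }
            decrRefCount(o);
        }
        zfree(argv);
    }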
c->cmd->proc : NULL; freeClientArgv(c); + freeClientOriginalArgv(c); c->cur_script = NULL; c->reqtype = 0; c->multibulklen = 0; @@ -4248,16 +4254,53 @@ void securityWarningCommand(client *c) { freeClientAsync(c); } -/* Keep track of the original command arguments so that we can generate - * an accurate slowlog entry after the command has been executed. */ -static void retainOriginalCommandVector(client *c) { - /* We already rewrote this command, so don't rewrite it again */ - if (c->original_argv) return; - c->original_argc = c->argc; - c->original_argv = zmalloc(sizeof(robj *) * (c->argc)); - for (int j = 0; j < c->argc; j++) { - c->original_argv[j] = c->argv[j]; - incrRefCount(c->argv[j]); +/* This function preserves the original command arguments for accurate slowlog recording. + * + * It performs the following operations: + * - Stores the initial command vector if not already saved + * - Manages memory allocation for command argument modifications + * + * new_argc - The new number of arguments to allocate space for if necessary. + * new_argv - Optional pointer to a new argument vector. If NULL, space will be + * allocated for new_argc arguments, preserving the existing arguments. + */ +static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) { + robj **old_argv = c->argv; + int old_argc = c->argc; + + /* Store original arguments if not already saved */ + if (!c->original_argv) { + c->original_argc = old_argc; + c->original_argv = old_argv; + } + + /* Handle direct argv replacement */ + if (new_argv) { + c->argv = new_argv; + } else if (c->original_argv == old_argv || new_argc > old_argc) { + /* Allocate new array if necessary */ + c->argv = zmalloc(sizeof(robj *) * new_argc); + + for (int i = 0; i < old_argc && i < new_argc; i++) { + c->argv[i] = old_argv[i]; + incrRefCount(c->argv[i]); + } + + /* Initialize new argument slots to NULL */ + for (int i = old_argc; i < new_argc; i++) { + c->argv[i] = NULL; + } + } + + c->argc = new_argc; + c->argv_len = new_argc; + + /* Clean up old argv if necessary */ + if (c->argv != old_argv && c->original_argv != old_argv) { + for (int i = 0; i < old_argc; i++) { + if (old_argv[i]) decrRefCount(old_argv[i]); + } + zfree(old_argv); } } @@ -4265,7 +4308,7 @@ static void retainOriginalCommandVector(client *c) { * in the slowlog. This information is stored in the * original_argv array. */ void redactClientCommandArgument(client *c, int argc) { - retainOriginalCommandVector(c); + backupAndUpdateClientArgv(c, c->argc, NULL); if (c->original_argv[argc] == shared.redacted) { /* This argument has already been redacted */ return; @@ -4298,10 +4341,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Completely replace the client command vector with the provided one. */ void replaceClientCommandVector(client *c, int argc, robj **argv) { int j; - retainOriginalCommandVector(c); - freeClientArgv(c); - c->argv = argv; - c->argc = argc; + backupAndUpdateClientArgv(c, argc, argv); c->argv_len_sum = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) c->argv_len_sum += getStringObjectLen(c->argv[j]); @@ -4322,19 +4362,9 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { * free the no longer used objects on c->argv. */ void rewriteClientCommandArgument(client *c, int i, robj *newval) { robj *oldval; - retainOriginalCommandVector(c); + int new_argc = (i >= c->argc) ? 
i + 1 : c->argc; + backupAndUpdateClientArgv(c, new_argc, NULL); - /* We need to handle both extending beyond argc (just update it and - * initialize the new element) or beyond argv_len (realloc is needed). - */ - if (i >= c->argc) { - if (i >= c->argv_len) { - c->argv = zrealloc(c->argv, sizeof(robj *) * (i + 1)); - c->argv_len = i + 1; - } - c->argc = i + 1; - c->argv[i] = NULL; - } oldval = c->argv[i]; if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); if (newval) c->argv_len_sum += getStringObjectLen(newval); diff --git a/src/server.c b/src/server.c index d77f67248c..21dca85067 100644 --- a/src/server.c +++ b/src/server.c @@ -3659,10 +3659,6 @@ void call(client *c, int flags) { replicationFeedMonitors(c, server.monitors, c->db->id, argv, argc); } - /* Clear the original argv. - * If the client is blocked we will handle slowlog when it is unblocked. */ - if (!c->flag.blocked) freeClientOriginalArgv(c); - /* Populate the per-command and per-slot statistics that we show in INFO commandstats and CLUSTER SLOT-STATS, * respectively. If the client is blocked we will handle latency stats and duration when it is unblocked. */ if (update_command_stats && !c->flag.blocked) { diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 6ab7373007..bc3eac4222 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -84,6 +84,8 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); int test_quicklistCreateList(int argc, char **argv, int flags); int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); @@ -216,6 +218,7 @@ unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, N unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", 
test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", 
test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, 
{"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; @@ -237,6 +240,7 @@ struct unitTestSuite { {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_networking.c", __test_networking_c}, {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c new file mode 100644 index 0000000000..ac042d907f --- /dev/null +++ b/src/unit/test_networking.c @@ -0,0 +1,131 @@ +#include + +#include "../networking.c" +#include "../server.c" +#include "test_help.h" + +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + + /* Test 1: Initial backup of arguments */ + c->argc = 2; + robj **initial_argv = zmalloc(sizeof(robj *) * 2); + c->argv = initial_argv; + c->argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + c->argv[1] = 
createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + c->original_argv = NULL; + + backupAndUpdateClientArgv(c, 3, NULL); + + TEST_ASSERT(c->argv != initial_argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->argc == 3); + TEST_ASSERT(c->argv_len == 3); + TEST_ASSERT(c->argv[0]->refcount == 2); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv[2] == NULL); + + /* Test 2: Direct argv replacement */ + robj **new_argv = zmalloc(sizeof(robj *) * 2); + new_argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + new_argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + + backupAndUpdateClientArgv(c, 2, new_argv); + + TEST_ASSERT(c->argv == new_argv); + TEST_ASSERT(c->argc == 2); + TEST_ASSERT(c->argv_len == 2); + TEST_ASSERT(c->original_argv != c->argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->original_argv[0]->refcount == 1); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Expanding argc */ + backupAndUpdateClientArgv(c, 4, NULL); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv_len == 4); + TEST_ASSERT(c->argv[0] != NULL); + TEST_ASSERT(c->argv[1] != NULL); + TEST_ASSERT(c->argv[2] == NULL); + TEST_ASSERT(c->argv[3] == NULL); + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->original_argc; i++) { + decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + zfree(c); + + return 0; +} + +int test_rewriteClientCommandArgument(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + c->argc = 3; + robj **initial_argv = zmalloc(sizeof(robj *) * 3); + c->argv = initial_argv; + c->original_argv = NULL; + c->argv_len_sum = 0; + + /* Initialize client with command "SET key value" */ + c->argv[0] = createStringObject("SET", 3); + robj *original_key = createStringObject("key", 3); + c->argv[1] = original_key; + c->argv[2] = createStringObject("value", 5); + c->argv_len_sum = 11; // 3 + 3 + 5 + + /* Test 1: Rewrite existing argument */ + robj *newval = createStringObject("newkey", 6); + rewriteClientCommandArgument(c, 1, newval); + + TEST_ASSERT(c->argv[1] == newval); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv_len_sum == 14); // 3 + 6 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argv[1] == original_key); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Extend argument vector */ + robj *extraval = createStringObject("extra", 5); + rewriteClientCommandArgument(c, 3, extraval); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv[3] == extraval); + TEST_ASSERT(c->argv_len_sum == 19); // 3 + 6 + 5 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + + for (int i = 0; i < c->original_argc; i++) { + if (c->original_argv[i]) decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + decrRefCount(newval); + decrRefCount(extraval); + + zfree(c); + + return 0; +} diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index d79bb1c7da..a51f07927d 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -376,6 +376,32 @@ start_server {tags {"introspection"}} { 
$rd close } + # This test verifies that MONITOR correctly records overwritten commands + # when executed within a MULTI-EXEC block. Specifically, it checks that even if + # the original SET-EX command arguments are overwritten for replica propagation, the MONITOR output + # still shows the original command. + test {MONITOR correctly records SET EX in MULTI-EXEC} { + # Start monitoring client + set rd [valkey_deferring_client] + $rd monitor + $rd read ; # Discard the OK + + # Execute multi-exec block with SET EX commands + r multi + r set "{slot}key1" value1 ex 3600 + r set "{slot}key2" value2 ex 1800 + r exec + + # Verify monitor output shows the original commands: + assert_match {*"multi"*} [$rd read] + assert_match {*"set"*"{slot}key1"*"value1"*"ex"*"3600"*} [$rd read] + assert_match {*"set"*"{slot}key2"*"value2"*"ex"*"1800"*} [$rd read] + assert_match {*"exec"*} [$rd read] + + # Clean up monitoring client + $rd close + } + test {MONITOR log blocked command only once} { # need to reconnect in order to reset the clients state reconnect From 349bc7547bf28ae304537bd6888d575e2409f25c Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Tue, 3 Dec 2024 11:19:53 -0800 Subject: [PATCH 13/73] defrag: use monotime in module interface (#1388) The recent PR (https://github.com/valkey-io/valkey/pull/1242) converted Active Defrag to use `monotime`. In that change, a conversion was performed to continue to use `ustime()` as part of the module interface. Since this time is only used internally, and never actually exposed to the module, we can convert this to use `monotime` directly. Signed-off-by: Jim Brunner --- src/defrag.c | 3 +-- src/module.c | 6 +++--- src/server.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index d0c7632f17..9c195e8959 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -905,8 +905,7 @@ static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtim } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtimeWallClock, dbid); + return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } diff --git a/src/module.c b/src/module.c index 794038beb4..4092ae6b06 100644 --- a/src/module.c +++ b/src/module.c @@ -13344,7 +13344,7 @@ const char *VM_GetCurrentCommandName(ValkeyModuleCtx *ctx) { * defrag callback. */ struct ValkeyModuleDefragCtx { - long long int endtime; + monotime endtime; unsigned long *cursor; struct serverObject *key; /* Optional name of key processed, NULL when unknown. */ int dbid; /* The dbid of the key being processed, -1 when unknown. */ @@ -13373,7 +13373,7 @@ int VM_RegisterDefragFunc(ValkeyModuleCtx *ctx, ValkeyModuleDefragFunc cb) { * so it generally makes sense to do small batches of work in between calls. */ int VM_DefragShouldStop(ValkeyModuleDefragCtx *ctx) { - return (ctx->endtime != 0 && ctx->endtime < ustime()); + return (ctx->endtime != 0 && ctx->endtime <= getMonotonicUs()); } /* Store an arbitrary cursor value for future re-use. @@ -13455,7 +13455,7 @@ ValkeyModuleString *VM_DefragValkeyModuleString(ValkeyModuleDefragCtx *ctx, Valk * Returns a zero value (and initializes the cursor) if no more needs to be done, * or a non-zero value otherwise. 
*/ -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid) { +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid) { moduleValue *mv = value->ptr; moduleType *mt = mv->type; diff --git a/src/server.h b/src/server.h index 0aac1acbd8..896ff735b3 100644 --- a/src/server.h +++ b/src/server.h @@ -2732,7 +2732,7 @@ size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); From 105509cdad1e667cd15ad751bf8b918d9ca1ca06 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 4 Dec 2024 06:09:56 +0800 Subject: [PATCH 14/73] Run RDMA builtin in CI workflow (#1380) Since 4695d118dd (#1209), RDMA supports builtin. And module connection type may be removed in future. So run a builtin RDMA support for CI workflow. RDMA module is complied only in CI, keep it building check only until module connection type gets obsolete. Signed-off-by: zhenwei pi --- .github/workflows/ci.yml | 10 +++++++--- tests/rdma/run.py | 3 +-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fec424cee..df3eaa1905 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,10 +77,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make + - name: prepare-development-libraries + run: sudo apt-get install librdmacm-dev libibverbs-dev + - name: make-rdma-module + run: make -j4 BUILD_RDMA=module + - name: make-rdma-builtin run: | - sudo apt-get install librdmacm-dev libibverbs-dev - make -j4 BUILD_RDMA=module + make distclean + make -j4 BUILD_RDMA=yes - name: clone-rxe-kmod run: | mkdir -p tests/rdma/rxe diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 09168f368a..77e0f285fe 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -60,10 +60,9 @@ def test_rdma(ipaddr): # step 2, start server svrpath = valkeydir + "/src/valkey-server" - rdmapath = valkeydir + "/src/valkey-rdma.so" svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", "yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", - "--loadmodule", rdmapath, "--rdma-port", "6379", "--rdma-bind", ipaddr] + "--rdma-port", "6379", "--rdma-bind", ipaddr] svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE) try: From a401e3789d58c4d41769c3099a5f1cc009130994 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 4 Dec 2024 10:33:14 -0800 Subject: [PATCH 15/73] Update code of conduct maintainers email address (#1391) Updating code of conduct maintainer's email address Signed-off-by: Madelyn Olson --- CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 1c530ec7ba..36764bb81b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -49,7 +49,7 @@ representative at an online or offline event. 
## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
-this email address: placeholderkv@gmail.com.
+this email address: maintainers@lists.valkey.io.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

From 71560a2a4a1a73085dba9a8ea8f835c371358cfa Mon Sep 17 00:00:00 2001
From: Wen Hui
Date: Thu, 5 Dec 2024 11:58:24 -0500
Subject: [PATCH 16/73] Add API UpdateRuntimeArgs for updating the module
 arguments during runtime (#1041)

Before Redis OSS 7, if we loaded a module with some arguments at runtime and
then ran the command "config rewrite", the module information would not be
saved into the config file.

Since Redis OSS 7 and Valkey 7.2, if we load a module with some arguments at
runtime, the module information (path, number of arguments, and argument
values) can be saved into the config file after the config rewrite command is
called. Thus, the module will be loaded automatically the next time the server
starts up. The following is one example:

bind 172.25.0.58
port 7000
protected-mode no
enable-module-command yes

# Generated by CONFIG REWRITE
latency-tracking-info-percentiles 50 99 99.9
dir "/home/ubuntu/valkey"
save 3600 1 300 100 60 10000
user default on nopass sanitize-payload ~* &* +@all
loadmodule tests/modules/datatype.so 10 20

However, there is one problem. If a module developer updates the running
arguments in some way, the updated arguments cannot be saved into the config
file even when "config rewrite" is called. The reason comes from the following
function, rewriteConfigLoadmoduleOption (src/config.c):

void rewriteConfigLoadmoduleOption(struct rewriteConfigState *state) {
    ..........
    struct ValkeyModule *module = dictGetVal(de);
    line = sdsnew("loadmodule ");
    line = sdscatsds(line, module->loadmod->path);
    for (int i = 0; i < module->loadmod->argc; i++) {
        line = sdscatlen(line, " ", 1);
        line = sdscatsds(line, module->loadmod->argv[i]->ptr);
    }
    rewriteConfigRewriteLine(state, "loadmodule", line, 1);
    .......
}

The function only saves the initial argument information (module->loadmod)
into the config file. After discussion among the core members (ref
https://github.com/valkey-io/valkey/issues/1177), we decided to add the
following API to implement this feature:

Original proposal:
int VM_UpdateRunTimeArgs(ValkeyModuleCtx *ctx, int index, char *value);

Updated proposal:
ValkeyModuleString **VM_GetRuntimeArgs(ValkeyModuleCtx *ctx);
int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, int argc, ValkeyModuleString **values);

Why we do not recommend the following way:

MODULE UNLOAD
Update module args in the conf file
MODULE LOAD

I think there are the following disadvantages:

1. Some modules cannot be unloaded, such as the example module datatype.so
   (tests/modules/datatype.so).
2. It is not an atomic operation for MODULE UNLOAD + MODULE LOAD.
3.
Sometimes, if we just unload the module, in-flight client traffic could be
interrupted.

---------

Signed-off-by: hwware
---
 src/module.c                           | 22 ++++++++++++++++++++
 src/valkeymodule.h                     |  2 ++
 tests/modules/Makefile                 |  1 +
 tests/modules/moduleparameter.c        | 28 ++++++++++++++++++++++++++
 tests/unit/moduleapi/moduleconfigs.tcl | 13 +++++++++++-
 5 files changed, 65 insertions(+), 1 deletion(-)
 create mode 100644 tests/modules/moduleparameter.c

diff --git a/src/module.c b/src/module.c
index 4092ae6b06..5f9dff0402 100644
--- a/src/module.c
+++ b/src/module.c
@@ -2255,6 +2255,27 @@ int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd) {
     return (cp->module == module_handle);
 }

+/* ValkeyModule_UpdateRuntimeArgs can be used to update the module argument values.
+ * The function parameter 'argc' indicates the number of updated arguments, and 'argv'
+ * represents the values of the updated arguments.
+ * Once the 'CONFIG REWRITE' command is called, the updated argument values can be
+ * saved into the conf file.
+ *
+ * The function always returns VALKEYMODULE_OK. */
+int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) {
+    struct moduleLoadQueueEntry *loadmod = ctx->module->loadmod;
+    for (int i = 0; i < loadmod->argc; i++) {
+        decrRefCount(loadmod->argv[i]);
+    }
+    zfree(loadmod->argv);
+    loadmod->argv = argc - 1 ? zmalloc(sizeof(robj *) * (argc - 1)) : NULL;
+    loadmod->argc = argc - 1;
+    for (int i = 1; i < argc; i++) {
+        loadmod->argv[i - 1] = argv[i];
+        incrRefCount(loadmod->argv[i - 1]);
+    }
+    return VALKEYMODULE_OK;
+}
+
 /* --------------------------------------------------------------------------
  * ## Module information and time measurement
  * -------------------------------------------------------------------------- */
@@ -13560,6 +13581,7 @@ void moduleRegisterCoreAPI(void) {
     REGISTER_API(SetModuleAttribs);
     REGISTER_API(IsModuleNameBusy);
     REGISTER_API(WrongArity);
+    REGISTER_API(UpdateRuntimeArgs);
     REGISTER_API(ReplyWithLongLong);
     REGISTER_API(ReplyWithError);
     REGISTER_API(ReplyWithErrorFormat);
diff --git a/src/valkeymodule.h b/src/valkeymodule.h
index c2cdb2f0e7..7c3adfd477 100644
--- a/src/valkeymodule.h
+++ b/src/valkeymodule.h
@@ -967,6 +967,7 @@ VALKEYMODULE_API void (*ValkeyModule_SetModuleAttribs)(ValkeyModuleCtx *ctx, con
    VALKEYMODULE_ATTR;
VALKEYMODULE_API int (*ValkeyModule_IsModuleNameBusy)(const char *name) VALKEYMODULE_ATTR;
VALKEYMODULE_API int (*ValkeyModule_WrongArity)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR;
+VALKEYMODULE_API int (*ValkeyModule_UpdateRuntimeArgs)(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) VALKEYMODULE_ATTR;
VALKEYMODULE_API int (*ValkeyModule_ReplyWithLongLong)(ValkeyModuleCtx *ctx, long long ll) VALKEYMODULE_ATTR;
VALKEYMODULE_API int (*ValkeyModule_GetSelectedDb)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR;
VALKEYMODULE_API int (*ValkeyModule_SelectDb)(ValkeyModuleCtx *ctx, int newid) VALKEYMODULE_ATTR;
@@ -1673,6 +1674,7 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
    VALKEYMODULE_GET_API(SetModuleAttribs);
    VALKEYMODULE_GET_API(IsModuleNameBusy);
    VALKEYMODULE_GET_API(WrongArity);
+    VALKEYMODULE_GET_API(UpdateRuntimeArgs);
    VALKEYMODULE_GET_API(ReplyWithLongLong);
    VALKEYMODULE_GET_API(ReplyWithError);
    VALKEYMODULE_GET_API(ReplyWithErrorFormat);
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index 1690b9b627..82813bb6f7 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -58,6 +58,7 @@ TEST_MODULES = \
    eventloop.so \
    moduleconfigs.so \
moduleconfigstwo.so \ + moduleparameter.so \ publish.so \ usercall.so \ postnotifications.so \ diff --git a/tests/modules/moduleparameter.c b/tests/modules/moduleparameter.c new file mode 100644 index 0000000000..6c110f2cfb --- /dev/null +++ b/tests/modules/moduleparameter.c @@ -0,0 +1,28 @@ +#include "valkeymodule.h" +#include +#include +#include +#include + +int test_module_update_parameter(ValkeyModuleCtx *ctx, + ValkeyModuleString **argv, int argc) { + + ValkeyModule_UpdateRuntimeArgs(ctx, argv, argc); + return ValkeyModule_ReplyWithSimpleString(ctx, "OK"); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "moduleparameter", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + if (ValkeyModule_CreateCommand(ctx, "testmoduleparameter.update.parameter", + test_module_update_parameter, "fast", 0, 0, + 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 44f994d2d0..54de5f2611 100644 --- a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -1,5 +1,7 @@ set testmodule [file normalize tests/modules/moduleconfigs.so] set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so] +set testmoduleparameter [file normalize tests/modules/moduleparameter.so] + start_server {tags {"modules"}} { r module load $testmodule @@ -243,5 +245,14 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.memory_numeric] "moduleconfigs.memory_numeric 1024" } } -} + test {Module Update Args} { + r module load $testmoduleparameter 10 20 30 + set t [r module list] + set modulename [lmap x [r module list] {dict get $x name}] + assert_not_equal [lsearch $modulename moduleparameter] -1 + assert_equal "{10 20 30}" [lmap x [r module list] {dict get $x args}] + assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70] + assert_equal "{40 50 60 70}" [lmap x [r module list] {dict get $x args}] + } +} From 6b3e1228cd043ebd35eec9c4354c933d5a8f968c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=A3=8E=E5=8E=BB=E5=B9=BD=E5=A2=A8?= <43802771+fengquyoumo@users.noreply.github.com> Date: Fri, 6 Dec 2024 01:26:56 +0800 Subject: [PATCH 17/73] RDMA: Fix dead loop when transfer large data (20KB) (#1386) Determine the status of the Client when attempting to read data. If state=CLIENT_COMPLETED_IO, no read attempt is made and I/O operations on the Client are rescheduled by the main thread. 
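As a hedged illustration of the guard this fix introduces, here is a minimal
sketch using the flag added in the rdma.c diff below; the function itself is
illustrative and not part of the patch:

/* Sketch only: the real checks live in connRdmaEventHandler and
 * rdmaProcessPendingData (see the diff below). While the main thread has
 * postponed state updates for this connection, the poller must not drive
 * the read/write handlers itself, or it can spin on a buffer the main
 * thread has not consumed yet. */
static void rdmaPollOneSketch(rdma_connection *rdma_conn) {
    if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) {
        return; /* the main thread will resume I/O via update_state */
    }
    /* ... otherwise drain rx and flush tx as usual ... */
}

The main thread clears the flag through the new postpone_update_state hook and
then calls update_state (updateRdmaState below), which re-installs the
read/write handlers and re-runs the event handler.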
> And 20474 Byte = PROTO_IOBUF_LEN(16KB) + SDS_HDR_VAR(16, s)(4090 Byte) Fixes #1385 --------- Signed-off-by: fengquyoumo <1455117463@qq.com> --- src/rdma.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/rdma.c b/src/rdma.c index de7ea396a1..7fe65ad2d2 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -77,9 +77,12 @@ typedef enum ValkeyRdmaOpcode { #define VALKEY_RDMA_INVALID_OPCODE 0xffff #define VALKEY_RDMA_KEEPALIVE_MS 3000 +#define RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 0) + typedef struct rdma_connection { connection c; struct rdma_cm_id *cm_id; + int flags; int last_errno; listNode *pending_list_node; } rdma_connection; @@ -693,7 +696,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* uplayer should read all */ - while (ctx->rx.pos < ctx->rx.offset) { + while (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->rx.pos < ctx->rx.offset) { if (conn->read_handler && (callHandler(conn, conn->read_handler) == C_ERR)) { return; } @@ -705,7 +708,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* RDMA comp channel has no POLLOUT event, try to send remaining buffer */ - if ((ctx->tx.offset < ctx->tx.length) && conn->write_handler) { + if (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->tx.offset < ctx->tx.length && conn->write_handler) { callHandler(conn, conn->write_handler); } } @@ -884,6 +887,9 @@ static void connRdmaAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m } static int connRdmaSetRwHandler(connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) return C_OK; + /* IB channel only has POLLIN event */ if (conn->read_handler || conn->write_handler) { if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE, conn->type->ae_handler, conn) == AE_ERR) { @@ -1721,12 +1727,12 @@ static int rdmaProcessPendingData(void) { listNode *ln; rdma_connection *rdma_conn; connection *conn; - int processed; + int processed = 0; - processed = listLength(pending_list); listRewind(pending_list, &li); while ((ln = listNext(&li))) { rdma_conn = listNodeValue(ln); + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) continue; conn = &rdma_conn->c; /* a connection can be disconnected by remote peer, CM event mark state as CONN_STATE_CLOSED, kick connection @@ -1741,15 +1747,32 @@ static int rdmaProcessPendingData(void) { callHandler(conn, conn->write_handler); } + ++processed; continue; } connRdmaEventHandler(NULL, -1, rdma_conn, 0); + ++processed; } return processed; } +static void postPoneUpdateRdmaState(struct connection *conn, int postpone) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (postpone) { + rdma_conn->flags |= RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } else { + rdma_conn->flags &= ~RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } +} + +static void updateRdmaState(struct connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + connRdmaSetRwHandler(conn); + connRdmaEventHandler(NULL, -1, rdma_conn, 0); +} + static ConnectionType CT_RDMA = { /* connection type */ .get_type = connRdmaGetType, @@ -1792,6 +1815,8 @@ static ConnectionType CT_RDMA = { /* pending data */ .has_pending_data = rdmaHasPendingData, .process_pending_data = rdmaProcessPendingData, + .postpone_update_state = postPoneUpdateRdmaState, + .update_state = updateRdmaState, }; ConnectionType *connectionTypeRdma(void) { From 
6df376d68a97e9c0da4549f57db96742b5482202 Mon Sep 17 00:00:00 2001
From: Caiyi Wu <53631337+Codebells@users.noreply.github.com>
Date: Fri, 6 Dec 2024 03:01:38 +0800
Subject: [PATCH 18/73] Fix coredump when use hellodict example module (#1395)

In the ValkeyModule_OnLoad method of the file hellodict.c, the parameter
keystep of ValkeyModule_CreateCommand should be 1. Otherwise, executing the
command will coredump:

MODULE LOAD /home/tiger/valkey/src/modules/hellodict.so
COMMAND GETKEYS HELLODICT.SET key value

Signed-off-by: Codebells <1347103071@qq.com>
---
 src/modules/hellodict.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/modules/hellodict.c b/src/modules/hellodict.c
index e0af06ba2f..db2fd17e8a 100644
--- a/src/modules/hellodict.c
+++ b/src/modules/hellodict.c
@@ -109,13 +109,13 @@ int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int arg
     if (ValkeyModule_Init(ctx, "hellodict", 1, VALKEYMODULE_APIVER_1) == VALKEYMODULE_ERR)
         return VALKEYMODULE_ERR;

-    if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 0) == VALKEYMODULE_ERR)
+    if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 1) == VALKEYMODULE_ERR)
         return VALKEYMODULE_ERR;

-    if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 0) == VALKEYMODULE_ERR)
+    if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 1) == VALKEYMODULE_ERR)
         return VALKEYMODULE_ERR;

-    if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 0) == VALKEYMODULE_ERR)
+    if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 1) == VALKEYMODULE_ERR)
         return VALKEYMODULE_ERR;

     /* Create our global dictionary. Here we'll set our keys and values.
 */

From a2fe6af457e353425d39c858b8cf68f1b4d6a9b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Sat, 7 Dec 2024 10:25:40 +0100
Subject: [PATCH 19/73] Fix Module Update Args test when other modules are
 loaded (#1403)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #1400

Signed-off-by: Viktor Söderqvist
---
 tests/unit/moduleapi/moduleconfigs.tcl | 22 +++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl
index 54de5f2611..2474ad3567 100644
--- a/tests/unit/moduleapi/moduleconfigs.tcl
+++ b/tests/unit/moduleapi/moduleconfigs.tcl
@@ -2,6 +2,14 @@ set testmodule [file normalize tests/modules/moduleconfigs.so]
 set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so]
 set testmoduleparameter [file normalize tests/modules/moduleparameter.so]

+proc module_get_args {mod} {
+    foreach line [r module list] {
+        if {[dict get $line name] eq $mod} {
+            return [dict get $line args]
+        }
+    }
+    throw error {module not found}
+}

 start_server {tags {"modules"}} {
     r module load $testmodule
@@ -246,13 +254,13 @@ start_server {tags {"modules"}} {
         }
     }
     test {Module Update Args} {
-       r module load $testmoduleparameter 10 20 30
-       set t [r module list]
-       set modulename [lmap x [r module list] {dict get $x name}]
-       assert_not_equal [lsearch $modulename moduleparameter] -1
-       assert_equal "{10 20 30}" [lmap x [r module list] {dict get $x args}]
-       assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70]
-       assert_equal "{40 50 60 70}" [lmap x [r module list] {dict get $x args}]
+        r module load $testmoduleparameter 10 20 30
+        set t [r module list]
+        set modulename [lmap x [r module list] {dict get $x name}]
+        assert_not_equal [lsearch $modulename moduleparameter] -1
+        assert_equal {10 20 30} [module_get_args moduleparameter]
+        assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70]
+        assert_equal {40 50 60 70} [module_get_args moduleparameter]
     }
 }

From f20d629dbe31d31eb82e360f9da4ef94ba9aabdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?=
Date: Sat, 7 Dec 2024 10:26:31 +0100
Subject: [PATCH 20/73] Fix sanitizer builds with clang (#1402)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By placing the offending include after the other includes in the unit test,
we can avoid redefining a macro, which led to a build failure.

Fixes #1394

---------

Signed-off-by: Viktor Söderqvist
---
 src/unit/test_networking.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c
index ac042d907f..566583bcc5 100644
--- a/src/unit/test_networking.c
+++ b/src/unit/test_networking.c
@@ -1,9 +1,9 @@
-#include
-
 #include "../networking.c"
 #include "../server.c"
 #include "test_help.h"

+#include
+
 int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) {
     UNUSED(argc);
     UNUSED(argv);
     UNUSED(flags);

From 176fafcaf71793efdadefba8e49ef711748b0c20 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sun, 8 Dec 2024 20:28:14 +0800
Subject: [PATCH 21/73] Add a note to conf about the dangers of modifying dir
 at runtime (#887)

We've had security issues in the past with it, which is why we marked it as
PROTECTED. But modifying it during runtime is also a dangerous action. For
example, when child processes are running, persistent temp files and log files
may have unexpected effects.
A scenario for modifying dir at runtime is migrating away from a disk failure,
such as using disk-based replication to migrate a node, writing nodes.conf to
save the cluster configuration.

We decided to leave it as is and add a note in the conf about the dangers of
modifying dir at runtime.

Signed-off-by: Binbin
---
 tests/unit/introspection.tcl | 7 +++++++
 valkey.conf                  | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl
index a51f07927d..47490a295b 100644
--- a/tests/unit/introspection.tcl
+++ b/tests/unit/introspection.tcl
@@ -980,6 +980,13 @@ start_server {tags {"introspection"}} {
         }
     } {} {external:skip}

+    test {valkey-server command line arguments - dir multiple times} {
+        start_server {config "default.conf" args {--dir "./" --dir "./"}} {
+            r config get dir
+            assert_equal {PONG} [r ping]
+        }
+    } {} {external:skip}
+
     # Config file at this point is at a weird state, and includes all
     # known keywords. Might be a good idea to avoid adding tests here.
 }
diff --git a/valkey.conf b/valkey.conf
index b997e8179b..e23aea39de 100644
--- a/valkey.conf
+++ b/valkey.conf
@@ -582,6 +582,9 @@ rdb-del-sync-files no

 # The working directory.
 #
+# The server log is written relative to this directory, if the 'logfile'
+# configuration directive is a relative path.
+#
 # The DB will be written inside this directory, with the filename specified
 # above using the 'dbfilename' configuration directive.
 #
@@ -591,6 +594,9 @@ rdb-del-sync-files no
 # 'cluster-config-file' configuration directive is a relative path.
 #
 # Note that you must specify a directory here, not a file name.
+# Note that modifying 'dir' during runtime may have unexpected behavior;
+# for example, when a child process is running, related file operations may
+# have unexpected effects.
dir ./ ################################# REPLICATION ################################# From e8078b7315250dc052b4020a4ea73471a8c0e4a9 Mon Sep 17 00:00:00 2001 From: Guillaume Koenig <106696198+knggk@users.noreply.github.com> Date: Sun, 8 Dec 2024 07:30:07 -0500 Subject: [PATCH 22/73] Allow MEMORY MALLOC-STATS and MEMORY PURGE during loading phase (#1317) - Enable investigation of memory issues during loading - Previously, all memory commands were rejected with LOADING error (except memory help) - `MEMORY MALLOC-STATS` and `MEMORTY PURGE` are now allowed as they don't depend on the dataset - `MEMORY STATS` and `MEMORY USAGE KEY` remain disallowed Fixes #1299 Signed-off-by: Guillaume Koenig Signed-off-by: Binbin Co-authored-by: Binbin --- src/commands.def | 4 +-- src/commands/memory-malloc-stats.json | 3 ++ src/commands/memory-purge.json | 3 ++ tests/unit/introspection.tcl | 43 +++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/src/commands.def b/src/commands.def index ecc77126af..1ac2368ee1 100644 --- a/src/commands.def +++ b/src/commands.def @@ -7320,8 +7320,8 @@ struct COMMAND_ARG MEMORY_USAGE_Args[] = { struct COMMAND_STRUCT MEMORY_Subcommands[] = { {MAKE_CMD("doctor","Outputs a memory problems report.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_DOCTOR_History,0,MEMORY_DOCTOR_Tips,3,memoryCommand,2,0,0,MEMORY_DOCTOR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_HELP_History,0,MEMORY_HELP_Tips,0,memoryCommand,2,CMD_LOADING|CMD_STALE,0,MEMORY_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, -{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,0,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,CMD_LOADING,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,CMD_LOADING,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, {MAKE_CMD("stats","Returns details about memory usage.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_STATS_History,0,MEMORY_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_STATS_Keyspecs,0,NULL,0)}, {MAKE_CMD("usage","Estimates the memory usage of a key.","O(N) where N is the number of samples.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_USAGE_History,0,MEMORY_USAGE_Tips,0,memoryCommand,-3,CMD_READONLY,0,MEMORY_USAGE_Keyspecs,1,NULL,2),.args=MEMORY_USAGE_Args}, {0} diff --git a/src/commands/memory-malloc-stats.json b/src/commands/memory-malloc-stats.json index 5ef6a31c40..af5d439744 100644 --- a/src/commands/memory-malloc-stats.json +++ 
b/src/commands/memory-malloc-stats.json
@@ -12,6 +12,9 @@
         "REQUEST_POLICY:ALL_SHARDS",
         "RESPONSE_POLICY:SPECIAL"
     ],
+    "command_flags": [
+        "LOADING"
+    ],
     "reply_schema": {
         "type": "string",
         "description": "The memory allocator's internal statistics report."
diff --git a/src/commands/memory-purge.json b/src/commands/memory-purge.json
index 77ed61dc5b..aea3e2d24a 100644
--- a/src/commands/memory-purge.json
+++ b/src/commands/memory-purge.json
@@ -11,6 +11,9 @@
         "REQUEST_POLICY:ALL_SHARDS",
         "RESPONSE_POLICY:ALL_SUCCEEDED"
     ],
+    "command_flags": [
+        "LOADING"
+    ],
     "reply_schema": {
         "const": "OK"
     }
diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl
index 47490a295b..bafc46d4b7 100644
--- a/tests/unit/introspection.tcl
+++ b/tests/unit/introspection.tcl
@@ -1042,6 +1042,49 @@ test {config during loading} {
     }
 } {} {external:skip}

+test {MEMORY commands during loading} {
+    start_server [list overrides [list key-load-delay 50 loading-process-events-interval-bytes 1024]] {
+        # Set up some initial data
+        r debug populate 100000 key 1000
+
+        # Save and restart
+        r save
+        restart_server 0 false false
+
+        # At this point, keys are loaded one at a time, busy looping 50usec
+        # between each. Further, other events are processed every 1024 bytes
+        # of RDB. We're sending all our commands deferred, so they have a
+        # chance to be processed all at once between loading two keys.
+
+        set rd [valkey_deferring_client]
+
+        # Allowed during loading
+        $rd memory help
+        $rd memory malloc-stats
+        $rd memory purge
+
+        # Disallowed during loading (because directly dependent on the dataset)
+        $rd memory doctor
+        $rd memory stats
+        $rd memory usage key:1
+
+        # memory help
+        assert_match {{MEMORY *}} [$rd read]
+        # memory malloc-stats
+        assert_match {*alloc*} [$rd read]
+        # memory purge
+        assert_match OK [$rd read]
+        # memory doctor
+        assert_error {*LOADING*} {$rd read}
+        # memory stats
+        assert_error {*LOADING*} {$rd read}
+        # memory usage key:1
+        assert_error {*LOADING*} {$rd read}
+
+        $rd close
+    }
+} {} {external:skip}
+
 test {CONFIG REWRITE handles rename-command properly} {
     start_server {tags {"introspection"} overrides {rename-command {flushdb badger}}} {
         assert_error {ERR unknown command*} {r flushdb}

From b09db3ef788896f7192b068b1089c11b761ed3fe Mon Sep 17 00:00:00 2001
From: Roman Gershman
Date: Mon, 9 Dec 2024 10:01:43 +0200
Subject: [PATCH 23/73] Fix typo in streams seen-time / active-time test (#1409)

This variable name is wrong; it causes the wrong variable to be asserted.
Signed-off-by: Roman Gershman --- tests/unit/type/stream-cgroups.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index d934e48140..d736b9cdb7 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -944,7 +944,7 @@ start_server { # Simulate loading from RDB - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] set prev_seen [dict get $consumer seen-time] @@ -954,7 +954,7 @@ start_server { r DEL mystream r RESTORE mystream 0 $dump - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] assert_equal $prev_seen [dict get $consumer seen-time] From 924729eb1695a8a5913fe32531a8d520560fe70b Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 9 Dec 2024 16:19:02 +0800 Subject: [PATCH 24/73] Fix the election was reset wrongly before failover epoch was obtained (#1339) After #1009, we will reset the election when we received a claim with an equal or higher epoch since a node can win an election in the past. But we need to consider the time before the node actually obtains the failover_auth_epoch. The failover_auth_epoch default is 0, so before the node actually get the failover epoch, we might wrongly reset the election. This is probably harmless, but will produce misleading log output and may delay election by a cron cycle or beforesleep. Now we will only reset the election when a node is actually obtains the failover epoch. Signed-off-by: Binbin --- src/cluster_legacy.c | 3 ++- tests/unit/cluster/failover2.tcl | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6ea8eb2e67..50a8ffca38 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3149,7 +3149,8 @@ int clusterProcessPacket(clusterLink *link) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); - if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + sender->configEpoch >= server.cluster->failover_auth_epoch) { /* Another node has claimed an epoch greater than or equal to ours. * If we have an ongoing election, reset it because we cannot win * with an epoch smaller than or equal to the incoming claim. This diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 21c4f4a678..9262049e4e 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -86,6 +86,11 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval fail "No failover detected" } + # Make sure there is no false epoch 0. + verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0 + # Make sure there is no failover timeout. 
verify_no_log_message -7 "*Failover attempt expired*" 0
verify_no_log_message -8 "*Failover attempt expired*" 0

From 5be4ce6d27c0fb8c046508ff04016a1395ca9d5e Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Mon, 9 Dec 2024 16:48:46 +0200
Subject: [PATCH 25/73] Optimize ZRANK to avoid path comparisons (#1389)

ZRANK is a widely used command for workloads using sorted sets. For example,
in leaderboards it enables querying the specific rank of a player. The way
ZRANK is currently implemented is:

1. Locate the element in the SortedSet hashtable.
2. Take the score of the element and use it in order to locate the element in
   the SkipList (when listpack encoding is not used).
3. During the SkipList scan for the element, keep the path and use it in order
   to sum the span in each path node and so calculate the element's rank.

One problem with this approach is that it involves multiple compare operations
in order to locate the element. Specifically, string comparison can be
expensive, since it requires accessing multiple memory locations for the items
the element string is compared against. Perf analysis showed this can take up
to 20% of the rank scan time. (TBD - provide the perf results for example)

We can improve the rank search by taking advantage of the fact that the
element node in the skiplist is pointed to by the hashtable value! Our
SkipList implementation is using FatKeys, where each added node is assigned a
randomly chosen height. Say we keep a height record for every skiplist
element. In order to get an element's rank we simply:

1. Locate the element in the SortedSet hashtable.
2. Go directly to the node in the skiplist.
3. Jump to the full height of the node and take the span value.
4. Continue going forward, always jumping to the highest point in each node we
   get to, making sure to sum all the spans.
5. Take off the summed spans from the SkipList length and we now have the
   specific node rank. :)

In order to test this method I created several benchmarks. All benchmarks used
the same seeds and the lists contained 1M elements. Since a very important
factor is the number of scores compared to the number of elements (a small
ratio means more string compares during searches), each benchmark test used a
different number of scores (1, 10K, 100K, 1M). Some results:

**TPS**

Scores range | non-optimized | optimized | gain
-- | -- | -- | --
1 | 416042 | 605363 | 45.51%
10K | 359776 | 459200 | 27.63%
100K | 380387 | 459157 | 20.71%
1M | 416059 | 450853 | 8.36%

**Latency**

Scores range | non-optimized | optimized | gain
-- | -- | -- | --
1 | 1.191000 | 0.831000 | -30.23%
10K | 1.383000 | 1.095000 | -20.82%
100K | 1.311000 | 1.087000 | -17.09%
1M | 1.191000 | 1.119000 | -6.05%

### Memory efficiency

Adding another field to each skiplist node can cause degradation in memory
efficiency for large sorted sets. We use the fact that the level 0 recorded
span of ALL nodes can either be 1 or zero (for the last node). So we use
wrappers in order to get a node span and override the span for level 0 to hold
the node height.
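To make steps 2-5 concrete, here is a minimal sketch of the walk, using the
zslGetNodeHeight/zslGetNodeSpanAtLevel helpers introduced in the diff below.
The function body is illustrative rather than the merged implementation, and
it assumes the usual skiplist invariant that the span stored at a level whose
forward pointer is NULL counts the remaining elements up to the tail (the
level 0 wrapper below returns 0 for the last node, which is consistent):

/* Hedged sketch (not the merged code): compute the rank of a node that was
 * resolved directly from the hashtable value, without any key comparisons. */
static unsigned long zslGetRankByNodeSketch(zskiplist *zsl, zskiplistNode *x) {
    unsigned long spans_to_tail = 0;
    while (x) {
        int top = zslGetNodeHeight(x) - 1; /* jump at this node's own top level */
        spans_to_tail += zslGetNodeSpanAtLevel(x, top);
        x = x->level[top].forward;
    }
    /* 1-based rank; ZRANK replies are 0-based, so callers subtract one. */
    return zsl->length - spans_to_tail;
}

Because every hop is taken at the tallest level of the node reached, the
expected number of hops stays logarithmic in the list length, and no element
strings are compared along the way.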
--------- Signed-off-by: Ran Shidlansik --- src/server.h | 4 ++ src/t_zset.c | 104 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/src/server.h b/src/server.h index 896ff735b3..44de6eada1 100644 --- a/src/server.h +++ b/src/server.h @@ -1449,6 +1449,10 @@ typedef struct zskiplistNode { struct zskiplistNode *backward; struct zskiplistLevel { struct zskiplistNode *forward; + /* At each level we keep the span, which is the number of elements which are on the "subtree" + * from this node at this level to the next node at the same level. + * One exception is the value at level 0. In level 0 the span can only be 1 or 0 (in case the last elements in the list) + * So we use it in order to hold the height of the node, which is the number of levels. */ unsigned long span; } level[]; } zskiplistNode; diff --git a/src/t_zset.c b/src/t_zset.c index a1e71208cb..36a9bfffb1 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -72,12 +72,51 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap); zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); +static inline unsigned long zslGetNodeSpanAtLevel(zskiplistNode *x, int level) { + /* We use the level 0 span in order to hold the node height, so in case the span is requested on + * level 0 and this is not the last node we return 1 and 0 otherwise. For the rest of the levels we just return + * the recorded span in that level. */ + if (level > 0) return x->level[level].span; + return x->level[level].forward ? 1 : 0; +} + +static inline void zslSetNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long span) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span = span; +} + +static inline void zslIncrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long incr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span += incr; +} + +static inline void zslDecrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long decr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span -= decr; +} + +static inline unsigned long zslGetNodeHeight(zskiplistNode *x) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + return x->level[0].span; +} + +static inline void zslSetNodeHeight(zskiplistNode *x, int height) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + x->level[0].span = height; +} + /* Create a skiplist node with the specified number of levels. * The SDS string 'ele' is referenced by the node after the call. 
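 * (With this patch the node's height is also recorded at creation time,
 * stored in the level-0 span via zslSetNodeHeight() as described above.)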
*/ -zskiplistNode *zslCreateNode(int level, double score, sds ele) { - zskiplistNode *zn = zmalloc(sizeof(*zn) + level * sizeof(struct zskiplistLevel)); +zskiplistNode *zslCreateNode(int height, double score, sds ele) { + zskiplistNode *zn = zmalloc(sizeof(*zn) + height * sizeof(struct zskiplistLevel)); zn->score = score; zn->ele = ele; + zslSetNodeHeight(zn, height); return zn; } @@ -147,7 +186,7 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { - rank[i] += x->level[i].span; + rank[i] += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -161,9 +200,10 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { for (i = zsl->level; i < level; i++) { rank[i] = 0; update[i] = zsl->header; - update[i]->level[i].span = zsl->length; + zslSetNodeSpanAtLevel(update[i], i, zsl->length); } zsl->level = level; + zslSetNodeHeight(zsl->header, level); } x = zslCreateNode(level, score, ele); for (i = 0; i < level; i++) { @@ -171,13 +211,13 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { update[i]->level[i].forward = x; /* update span covered by update[i] as x is inserted here */ - x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]); - update[i]->level[i].span = (rank[0] - rank[i]) + 1; + zslSetNodeSpanAtLevel(x, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); + zslSetNodeSpanAtLevel(update[i], i, (rank[0] - rank[i]) + 1); } /* increment span for untouched levels */ for (i = level; i < zsl->level; i++) { - update[i]->level[i].span++; + zslIncrNodeSpanAtLevel(update[i], i, 1); } x->backward = (update[0] == zsl->header) ? NULL : update[0]; @@ -195,10 +235,10 @@ void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == x) { - update[i]->level[i].span += x->level[i].span - 1; + zslIncrNodeSpanAtLevel(update[i], i, zslGetNodeSpanAtLevel(x, i) - 1); update[i]->level[i].forward = x->level[i].forward; } else { - update[i]->level[i].span -= 1; + zslDecrNodeSpanAtLevel(update[i], i, 1); } } if (x->level[0].forward) { @@ -336,7 +376,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -348,7 +388,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -372,7 +412,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslValueLteMax(x->level[i].forward->score, range)) { /* Count the rank of the last element in range. 
*/ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -464,8 +504,8 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned x = zsl->header; for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) < start) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) < start) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -499,7 +539,7 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) <= 0))) { - rank += x->level[i].span; + rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } @@ -511,6 +551,18 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { return 0; } +/* Find the rank for a specific skiplist node. */ +unsigned long zslGetRankByNode(zskiplist *zsl, zskiplistNode *x) { + int i = zslGetNodeHeight(x) - 1; + unsigned long rank = zslGetNodeSpanAtLevel(x, i); + while (x->level[zslGetNodeHeight(x) - 1].forward) { + x = x->level[zslGetNodeHeight(x) - 1].forward; + rank += zslGetNodeSpanAtLevel(x, zslGetNodeHeight(x) - 1); + } + rank = zsl->length - rank; + return rank; +} + /* Finds an element by its rank from start node. The rank argument needs to be 1-based. */ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { zskiplistNode *x; @@ -519,8 +571,8 @@ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_ x = start_node; for (i = start_level; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) <= rank) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) <= rank) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } if (traversed == rank) { @@ -690,7 +742,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -702,7 +754,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -726,7 +778,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslLexValueLteMax(x->level[i].forward->ele, range)) { /* Count the rank of the last element in range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -1173,6 +1225,13 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig * Common sorted set API *----------------------------------------------------------------------------*/ +/* Utility function used for mapping the hashtable entry to the matching skiplist node. 
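+ * The dict value points at the 'score' field embedded in the skiplist node,
+ * so subtracting offsetof(zskiplistNode, score) recovers the node itself.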
+ * For example, this is used in case of ZRANK query. */ +static inline zskiplistNode *zsetGetSLNodeByEntry(dictEntry *de) { + char *score_ref = ((char *)dictGetVal(de)); + return (zskiplistNode *)(score_ref - offsetof(zskiplistNode, score)); +} + unsigned long zsetLength(const robj *zobj) { unsigned long length = 0; if (zobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -1603,8 +1662,9 @@ long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { de = dictFind(zs->dict, ele); if (de != NULL) { - score = *(double *)dictGetVal(de); - rank = zslGetRank(zsl, score, ele); + zskiplistNode *n = zsetGetSLNodeByEntry(de); + score = n->score; + rank = zslGetRankByNode(zsl, n); /* Existing elements always have a rank. */ serverAssert(rank != 0); if (output_score) *output_score = score; From 1ba85d002a824a12b0107bdd2b493a3a0516cec9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Dec 2024 00:37:04 +0800 Subject: [PATCH 26/73] Use binary representation in assert crash log, cleanup in crash log (#1410) Change assert crash log to also use binary representation like 5bdd72bea77d4bb237441c9a671e80edcdc998ad. And do not print the password in assert crash log like 56eef6fb5ab7a755485c19f358761954ca459472. In addition, for 5bdd72bea77d4bb237441c9a671e80edcdc998ad, we will print '"argv"', because originally the code would print a '', and sdscatrepr will add an extra "", so now removing the extra '' here. Extract the getArgvReprString method and clean up the code a bit. Examples: ``` debug assert "\x00abc" before: client->argv[0] = "debug" (refcount: 1) client->argv[1] = "assert" (refcount: 1) client->argv[2] = "" (refcount: 1) after: client->argv[0] = "debug" (refcount: 1) client->argv[1] = "assert" (refcount: 1) client->argv[2] = "\x00abc" (refcount: 1) debug panic "\x00abc" before: argc: '3' argv[0]: '"debug"' argv[1]: '"panic"' argv[2]: '"\x00abc"' after: argc: 3 argv[0]: "debug" argv[1]: "panic" argv[2]: "\x00abc" ``` Signed-off-by: Binbin --- src/debug.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/debug.c b/src/debug.c index 38b66dacb5..7407af3514 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1049,6 +1049,14 @@ __attribute__((noinline, weak)) void _serverAssert(const char *estr, const char bugReportEnd(0, 0); } +/* Returns the argv argument in binary representation, limited to length 128. */ +sds getArgvReprString(robj *argv) { + robj *decoded = getDecodedObject(argv); + sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128)); + decrRefCount(decoded); + return repr; +} + /* Checks if the argument at the given index should be redacted from logs. 
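 * Redacted arguments are logged only by their length, never their contents.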
 */
 int shouldRedactArg(const client *c, int idx) {
     serverAssert(idx < c->argc);
@@ -1073,16 +1081,12 @@ void _serverAssertPrintClientInfo(const client *c) {
             serverLog(LL_WARNING, "client->argv[%d]: %zu bytes", j, sdslen((sds)c->argv[j]->ptr));
             continue;
         }
-        char buf[128];
-        char *arg;
-
-        if (c->argv[j]->type == OBJ_STRING && sdsEncodedObject(c->argv[j])) {
-            arg = (char *)c->argv[j]->ptr;
-        } else {
-            snprintf(buf, sizeof(buf), "Object type: %u, encoding: %u", c->argv[j]->type, c->argv[j]->encoding);
-            arg = buf;
+        sds repr = getArgvReprString(c->argv[j]);
+        serverLog(LL_WARNING, "client->argv[%d] = %s (refcount: %d)", j, repr, c->argv[j]->refcount);
+        sdsfree(repr);
+        if (!strcasecmp(c->argv[j]->ptr, "auth") || !strcasecmp(c->argv[j]->ptr, "auth2")) {
+            break;
         }
-        serverLog(LL_WARNING, "client->argv[%d] = \"%s\" (refcount: %d)", j, arg, c->argv[j]->refcount);
     }
 }

@@ -1890,23 +1894,18 @@ void logCurrentClient(client *cc, const char *title) {
     client = catClientInfoString(sdsempty(), cc, server.hide_user_data_from_log);
     serverLog(LL_WARNING | LL_RAW, "%s\n", client);
     sdsfree(client);
-    serverLog(LL_WARNING | LL_RAW, "argc: '%d'\n", cc->argc);
+    serverLog(LL_WARNING | LL_RAW, "argc: %d\n", cc->argc);
     for (j = 0; j < cc->argc; j++) {
         if (shouldRedactArg(cc, j)) {
             serverLog(LL_WARNING | LL_RAW, "argv[%d]: %zu bytes\n", j, sdslen((sds)cc->argv[j]->ptr));
             continue;
         }
-        robj *decoded;
-        decoded = getDecodedObject(cc->argv[j]);
-        sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128));
-        serverLog(LL_WARNING | LL_RAW, "argv[%d]: '%s'\n", j, (char *)repr);
-        if (!strcasecmp(decoded->ptr, "auth") || !strcasecmp(decoded->ptr, "auth2")) {
-            sdsfree(repr);
-            decrRefCount(decoded);
+        sds repr = getArgvReprString(cc->argv[j]);
+        serverLog(LL_WARNING | LL_RAW, "argv[%d]: %s\n", j, repr);
+        sdsfree(repr);
+        if (!strcasecmp(cc->argv[j]->ptr, "auth") || !strcasecmp(cc->argv[j]->ptr, "auth2")) {
             break;
         }
-        sdsfree(repr);
-        decrRefCount(decoded);
     }
     /* Check if the first argument, usually a key, is found inside the
      * selected DB, and if so print info about the associated object. */

From 4f61034934cf165163ef272e5795bccadc288b09 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Mon, 9 Dec 2024 12:28:17 -0800
Subject: [PATCH 27/73] Update governance and maintainers file for Valkey
 committers (#1390)

We added two more committers, but according to our governance document
that makes them TSC members. As we discussed, for now we want to keep
the balance of corporate interests, so we are updating the governance
to explicitly list TSC members as distinct from folks with just write
permissions. Also adds the new folks with commit permissions.

---------

Signed-off-by: Madelyn Olson
---
 GOVERNANCE.md  |  4 +++-
 MAINTAINERS.md | 10 +++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/GOVERNANCE.md b/GOVERNANCE.md
index 33c3887430..7fd33272cb 100644
--- a/GOVERNANCE.md
+++ b/GOVERNANCE.md
@@ -2,7 +2,9 @@

 The Valkey project is managed by a Technical Steering Committee (TSC) composed of the maintainers of the Valkey repository.
 The Valkey project includes all of the current and future repositories under the Valkey-io organization.
-Maintainers are defined as individuals with full commit access to a repository, which shall be in sync with the MAINTAINERS.md file in a given projects repository.
+Committers are defined as individuals with write access to the code within a repository.
+Maintainers are defined as individuals with full access to a repository and own its governance.
+Both maintainers and committers should be clearly listed in the MAINTAINERS.md file in a given projects repository. Maintainers of other repositories within the Valkey project are not members of the TSC unless explicitly added. ## Technical Steering Committee diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 635bf25067..947979eb33 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -16,8 +16,16 @@ Maintainers listed in alphabetical order by their github ID. | Zhao Zhao | [soloestoy](https://github.com/soloestoy) | Alibaba | | Viktor Söderqvist | [zuiderkwast](https://github.com/zuiderkwast) | Ericsson | +## Current Committers -### Former Maintainers +Committers listed in alphabetical order by their github ID. + +| Committer | GitHub ID | Affiliation | +| ------------------- | ----------------------------------------------- | ----------- | +| Harkrishn Patro | [hpatro](https://github.com/hpatro) | Amazon | +| Ran Shidlansik | [ranshid](https://github.com/ranshid) | Amazon | + +### Former Maintainers and Committers | Maintainer | GitHub ID | Affiliation | | ------------------- | ----------------------------------------------- | ----------- | \ No newline at end of file From 9cfe1b3d81466ed324c28e55ba60be66dea0b7c9 Mon Sep 17 00:00:00 2001 From: Sarthak Aggarwal Date: Tue, 10 Dec 2024 03:54:49 -0800 Subject: [PATCH 28/73] Set Command with IFEQ Support (#1324) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR allows the Valkey users to perform conditional updates where the SET command is completed if the given comparison-value matches the key’s current value. Syntax: ``` SET key value IFEQ comparison-value ``` Behavior: If the values match, the SET completes as expected. If they do not match, the command returns a (nil), except if the GET argument is also given (see below). Behavior with Additional Flags: 1. ```SET key value IFEQ comparison-value GET``` returns the existing value, regardless of whether it matches comparison-value or not. The conditional set operation is performed if the given comparison value matches the existing value. To check if the SET succeeded, the caller needs to check if the returned string matches the comparison-value. 2. ```SET key value IFEQ comparison-value XX``` is a syntax error. 3. ```SET key value IFEQ comparison-value NX``` is a syntax error. 
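To make the decision table above concrete, here is a condensed sketch of
the IFEQ check, paraphrasing the setGenericCommand() change in the diff
below. It is a simplification, not the literal patch code; reply and
type-check details are trimmed:

```c
/* IFEQ: proceed with the SET only when the key exists, holds a string,
 * and its current value equals the comparison value. */
robj *existing = lookupKeyWrite(c->db, key);
if (flags & OBJ_SET_IFEQ) {
    /* Without GET, a non-string current value is a WRONGTYPE error. */
    if (existing && !(flags & OBJ_SET_GET) && checkType(c, existing, OBJ_STRING)) return;
    if (existing == NULL || compareStringObjects(existing, comparison) != 0) {
        /* Without GET, reply (nil); with GET, the old value was already sent. */
        if (!(flags & OBJ_SET_GET)) addReply(c, shared.null[c->resp]);
        return;
    }
}
/* ...fall through to the normal SET path... */
```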
Closes: #1215 --------- Signed-off-by: Sarthak Aggarwal --- src/commands.def | 10 ++-- src/commands/set.json | 25 +++++++-- src/t_string.c | 79 ++++++++++++++++++++-------- tests/assets/test_cli_hint_suite.txt | 22 ++++---- tests/unit/type/string.tcl | 50 ++++++++++++++++++ 5 files changed, 145 insertions(+), 41 deletions(-) diff --git a/src/commands.def b/src/commands.def index 1ac2368ee1..f03e44db9f 100644 --- a/src/commands.def +++ b/src/commands.def @@ -10632,6 +10632,7 @@ commandHistory SET_History[] = { {"6.0.0","Added the `KEEPTTL` option."}, {"6.2.0","Added the `GET`, `EXAT` and `PXAT` option."}, {"7.0.0","Allowed the `NX` and `GET` options to be used together."}, +{"8.1.0","Added the `IFEQ` option."}, }; #endif @@ -10649,8 +10650,9 @@ keySpec SET_Keyspecs[1] = { /* SET condition argument table */ struct COMMAND_ARG SET_condition_Subargs[] = { -{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("nx",ARG_TYPE_PURE_TOKEN,-1,"NX",NULL,"2.6.12",CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("xx",ARG_TYPE_PURE_TOKEN,-1,"XX",NULL,"2.6.12",CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("comparison-value",ARG_TYPE_STRING,-1,"IFEQ","Sets the key's value only if the current value matches the specified comparison value.","8.1.0",CMD_ARG_NONE,0,NULL)}, }; /* SET expiration argument table */ @@ -10666,7 +10668,7 @@ struct COMMAND_ARG SET_expiration_Subargs[] = { struct COMMAND_ARG SET_Args[] = { {MAKE_ARG("key",ARG_TYPE_KEY,0,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, {MAKE_ARG("value",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,"2.6.12",CMD_ARG_OPTIONAL,2,NULL),.subargs=SET_condition_Subargs}, +{MAKE_ARG("condition",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,3,NULL),.subargs=SET_condition_Subargs}, {MAKE_ARG("get",ARG_TYPE_PURE_TOKEN,-1,"GET",NULL,"6.2.0",CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("expiration",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,5,NULL),.subargs=SET_expiration_Subargs}, }; @@ -11139,7 +11141,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("mset","Atomically creates or modifies the string values of one or more keys.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSET_History,0,MSET_Tips,2,msetCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSET_Keyspecs,1,NULL,1),.args=MSET_Args}, {MAKE_CMD("msetnx","Atomically modifies the string values of one or more keys only when all keys don't exist.","O(N) where N is the number of keys to set.","1.0.1",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,MSETNX_History,0,MSETNX_Tips,0,msetnxCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,MSETNX_Keyspecs,1,NULL,1),.args=MSETNX_Args}, {MAKE_CMD("psetex","Sets both string value and expiration time in milliseconds of a key. The key is created if it doesn't exist.","O(1)","2.6.0",CMD_DOC_DEPRECATED,"`SET` with the `PX` argument","2.6.12","string",COMMAND_GROUP_STRING,PSETEX_History,0,PSETEX_Tips,0,psetexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,PSETEX_Keyspecs,1,NULL,3),.args=PSETEX_Args}, -{MAKE_CMD("set","Sets the string value of a key, ignoring its type. 
The key is created if it doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SET_History,4,SET_Tips,0,setCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SET_Keyspecs,1,setGetKeys,5),.args=SET_Args}, +{MAKE_CMD("set","Sets the string value of a key, ignoring its type. The key is created if it doesn't exist.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SET_History,5,SET_Tips,0,setCommand,-3,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SET_Keyspecs,1,setGetKeys,5),.args=SET_Args}, {MAKE_CMD("setex","Sets the string value and expiration time of a key. Creates the key if it doesn't exist.","O(1)","2.0.0",CMD_DOC_DEPRECATED,"`SET` with the `EX` argument","2.6.12","string",COMMAND_GROUP_STRING,SETEX_History,0,SETEX_Tips,0,setexCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SETEX_Keyspecs,1,NULL,3),.args=SETEX_Args}, {MAKE_CMD("setnx","Set the string value of a key only when the key doesn't exist.","O(1)","1.0.0",CMD_DOC_DEPRECATED,"`SET` with the `NX` argument","2.6.12","string",COMMAND_GROUP_STRING,SETNX_History,0,SETNX_Tips,0,setnxCommand,3,CMD_WRITE|CMD_DENYOOM|CMD_FAST,ACL_CATEGORY_STRING,SETNX_Keyspecs,1,NULL,2),.args=SETNX_Args}, {MAKE_CMD("setrange","Overwrites a part of a string value with another by an offset. Creates the key if it doesn't exist.","O(1), not counting the time taken to copy the new string in place. Usually, this string is very small so the amortized complexity is O(1). Otherwise, complexity is O(M) with M being the length of the value argument.","2.2.0",CMD_DOC_NONE,NULL,NULL,"string",COMMAND_GROUP_STRING,SETRANGE_History,0,SETRANGE_Tips,0,setrangeCommand,4,CMD_WRITE|CMD_DENYOOM,ACL_CATEGORY_STRING,SETRANGE_Keyspecs,1,NULL,3),.args=SETRANGE_Args}, diff --git a/src/commands/set.json b/src/commands/set.json index 8236bc7bb9..3d3800f11d 100644 --- a/src/commands/set.json +++ b/src/commands/set.json @@ -23,6 +23,10 @@ [ "7.0.0", "Allowed the `NX` and `GET` options to be used together." + ], + [ + "8.1.0", + "Added the `IFEQ` option." ] ], "command_flags": [ @@ -89,17 +93,32 @@ "name": "condition", "type": "oneof", "optional": true, - "since": "2.6.12", "arguments": [ { "name": "nx", "type": "pure-token", - "token": "NX" + "token": "NX", + "since": "2.6.12" }, { "name": "xx", "type": "pure-token", - "token": "XX" + "token": "XX", + "since": "2.6.12" + }, + { + "name": "comparison-value", + "type": "string", + "token": "IFEQ", + "since": "8.1.0", + "summary": "Sets the key's value only if the current value matches the specified comparison value.", + "arguments": [ + { + "name": "comparison-value", + "type": "string", + "summary": "The value to compare with the current key's value before setting." + } + ] } ] }, diff --git a/src/t_string.c b/src/t_string.c index 1c90eabf3e..0dfebee038 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -67,15 +67,16 @@ static int checkStringLength(client *c, long long size, long long append) { * If abort_reply is NULL, "$-1" is used. */ #define OBJ_NO_FLAGS 0 -#define OBJ_SET_NX (1 << 0) /* Set if key not exists. */ -#define OBJ_SET_XX (1 << 1) /* Set if key exists. 
*/ -#define OBJ_EX (1 << 2) /* Set if time in seconds is given */ -#define OBJ_PX (1 << 3) /* Set if time in ms in given */ -#define OBJ_KEEPTTL (1 << 4) /* Set and keep the ttl */ -#define OBJ_SET_GET (1 << 5) /* Set if want to get key before set */ -#define OBJ_EXAT (1 << 6) /* Set if timestamp in second is given */ -#define OBJ_PXAT (1 << 7) /* Set if timestamp in ms is given */ -#define OBJ_PERSIST (1 << 8) /* Set if we need to remove the ttl */ +#define OBJ_SET_NX (1 << 0) /* Set if key not exists. */ +#define OBJ_SET_XX (1 << 1) /* Set if key exists. */ +#define OBJ_EX (1 << 2) /* Set if time in seconds is given */ +#define OBJ_PX (1 << 3) /* Set if time in ms in given */ +#define OBJ_KEEPTTL (1 << 4) /* Set and keep the ttl */ +#define OBJ_SET_GET (1 << 5) /* Set if want to get key before set */ +#define OBJ_EXAT (1 << 6) /* Set if timestamp in second is given */ +#define OBJ_PXAT (1 << 7) /* Set if timestamp in ms is given */ +#define OBJ_PERSIST (1 << 8) /* Set if we need to remove the ttl */ +#define OBJ_SET_IFEQ (1 << 9) /* Set if we need compare and set */ /* Forward declaration */ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int unit, long long *milliseconds); @@ -87,7 +88,8 @@ void setGenericCommand(client *c, robj *expire, int unit, robj *ok_reply, - robj *abort_reply) { + robj *abort_reply, + robj *comparison) { long long milliseconds = 0; /* initialized to avoid any harmness warning */ int found = 0; int setkey_flags = 0; @@ -100,7 +102,27 @@ void setGenericCommand(client *c, if (getGenericCommand(c) == C_ERR) return; } - found = (lookupKeyWrite(c->db, key) != NULL); + robj *existing_value = lookupKeyWrite(c->db, key); + found = existing_value != NULL; + + /* Handle the IFEQ conditional check */ + if (flags & OBJ_SET_IFEQ && found) { + if (!(flags & OBJ_SET_GET) && checkType(c, existing_value, OBJ_STRING)) { + return; + } + + if (compareStringObjects(existing_value, comparison) != 0) { + if (!(flags & OBJ_SET_GET)) { + addReply(c, abort_reply ? abort_reply : shared.null[c->resp]); + } + return; + } + } else if (flags & OBJ_SET_IFEQ && !found) { + if (!(flags & OBJ_SET_GET)) { + addReply(c, abort_reply ? abort_reply : shared.null[c->resp]); + } + return; + } if ((flags & OBJ_SET_NX && found) || (flags & OBJ_SET_XX && !found)) { if (!(flags & OBJ_SET_GET)) { @@ -208,7 +230,7 @@ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int * string arguments used in SET and GET command. * * Get specific commands - PERSIST/DEL - * Set specific commands - XX/NX/GET + * Set specific commands - XX/NX/GET/IFEQ * Common commands - EX/EXAT/PX/PXAT/KEEPTTL * * Function takes pointers to client, flags, unit, pointer to pointer of expire obj if needed @@ -219,7 +241,7 @@ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int * Input flags are updated upon parsing the arguments. Unit and expire are updated if there are any * EX/EXAT/PX/PXAT arguments. Unit is updated to millisecond if PX/PXAT is set. */ -int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj **expire, int command_type) { +int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj **expire, robj **compare_val, int command_type) { int j = command_type == COMMAND_GET ? 
2 : 3; for (; j < c->argc; j++) { char *opt = c->argv[j]->ptr; @@ -228,14 +250,23 @@ int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj * /* clang-format off */ if ((opt[0] == 'n' || opt[0] == 'N') && (opt[1] == 'x' || opt[1] == 'X') && opt[2] == '\0' && - !(*flags & OBJ_SET_XX) && (command_type == COMMAND_SET)) + !(*flags & OBJ_SET_XX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) { *flags |= OBJ_SET_NX; } else if ((opt[0] == 'x' || opt[0] == 'X') && (opt[1] == 'x' || opt[1] == 'X') && opt[2] == '\0' && - !(*flags & OBJ_SET_NX) && (command_type == COMMAND_SET)) + !(*flags & OBJ_SET_NX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) { *flags |= OBJ_SET_XX; + } else if ((opt[0] == 'i' || opt[0] == 'I') && + (opt[1] == 'f' || opt[1] == 'F') && + (opt[2] == 'e' || opt[2] == 'E') && + (opt[3] == 'q' || opt[3] == 'Q') && opt[4] == '\0' && + next && !(*flags & OBJ_SET_NX || *flags & OBJ_SET_XX || *flags & OBJ_SET_IFEQ) && (command_type == COMMAND_SET)) + { + *flags |= OBJ_SET_IFEQ; + *compare_val = next; + j++; } else if ((opt[0] == 'g' || opt[0] == 'G') && (opt[1] == 'e' || opt[1] == 'E') && (opt[2] == 't' || opt[2] == 'T') && opt[3] == '\0' && @@ -304,34 +335,36 @@ int parseExtendedStringArgumentsOrReply(client *c, int *flags, int *unit, robj * return C_OK; } -/* SET key value [NX] [XX] [KEEPTTL] [GET] [EX ] [PX ] - * [EXAT ][PXAT ] */ +/* SET key value [NX | XX | IFEQ comparison-value] [GET] + * [EX seconds | PX milliseconds | + * EXAT seconds-timestamp | PXAT milliseconds-timestamp | KEEPTTL] */ void setCommand(client *c) { robj *expire = NULL; + robj *comparison = NULL; int unit = UNIT_SECONDS; int flags = OBJ_NO_FLAGS; - if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, COMMAND_SET) != C_OK) { + if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, &comparison, COMMAND_SET) != C_OK) { return; } c->argv[2] = tryObjectEncoding(c->argv[2]); - setGenericCommand(c, flags, c->argv[1], c->argv[2], expire, unit, NULL, NULL); + setGenericCommand(c, flags, c->argv[1], c->argv[2], expire, unit, NULL, NULL, comparison); } void setnxCommand(client *c) { c->argv[2] = tryObjectEncoding(c->argv[2]); - setGenericCommand(c, OBJ_SET_NX, c->argv[1], c->argv[2], NULL, 0, shared.cone, shared.czero); + setGenericCommand(c, OBJ_SET_NX, c->argv[1], c->argv[2], NULL, 0, shared.cone, shared.czero, NULL); } void setexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_EX, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL); + setGenericCommand(c, OBJ_EX, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL, NULL); } void psetexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_PX, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL); + setGenericCommand(c, OBJ_PX, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL, NULL); } int getGenericCommand(client *c) { @@ -377,7 +410,7 @@ void getexCommand(client *c) { int unit = UNIT_SECONDS; int flags = OBJ_NO_FLAGS; - if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, COMMAND_GET) != C_OK) { + if (parseExtendedStringArgumentsOrReply(c, &flags, &unit, &expire, NULL, COMMAND_GET) != C_OK) { return; } diff --git a/tests/assets/test_cli_hint_suite.txt b/tests/assets/test_cli_hint_suite.txt index 3cebf5229c..b8cfb0fdf1 100644 --- a/tests/assets/test_cli_hint_suite.txt +++ b/tests/assets/test_cli_hint_suite.txt @@ -68,17 +68,17 @@ "ZRANGE k 1 2 WITHSCORES " 
"[BYSCORE|BYLEX] [REV] [LIMIT offset count]" # Optional one-of args with parameters: SET key value [NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL] -"SET key value " "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EX" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EX " "seconds [NX|XX] [GET]" -"SET key value EX 23 " "[NX|XX] [GET]" -"SET key value EXAT" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value EXAT " "unix-time-seconds [NX|XX] [GET]" -"SET key value PX" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value PX " "milliseconds [NX|XX] [GET]" -"SET key value PXAT" "[NX|XX] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" -"SET key value PXAT " "unix-time-milliseconds [NX|XX] [GET]" -"SET key value KEEPTTL " "[NX|XX] [GET]" +"SET key value " "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EX" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EX " "seconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value EX 23 " "[NX|XX|IFEQ comparison-value] [GET]" +"SET key value EXAT" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value EXAT " "unix-time-seconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value PX" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value PX " "milliseconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value PXAT" "[NX|XX|IFEQ comparison-value] [GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" +"SET key value PXAT " "unix-time-milliseconds [NX|XX|IFEQ comparison-value] [GET]" +"SET key value KEEPTTL " "[NX|XX|IFEQ comparison-value] [GET]" "SET key value XX " "[GET] [EX seconds|PX milliseconds|EXAT unix-time-seconds|PXAT unix-time-milliseconds|KEEPTTL]" # If an input word can't be matched, stop hinting. 
diff --git a/tests/unit/type/string.tcl b/tests/unit/type/string.tcl index d7969b5b3e..bbfb30b60d 100644 --- a/tests/unit/type/string.tcl +++ b/tests/unit/type/string.tcl @@ -582,6 +582,56 @@ if {[string match {*jemalloc*} [s mem_allocator]]} { set err1 } {*WRONGTYPE*} + test "SET with IFEQ conditional" { + r del foo + + r set foo "initial_value" + + assert_equal {OK} [r set foo "new_value" ifeq "initial_value"] + assert_equal "new_value" [r get foo] + + assert_equal {} [r set foo "should_not_set" ifeq "wrong_value"] + assert_equal "new_value" [r get foo] + } + + test "SET with IFEQ conditional - non-string current value" { + r del foo + + r sadd foo "some_set_value" + assert_error {WRONGTYPE Operation against a key holding the wrong kind of value} {r set foo "new_value" ifeq "some_set_value"} + } + + + test "SET with IFEQ conditional - with get" { + r del foo + + assert_equal {} [r set foo "new_value" ifeq "initial_value" get] + assert_equal {} [r get foo] + + r set foo "initial_value" + + assert_equal "initial_value" [r set foo "new_value" ifeq "initial_value" get] + assert_equal "new_value" [r get foo] + } + + test "SET with IFEQ conditional - non string current value with get" { + r del foo + + r sadd foo "some_set_value" + + assert_error {WRONGTYPE Operation against a key holding the wrong kind of value} {r set foo "new_value" ifeq "initial_value" get} + } + + test "SET with IFEQ conditional - with xx" { + r del foo + assert_error {ERR syntax error} {r set foo "new_value" ifeq "initial_value" xx} + } + + test "SET with IFEQ conditional - with nx" { + r del foo + assert_error {ERR syntax error} {r set foo "new_value" ifeq "initial_value" nx} + } + test {Extended SET EX option} { r del foo r set foo bar ex 10 From f951a1ca730c5b00b1c1d8e1590bc7f4c2d7d5c2 Mon Sep 17 00:00:00 2001 From: Yanqi Lv Date: Tue, 10 Dec 2024 20:35:07 +0800 Subject: [PATCH 29/73] Add new flag in `CLIENT LIST` for import-source client (#1398) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add new flag "I" in `CLIENT LIST` for import-source client - Add `DEBUG_CONFIG` for import-mode - Allow import-source status to be turned off when import-mode is off Fixes #1350 and https://github.com/valkey-io/valkey/pull/1185#discussion_r1851049362. 
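The key behavioral change, condensed from the clientCommand() diff below
(a sketch of the guard, not the full handler): turning import-source OFF
is now always allowed, and only turning it ON still requires import-mode
to be enabled.

```c
/* CLIENT IMPORT-SOURCE ON|OFF: reject only the ON case without import-mode. */
if (!server.import_mode && strcasecmp(c->argv[2]->ptr, "off") != 0) {
    addReplyError(c, "Server is not in import mode");
    return;
}
```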
--------- Signed-off-by: lvyanqi.lyq Signed-off-by: Yanqi Lv Co-authored-by: Viktor Söderqvist Co-authored-by: Binbin --- src/config.c | 2 +- src/networking.c | 3 ++- tests/unit/expire.tcl | 13 +++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/config.c b/src/config.c index 5a07c2c0f0..dcb5a99ce3 100644 --- a/src/config.c +++ b/src/config.c @@ -3209,7 +3209,7 @@ standardConfig static_configs[] = { createBoolConfig("enable-debug-assert", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, server.enable_debug_assert, 0, NULL, NULL), createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL), createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), - createBoolConfig("import-mode", NULL, MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), + createBoolConfig("import-mode", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), diff --git a/src/networking.c b/src/networking.c index debd94ddfc..4d386d6dc4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3340,6 +3340,7 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { if (client->flag.readonly) *p++ = 'r'; if (client->flag.no_evict) *p++ = 'e'; if (client->flag.no_touch) *p++ = 'T'; + if (client->flag.import_source) *p++ = 'I'; if (p == flags) *p++ = 'N'; *p++ = '\0'; @@ -4101,7 +4102,7 @@ void clientCommand(client *c) { addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "import-source")) { /* CLIENT IMPORT-SOURCE ON|OFF */ - if (!server.import_mode) { + if (!server.import_mode && strcasecmp(c->argv[2]->ptr, "off")) { addReplyError(c, "Server is not in import mode"); return; } diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index 941acfad38..c5c11191c0 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -833,6 +833,19 @@ start_server {tags {"expire"}} { assert_equal [r debug set-active-expire 1] {OK} } {} {needs:debug} + test {import-source can be closed when import-mode is off} { + r config set import-mode no + assert_error "ERR Server is not in import mode" {r client import-source on} + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + assert_match {*flags=I*} [r client list id [r client id]] + + r config set import-mode no + assert_equal [r client import-source off] {OK} + assert_match {*flags=N*} [r client list id [r client id]] + } + test {Import mode should forbid active expiration} { r flushall From 2dfe25b40839bb7e904d83622b09b999b25fb160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Tue, 10 Dec 2024 14:46:21 +0100 Subject: [PATCH 30/73] Fix race in test "CLUSTER SLOT-STATS cpu-usec for blocking commands, unblocked on timeout" (#1416) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fix changes the timeout for BLPOP in this test case from 1 second to 0.5 seconds. In the test case quoted below, the procedure `wait_for_blocked_clients_count` waits for one second by default. If BLPOP has 1 second timeout and the first `wait_for_blocked_clients_count` finishes very fast, then the second `wait_for_blocked_clients_count` can time out before the BLPOP has been unblocked. ```TCL test "CLUSTER SLOT-STATS cpu-usec for blocking commands, unblocked on timeout." 
{ # Blocking command with 1 second timeout. set rd [valkey_deferring_client] $rd BLPOP $key 1 # Confirm that the client is blocked, then unblocked after 1 second timeout. wait_for_blocked_clients_count 1 wait_for_blocked_clients_count 0 ``` As seen in the definition of `wait_for_blocked_clients_count`, the total time to wait is 1 second by default. ```TCL proc wait_for_blocked_clients_count {count {maxtries 100} {delay 10} {idx 0}} { wait_for_condition $maxtries $delay { [s $idx blocked_clients] == $count } else { fail "Timeout waiting for blocked clients" } } ``` Fixes #1121 Signed-off-by: Viktor Söderqvist --- tests/unit/cluster/slot-stats.tcl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/cluster/slot-stats.tcl b/tests/unit/cluster/slot-stats.tcl index 3e3487a612..99f9c1c03a 100644 --- a/tests/unit/cluster/slot-stats.tcl +++ b/tests/unit/cluster/slot-stats.tcl @@ -228,11 +228,11 @@ start_cluster 1 0 {tags {external:skip cluster} overrides {cluster-slot-stats-en R 0 FLUSHALL test "CLUSTER SLOT-STATS cpu-usec for blocking commands, unblocked on timeout." { - # Blocking command with 1 second timeout. + # Blocking command with 0.5 seconds timeout. set rd [valkey_deferring_client] - $rd BLPOP $key 1 + $rd BLPOP $key 0.5 - # Confirm that the client is blocked, then unblocked after 1 second timeout. + # Confirm that the client is blocked, then unblocked within 1 second. wait_for_blocked_clients_count 1 wait_for_blocked_clients_count 0 @@ -971,4 +971,4 @@ start_cluster 1 1 {tags {external:skip cluster} overrides {cluster-slot-stats-en } R 0 CONFIG RESETSTAT R 1 CONFIG RESETSTAT -} \ No newline at end of file +} From 7e564887b93ea3d1008cd2ea2d2bb82c4a4b0a04 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 11 Dec 2024 00:37:18 +0800 Subject: [PATCH 31/73] Set HIDDEN_CONFIG flag on events-per-io-thread (#1408) events-per-io-thread is for testing purposes that allow us to force the main thread to always offload the works to the IO threads, see adjustIOThreadsByEventLoad for more details. Signed-off-by: Binbin --- src/config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index dcb5a99ce3..9ea28298d7 100644 --- a/src/config.c +++ b/src/config.c @@ -3271,7 +3271,7 @@ standardConfig static_configs[] = { createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL), createIntConfig("port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.port, 6379, INTEGER_CONFIG, NULL, updatePort), /* TCP port. */ createIntConfig("io-threads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, 1, IO_THREADS_MAX_NUM, server.io_threads_num, 1, INTEGER_CONFIG, NULL, NULL), /* Single threaded by default */ - createIntConfig("events-per-io-thread", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.events_per_io_thread, 2, INTEGER_CONFIG, NULL, NULL), + createIntConfig("events-per-io-thread", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.events_per_io_thread, 2, INTEGER_CONFIG, NULL, NULL), createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG, 0, 128, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL), createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_replica_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* replica max data age factor. 
*/ From b4c2a1804a20211a0cca634393d689e5bc96ddf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Tue, 10 Dec 2024 19:52:06 +0100 Subject: [PATCH 32/73] Fix flaky init_test proc in maxmemory test suite (#1419) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following error has been seen, but not reliably reproduced: ``` *** [err]: eviction due to output buffers of pubsub, client eviction: true in tests/unit/maxmemory.tcl Expected '42' to be equal to '50' (context: type proc line 17 cmd {assert_equal [r dbsize] 50} proc ::init_test level 2) ``` The reason is probably that FLUSHDB is asynchronous and when we start populating new keys, they are evicted because the background flush is too slow. Changing this to FLUSHDB SYNC prevents this. Signed-off-by: Viktor Söderqvist --- tests/unit/maxmemory.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 89e9699a3e..3b0a44a156 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -4,7 +4,7 @@ start_server {tags {"maxmemory" "external:skip"}} { set server_pid [s process_id] proc init_test {client_eviction} { - r flushdb + r flushdb sync set prev_maxmemory_clients [r config get maxmemory-clients] if $client_eviction { @@ -628,4 +628,4 @@ start_server {tags {"maxmemory" "external:skip"}} { assert_equal [r dbsize] {0} } -} \ No newline at end of file +} From c8ee5c2c46c52bcb2b48e190a8f5131fdbcf355e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Mon, 18 Nov 2024 10:29:49 +0100 Subject: [PATCH 33/73] Hashtable implementation including unit tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A cache-line aware hash table with a user-defined key-value entry type, supporting incremental rehashing, scan, iterator, random sampling, incremental lookup and more... 
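For orientation, a hypothetical sketch of how a caller could describe its
entry type to the new hash table. The callback fields mirror what
hashtable.c dereferences below (hashFunction, keyCompare, entryGetKey);
the myEntry struct and the my* helpers are invented for illustration, and
any creation/insertion entry points (e.g. a hashtableCreate()) are assumed,
since they are not part of this excerpt:

```c
/* Hypothetical entry type: an sds key embedded in a key-value entry. */
typedef struct myEntry {
    sds key;
    long value;
} myEntry;

static uint64_t myHash(const void *key) {
    /* hashtableGenHashFunction() is provided by hashtable.c (see below). */
    return hashtableGenHashFunction((const char *)key, sdslen((sds)key));
}

static int myKeyCompare(const void *key1, const void *key2) {
    return sdscmp((sds)key1, (sds)key2); /* zero means equal */
}

static const void *myEntryGetKey(const void *entry) {
    return ((const myEntry *)entry)->key;
}

/* Callbacks left NULL fall back to the defaults seen in hashtable.c:
 * pointer equality, hashing the pointer bytes, entry used as its own key. */
hashtableType myType = {
    .hashFunction = myHash,
    .keyCompare = myKeyCompare,
    .entryGetKey = myEntryGetKey,
};
```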
Signed-off-by: Viktor Söderqvist --- cmake/Modules/SourceFiles.cmake | 1 + src/Makefile | 2 +- src/hashtable.c | 2138 +++++++++++++++++++++++++++++++ src/hashtable.h | 167 +++ src/unit/test_files.h | 18 + src/unit/test_hashtable.c | 869 +++++++++++++ 6 files changed, 3194 insertions(+), 1 deletion(-) create mode 100644 src/hashtable.c create mode 100644 src/hashtable.h create mode 100644 src/unit/test_hashtable.c diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake index c34ae644a2..1a754ff846 100644 --- a/cmake/Modules/SourceFiles.cmake +++ b/cmake/Modules/SourceFiles.cmake @@ -10,6 +10,7 @@ set(VALKEY_SERVER_SRCS ${CMAKE_SOURCE_DIR}/src/ae.c ${CMAKE_SOURCE_DIR}/src/anet.c ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/hashtable.c ${CMAKE_SOURCE_DIR}/src/kvstore.c ${CMAKE_SOURCE_DIR}/src/sds.c ${CMAKE_SOURCE_DIR}/src/zmalloc.c diff --git a/src/Makefile b/src/Makefile index 3b4ad0a2ef..8552deb3d9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -411,7 +411,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o hashtable.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o 
ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) diff --git a/src/hashtable.c b/src/hashtable.c new file mode 100644 index 0000000000..9d963b9ddc --- /dev/null +++ b/src/hashtable.c @@ -0,0 +1,2138 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +/* Hashtable + * ========= + * + * This is an implementation of a hash table with cache-line sized buckets. It's + * designed for speed and low memory overhead. It provides the following + * features: + * + * - Incremental rehashing using two tables. + * + * - Stateless iteration using 'scan'. + * + * - A hash table contains pointers to user-defined entries. An entry needs to + * contain a key. Other than that, the hash table implementation doesn't care + * what it contains. To use it as a set, an entry is just a key. Using as a + * key-value map requires combining key and value into an entry object and + * inserting this object into the hash table. A callback for fetching the key + * from within the entry object is provided by the caller when creating the + * hash table. + * + * - The entry type, key type, hash function and other properties are + * configurable as callbacks in a 'type' structure provided when creating a + * hash table. + * + * Conventions + * ----------- + * + * Functions and types are prefixed by "hashtable", macros by "HASHTABLE". Internal + * names don't use the prefix. Internal functions are 'static'. + * + * Credits + * ------- + * + * - The hashtable was designed by Viktor Söderqvist. + * - The bucket chaining is based on an idea by Madelyn Olson. + * - The cache-line sized bucket is inspired by ideas used in 'Swiss tables' + * (Benzaquen, Evlogimenos, Kulukundis, and Perepelitsa et. al.). + * - The incremental rehashing using two tables and much of the API is based on + * the design used in dict, designed by Salvatore Sanfilippo. + * - The original scan algorithm was designed by Pieter Noordhuis. + */ +#include "hashtable.h" +#include "serverassert.h" +#include "zmalloc.h" +#include "mt19937-64.h" +#include "monotonic.h" +#include "config.h" + +#include +#include +#include +#include +#include +#include + +/* The default hashing function uses the SipHash implementation in siphash.c. */ + +uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k); +uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k); + +/* --- Global variables --- */ + +static uint8_t hash_function_seed[16]; +static hashtableResizePolicy resize_policy = HASHTABLE_RESIZE_ALLOW; + +/* --- Fill factor --- */ + +/* We use a soft and a hard limit for the minimum and maximum fill factor. The + * hard limits are used when resizing should be avoided, according to the resize + * policy. Resizing is typically to be avoided when we have forked child process + * running. Then, we don't want to move too much memory around, since the fork + * is using copy-on-write. + * + * Even if we resize and start inserting new entries in the new table, we can + * avoid actively moving entries from the old table to the new table. When the + * resize policy is AVOID, we perform a step of incremental rehashing only on + * insertions and not on lookups. */ + +#define MAX_FILL_PERCENT_SOFT 100 +#define MAX_FILL_PERCENT_HARD 500 + +#define MIN_FILL_PERCENT_SOFT 13 +#define MIN_FILL_PERCENT_HARD 3 + +/* --- Hash function API --- */ + +/* The seed needs to be 16 bytes. 
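+ * The bytes are copied into a static internal buffer, so the caller's
+ * buffer does not need to outlive this call.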
*/ +void hashtableSetHashFunctionSeed(const uint8_t *seed) { + memcpy(hash_function_seed, seed, sizeof(hash_function_seed)); +} + +uint8_t *hashtableGetHashFunctionSeed(void) { + return hash_function_seed; +} + +uint64_t hashtableGenHashFunction(const char *buf, size_t len) { + return siphash((const uint8_t *)buf, len, hash_function_seed); +} + +uint64_t hashtableGenCaseHashFunction(const char *buf, size_t len) { + return siphash_nocase((const uint8_t *)buf, len, hash_function_seed); +} + +/* --- Global resize policy API --- */ + +/* The global resize policy is one of + * + * - HASHTABLE_RESIZE_ALLOW: Rehash as required for optimal performance. + * + * - HASHTABLE_RESIZE_AVOID: Don't rehash and move memory if it can be avoided; + * used when there is a fork running and we want to avoid affecting + * copy-on-write memory. + * + * - HASHTABLE_RESIZE_FORBID: Don't rehash at all. Used in a child process which + * doesn't add any keys. + * + * Incremental rehashing works in the following way: A new table is allocated + * and entries are incrementally moved from the old to the new table. + * + * To avoid affecting copy-on-write, we avoid rehashing when there is a forked + * child process. + * + * We don't completely forbid resizing the table but the fill factor is + * significantly larger when the resize policy is set to HASHTABLE_RESIZE_AVOID + * and we resize with incremental rehashing paused, so new entries are added to + * the new table and the old entries are rehashed only when the child process is + * done. + */ +void hashtableSetResizePolicy(hashtableResizePolicy policy) { + resize_policy = policy; +} + +/* --- Hash table layout --- */ + +#if SIZE_MAX == UINT64_MAX /* 64-bit version */ + +#define ENTRIES_PER_BUCKET 7 +#define BUCKET_BITS_TYPE uint8_t +#define BITS_NEEDED_TO_STORE_POS_WITHIN_BUCKET 3 + +/* Selecting the number of buckets. + * + * When resizing the table, we want to select an appropriate number of buckets + * without an expensive division. Division by a power of two is cheap, but any + * other division is expensive. We pick a fill factor to make division cheap for + * our choice of ENTRIES_PER_BUCKET. + * + * The number of buckets we want is NUM_ENTRIES / (ENTRIES_PER_BUCKET * FILL_FACTOR), + * rounded up. The fill is the number of entries we have, or want to put, in + * the table. + * + * Instead of the above fraction, we multiply by an integer BUCKET_FACTOR and + * divide by a power-of-two BUCKET_DIVISOR. This gives us a fill factor of at + * most MAX_FILL_PERCENT_SOFT, the soft limit for expanding. + * + * NUM_BUCKETS = ceil(NUM_ENTRIES * BUCKET_FACTOR / BUCKET_DIVISOR) + * + * This gives us + * + * FILL_FACTOR = NUM_ENTRIES / (NUM_BUCKETS * ENTRIES_PER_BUCKET) + * = 1 / (BUCKET_FACTOR / BUCKET_DIVISOR) / ENTRIES_PER_BUCKET + * = BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET + */ + +#define BUCKET_FACTOR 5 +#define BUCKET_DIVISOR 32 +/* When resizing, we get a fill of at most 91.43% (32 / 5 / 7). */ + +#define randomSizeT() ((size_t)genrand64_int64()) + +#elif SIZE_MAX == UINT32_MAX /* 32-bit version */ + +#define ENTRIES_PER_BUCKET 12 +#define BUCKET_BITS_TYPE uint16_t +#define BITS_NEEDED_TO_STORE_POS_WITHIN_BUCKET 4 +#define BUCKET_FACTOR 3 +#define BUCKET_DIVISOR 32 +/* When resizing, we get a fill of at most 88.89% (32 / 3 / 12). 
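+ * Worked example (32-bit): to hold 1000 entries, min_buckets =
+ * ceil(1000 * 3 / 32) = 94, rounded up to the next power of two = 128
+ * buckets = 1536 slots, i.e. a fill of ~65% right after resizing.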
*/ + +#define randomSizeT() ((size_t)random()) + +#else +#error "Only 64-bit or 32-bit architectures are supported" +#endif /* 64-bit vs 32-bit version */ + +#ifndef static_assert +#define static_assert _Static_assert +#endif + +static_assert(100 * BUCKET_DIVISOR / BUCKET_FACTOR / ENTRIES_PER_BUCKET <= MAX_FILL_PERCENT_SOFT, + "Expand must result in a fill below the soft max fill factor"); +static_assert(MAX_FILL_PERCENT_SOFT <= MAX_FILL_PERCENT_HARD, "Soft vs hard fill factor"); + +/* --- Random entry --- */ + +#define FAIR_RANDOM_SAMPLE_SIZE (ENTRIES_PER_BUCKET * 40) +#define WEAK_RANDOM_SAMPLE_SIZE ENTRIES_PER_BUCKET + +/* --- Types --- */ + +/* Design + * ------ + * + * We use a design with buckets of 64 bytes (one cache line). Each bucket + * contains metadata and entry slots for a fixed number of entries. In a 64-bit + * system, there are up to 7 entries per bucket. These are unordered and an + * entry can be inserted in any of the free slots. Additionally, the bucket + * contains metadata for the entries. This includes a few bits of the hash of + * the key of each entry, which are used to rule out false positives when + * looking up entries. + * + * Bucket chaining + * --------------- + * + * Each key hashes to a bucket in the hash table. If a bucket is full, the last + * entry is replaced by a pointer to a separately allocated child bucket. + * Child buckets form a bucket chain. + * + * Bucket Bucket Bucket + * -----+---------------+---------------+---------------+----- + * ... | x x x x x x p | x x x x x x x | x x x x x x x | ... + * -----+-------------|-+---------------+---------------+----- + * | + * v Child bucket + * +---------------+ + * | x x x x x x p | + * +-------------|-+ + * | + * v Child bucket + * +---------------+ + * | x x x x x x x | + * +---------------+ + * + * Bucket layout + * ------------- + * + * Within each bucket chain, the entries are unordered. To avoid false positives + * when looking up an entry, a few bits of the hash value is stored in a bucket + * metadata section in each bucket. The bucket metadata also contains a bit that + * indicates that the bucket has a child bucket. + * + * +------------------------------------------------------------------+ + * | Metadata | Entry | Entry | Entry | Entry | Entry | Entry | Entry | + * +------------------------------------------------------------------+ + * / ` - - . _ _ + * / `- - . _ _ + * / ` - . _ + * +----------------------------------------------+ + * | c ppppppp hash hash hash hash hash hash hash | + * +----------------------------------------------+ + * | | | + * | | One byte of hash for each entry position in the bucket. + * | | + * | Presence bits. One bit for each entry position, indicating if an + * | entry present or not. + * | + * Chained? One bit. If set, the last entry is a child bucket pointer. + * + * 64-bit version, 7 entries per bucket: + * + * 1 bit 7 bits [1 byte] x 7 [8 bytes] x 7 = 64 bytes + * chained presence hashes entries + * + * 32-bit version, 12 entries per bucket: + * + * 1 bit 12 bits 3 bits [1 byte] x 12 2 bytes [4 bytes] x 12 = 64 bytes + * chained presence unused hashes unused entries + */ + +typedef struct hashtableBucket { + BUCKET_BITS_TYPE chained : 1; + BUCKET_BITS_TYPE presence : ENTRIES_PER_BUCKET; + uint8_t hashes[ENTRIES_PER_BUCKET]; + void *entries[ENTRIES_PER_BUCKET]; +} bucket; + +/* A key property is that the bucket size is one cache line. 
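+ * (64-bit: 1 byte of 'chained' + 'presence' bits, 7 one-byte hashes and
+ * 7 eight-byte entry pointers = 1 + 7 + 56 = 64 bytes.)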
+ */
+static_assert(sizeof(bucket) == HASHTABLE_BUCKET_SIZE, "Bucket size mismatch");
+
+struct hashtable {
+    hashtableType *type;
+    ssize_t rehash_idx;        /* -1 = rehashing not in progress. */
+    bucket *tables[2];         /* 0 = main table, 1 = rehashing target. */
+    size_t used[2];            /* Number of entries in each table. */
+    int8_t bucket_exp[2];      /* Exponent for num buckets (num = 1 << exp). */
+    int16_t pause_rehash;      /* Non-zero = rehashing is paused */
+    int16_t pause_auto_shrink; /* Non-zero = automatic resizing disallowed. */
+    size_t child_buckets[2];   /* Number of allocated child buckets. */
+    void *metadata[];
+};
+
+typedef struct {
+    hashtable *hashtable;
+    bucket *bucket;
+    long index;
+    uint16_t pos_in_bucket;
+    uint8_t table;
+    uint8_t safe;
+    union {
+        /* Unsafe iterator fingerprint for misuse detection. */
+        uint64_t fingerprint;
+        /* Safe iterator temporary storage for bucket chain compaction. */
+        uint64_t last_seen_size;
+    };
+} iter;
+
+/* The opaque hashtableIterator is defined as a blob of bytes. */
+static_assert(sizeof(hashtableIterator) >= sizeof(iter),
+              "Opaque iterator size");
+
+/* Position, used by some hashtable functions such as two-phase insert and delete. */
+typedef struct {
+    bucket *bucket;
+    uint16_t pos_in_bucket;
+    uint16_t table_index;
+} position;
+
+static_assert(sizeof(hashtablePosition) >= sizeof(position),
+              "Opaque position size");
+
+/* State for incremental find. */
+typedef struct {
+    enum {
+        HASHTABLE_CHECK_ENTRY,
+        HASHTABLE_NEXT_ENTRY,
+        HASHTABLE_NEXT_BUCKET,
+        HASHTABLE_FOUND,
+        HASHTABLE_NOT_FOUND
+    } state;
+    short table;
+    short pos;
+    hashtable *hashtable;
+    bucket *bucket;
+    const void *key;
+    uint64_t hash;
+} incrementalFind;
+
+static_assert(sizeof(hashtableIncrementalFindState) >= sizeof(incrementalFind),
+              "Opaque incremental find state size");
+
+/* Struct used for stats functions. */
+struct hashtableStats {
+    int table_index;                /* 0 or 1 (old or new while rehashing). */
+    unsigned long toplevel_buckets; /* Number of buckets in table. */
+    unsigned long child_buckets;    /* Number of child buckets. */
+    unsigned long size;             /* Capacity of toplevel buckets. */
+    unsigned long used;             /* Number of entries in the table. */
+    unsigned long max_chain_len;    /* Length of longest bucket chain. */
+    unsigned long *clvector;        /* Chain length vector; entry i counts
+                                     * bucket chains of length i. */
+};
+
+/* Struct for sampling entries using scan, used by random key functions. */
+
+typedef struct {
+    unsigned size;  /* Size of the entries array. */
+    unsigned seen;  /* Number of entries seen. */
+    void **entries; /* Array of sampled entries.
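+                     * Filled via scan; when more entries are seen than fit,
+                     * existing samples are replaced reservoir-style (see
+                     * sampleEntriesScanFn).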
*/ +} scan_samples; + +/* --- Internal functions --- */ + +static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_bucket, int *table_index); + +static inline void freeEntry(hashtable *ht, void *entry) { + if (ht->type->entryDestructor) ht->type->entryDestructor(entry); +} + +static inline int compareKeys(hashtable *ht, const void *key1, const void *key2) { + if (ht->type->keyCompare != NULL) { + return ht->type->keyCompare(key1, key2); + } else { + return key1 != key2; + } +} + +static inline const void *entryGetKey(hashtable *ht, const void *entry) { + if (ht->type->entryGetKey != NULL) { + return ht->type->entryGetKey(entry); + } else { + return entry; + } +} + +static inline uint64_t hashKey(hashtable *ht, const void *key) { + if (ht->type->hashFunction != NULL) { + return ht->type->hashFunction(key); + } else { + return hashtableGenHashFunction((const char *)&key, sizeof(key)); + } +} + +static inline uint64_t hashEntry(hashtable *ht, const void *entry) { + return hashKey(ht, entryGetKey(ht, entry)); +} + + +/* For the hash bits stored in the bucket, we use the highest bits of the hash + * value, since these are not used for selecting the bucket. */ +static inline uint8_t highBits(uint64_t hash) { + return hash >> (CHAR_BIT * 7); +} + +static inline int numBucketPositions(bucket *b) { + return ENTRIES_PER_BUCKET - (b->chained ? 1 : 0); +} + +static inline int bucketIsFull(bucket *b) { + return b->presence == (1 << numBucketPositions(b)) - 1; +} + +/* Returns non-zero if the position within the bucket is occupied. */ +static inline int isPositionFilled(bucket *b, int position) { + return b->presence & (1 << position); +} +static void resetTable(hashtable *ht, int table_idx) { + ht->tables[table_idx] = NULL; + ht->used[table_idx] = 0; + ht->bucket_exp[table_idx] = -1; + ht->child_buckets[table_idx] = 0; +} + +/* Number of top-level buckets. */ +static inline size_t numBuckets(int exp) { + return exp == -1 ? 0 : (size_t)1 << exp; +} + +/* Bitmask for masking the hash value to get bucket index. */ +static inline size_t expToMask(int exp) { + return exp == -1 ? 0 : numBuckets(exp) - 1; +} + +/* Returns the 'exp', where num_buckets = 1 << exp. The number of + * buckets is a power of two. */ +static signed char nextBucketExp(size_t min_capacity) { + if (min_capacity == 0) return -1; + /* ceil(x / y) = floor((x - 1) / y) + 1 */ + size_t min_buckets = (min_capacity * BUCKET_FACTOR - 1) / BUCKET_DIVISOR + 1; + if (min_buckets >= SIZE_MAX / 2) return CHAR_BIT * sizeof(size_t) - 1; + if (min_buckets == 1) return 0; + return CHAR_BIT * sizeof(size_t) - __builtin_clzl(min_buckets - 1); +} + +/* Swaps the tables and frees the old table. */ +static void rehashingCompleted(hashtable *ht) { + if (ht->type->rehashingCompleted) ht->type->rehashingCompleted(ht); + if (ht->tables[0]) { + zfree(ht->tables[0]); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket) * numBuckets(ht->bucket_exp[0])); + } + } + ht->bucket_exp[0] = ht->bucket_exp[1]; + ht->tables[0] = ht->tables[1]; + ht->used[0] = ht->used[1]; + ht->child_buckets[0] = ht->child_buckets[1]; + resetTable(ht, 1); + ht->rehash_idx = -1; +} + +/* Reverse bits, adapted to use bswap, from + * https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */ +static size_t rev(size_t v) { +#if SIZE_MAX == UINT64_MAX + /* Swap odd and even bits. */ + v = ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1); + /* Swap consecutive pairs. 
*/
+    v = ((v >> 2) & 0x3333333333333333) | ((v & 0x3333333333333333) << 2);
+    /* Swap nibbles. */
+    v = ((v >> 4) & 0x0F0F0F0F0F0F0F0F) | ((v & 0x0F0F0F0F0F0F0F0F) << 4);
+    /* Reverse bytes. */
+    v = __builtin_bswap64(v);
+#else
+    /* 32-bit version. */
+    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
+    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
+    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
+    v = __builtin_bswap32(v);
+#endif
+    return v;
+}
+
+/* Advances a scan cursor to the next value. It increments the reverse bit
+ * representation of the masked bits of v. This algorithm was invented by Pieter
+ * Noordhuis. */
+size_t nextCursor(size_t v, size_t mask) {
+    v |= ~mask; /* Set the unmasked (high) bits. */
+    v = rev(v); /* Reverse. The unmasked bits are now the low bits. */
+    v++;        /* Increment the reversed cursor, flipping the unmasked bits to
+                 * 0 and incrementing the masked bits. */
+    v = rev(v); /* Reverse the bits back to normal. */
+    return v;
+}
+
+/* Returns the next bucket in a bucket chain, or NULL if there's no next. */
+static bucket *bucketNext(bucket *b) {
+    return b->chained ? b->entries[ENTRIES_PER_BUCKET - 1] : NULL;
+}
+
+/* Attempts to defrag bucket 'b' using the defrag callback function. If the
+ * defrag callback function returns a pointer to a new allocation, this pointer
+ * is returned and the 'prev' bucket is updated to point to the new allocation.
+ * Otherwise, the 'b' pointer is returned. */
+static bucket *bucketDefrag(bucket *prev, bucket *b, void *(*defragfn)(void *)) {
+    bucket *reallocated = defragfn(b);
+    if (reallocated == NULL) return b;
+    prev->entries[ENTRIES_PER_BUCKET - 1] = reallocated;
+    return reallocated;
+}
+
+/* Rehashes one bucket. */
+static void rehashBucket(hashtable *ht, bucket *b) {
+    int pos;
+    for (pos = 0; pos < numBucketPositions(b); pos++) {
+        if (!isPositionFilled(b, pos)) continue; /* empty */
+        void *entry = b->entries[pos];
+        uint8_t h2 = b->hashes[pos];
+        /* Insert into table 1. */
+        uint64_t hash;
+        /* When shrinking, it's possible to avoid computing the hash. We can
+         * just use idx as the hash. */
+        if (ht->bucket_exp[1] < ht->bucket_exp[0]) {
+            hash = ht->rehash_idx;
+        } else {
+            hash = hashEntry(ht, entry);
+        }
+        int pos_in_dst_bucket;
+        bucket *dst = findBucketForInsert(ht, hash, &pos_in_dst_bucket, NULL);
+        dst->entries[pos_in_dst_bucket] = entry;
+        dst->hashes[pos_in_dst_bucket] = h2;
+        dst->presence |= (1 << pos_in_dst_bucket);
+        ht->used[0]--;
+        ht->used[1]++;
+    }
+    /* Mark the source bucket as empty. */
+    b->presence = 0;
+}
+
+static void rehashStep(hashtable *ht) {
+    assert(hashtableIsRehashing(ht));
+    size_t idx = ht->rehash_idx;
+    bucket *b = &ht->tables[0][idx];
+    rehashBucket(ht, b);
+    if (b->chained) {
+        /* Rehash and free child buckets. */
+        bucket *next = bucketNext(b);
+        b->chained = 0;
+        b = next;
+        while (b != NULL) {
+            rehashBucket(ht, b);
+            next = bucketNext(b);
+            zfree(b);
+            if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket));
+            ht->child_buckets[0]--;
+            b = next;
+        }
+    }
+
+    /* Advance to the next bucket. */
+    ht->rehash_idx++;
+    if ((size_t)ht->rehash_idx >= numBuckets(ht->bucket_exp[0])) {
+        rehashingCompleted(ht);
+    }
+}
+
+/* Called internally on lookup and other reads to the table.
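+ * Each step rehashes one top-level bucket together with its entire chain,
+ * so the cost of rehashing is amortized across read operations.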
*/ +static inline void rehashStepOnReadIfNeeded(hashtable *ht) { + if (!hashtableIsRehashing(ht) || ht->pause_rehash) return; + if (resize_policy != HASHTABLE_RESIZE_ALLOW) return; + rehashStep(ht); +} + +/* When inserting or deleting, we first do a find (read) and rehash one step if + * resize policy is set to ALLOW, so here we only do it if resize policy is + * AVOID. The reason for doing it on insert and delete is to ensure that we + * finish rehashing before we need to resize the table again. */ +static inline void rehashStepOnWriteIfNeeded(hashtable *ht) { + if (!hashtableIsRehashing(ht) || ht->pause_rehash) return; + if (resize_policy != HASHTABLE_RESIZE_AVOID) return; + rehashStep(ht); +} + +/* Allocates a new table and initiates incremental rehashing if necessary. + * Returns 1 on resize (success), 0 on no resize (failure). If 0 is returned and + * 'malloc_failed' is provided, it is set to 1 if allocation failed. If + * 'malloc_failed' is not provided, an allocation failure triggers a panic. */ +static int resize(hashtable *ht, size_t min_capacity, int *malloc_failed) { + if (malloc_failed) *malloc_failed = 0; + + /* Adjust minimum size. We don't resize to zero currently. */ + if (min_capacity == 0) min_capacity = 1; + + /* Size of new table. */ + signed char exp = nextBucketExp(min_capacity); + size_t num_buckets = numBuckets(exp); + size_t new_capacity = num_buckets * ENTRIES_PER_BUCKET; + if (new_capacity < min_capacity || num_buckets * sizeof(bucket) < num_buckets) { + /* Overflow */ + return 0; + } + + signed char old_exp = ht->bucket_exp[hashtableIsRehashing(ht) ? 1 : 0]; + size_t alloc_size = num_buckets * sizeof(bucket); + if (exp == old_exp) { + /* Can't resize to same size. */ + return 0; + } + + if (ht->type->resizeAllowed) { + double fill_factor = (double)min_capacity / ((double)numBuckets(old_exp) * ENTRIES_PER_BUCKET); + if (fill_factor * 100 < MAX_FILL_PERCENT_HARD && !ht->type->resizeAllowed(alloc_size, fill_factor)) { + /* Resize callback says no. */ + return 0; + } + } + + /* We can't resize if rehashing is already ongoing. Fast-forward ongoing + * rehashing before we continue. This can happen only in exceptional + * scenarios, such as when many insertions are made while rehashing is + * paused. */ + if (hashtableIsRehashing(ht)) { + if (hashtableIsRehashingPaused(ht)) return 0; + while (hashtableIsRehashing(ht)) { + rehashStep(ht); + } + } + + /* Allocate the new hash table. */ + bucket *new_table; + if (malloc_failed) { + new_table = ztrycalloc(alloc_size); + if (new_table == NULL) { + *malloc_failed = 1; + return 0; + } + } else { + new_table = zcalloc(alloc_size); + } + if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, alloc_size); + ht->bucket_exp[1] = exp; + ht->tables[1] = new_table; + ht->used[1] = 0; + ht->rehash_idx = 0; + if (ht->type->rehashingStarted) ht->type->rehashingStarted(ht); + + /* If the old table was empty, the rehashing is completed immediately. */ + if (ht->tables[0] == NULL || ht->used[0] == 0) { + rehashingCompleted(ht); + } else if (ht->type->instant_rehashing) { + while (hashtableIsRehashing(ht)) { + rehashStep(ht); + } + } + return 1; +} + +/* Returns 1 if the table is expanded, 0 if not expanded. If 0 is returned and + * 'malloc_failed' is provided, it is set to 1 if malloc failed and 0 + * otherwise. */ +static int expand(hashtable *ht, size_t size, int *malloc_failed) { + if (size < hashtableSize(ht)) { + return 0; + } + return resize(ht, size, malloc_failed); +} + +/* Finds an entry matching the key. 
If a match is found, returns a pointer to + * the bucket containing the matching entry and points 'pos_in_bucket' to the + * index within the bucket. Returns NULL if no matching entry was found. + * + * If 'table_index' is provided, it is set to the index of the table (0 or 1) + * the returned bucket belongs to. */ +static bucket *findBucket(hashtable *ht, uint64_t hash, const void *key, int *pos_in_bucket, int *table_index) { + if (hashtableSize(ht) == 0) return 0; + uint8_t h2 = highBits(hash); + int table; + + /* Do some incremental rehashing. */ + rehashStepOnReadIfNeeded(ht); + + for (table = 0; table <= 1; table++) { + if (ht->used[table] == 0) continue; + size_t mask = expToMask(ht->bucket_exp[table]); + size_t bucket_idx = hash & mask; + /* Skip already rehashed buckets. */ + if (table == 0 && ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) { + continue; + } + bucket *b = &ht->tables[table][bucket_idx]; + do { + /* Find candidate entries with presence flag set and matching h2 hash. */ + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos) && b->hashes[pos] == h2) { + /* It's a candidate. */ + void *entry = b->entries[pos]; + const void *elem_key = entryGetKey(ht, entry); + if (compareKeys(ht, key, elem_key) == 0) { + /* It's a match. */ + assert(pos_in_bucket != NULL); + *pos_in_bucket = pos; + if (table_index) *table_index = table; + return b; + } + } + } + b = bucketNext(b); + } while (b != NULL); + } + return NULL; +} + +/* Move an entry from one bucket to another. */ +static void moveEntry(bucket *bucket_to, int pos_to, bucket *bucket_from, int pos_from) { + assert(!isPositionFilled(bucket_to, pos_to)); + assert(isPositionFilled(bucket_from, pos_from)); + bucket_to->entries[pos_to] = bucket_from->entries[pos_from]; + bucket_to->hashes[pos_to] = bucket_from->hashes[pos_from]; + bucket_to->presence |= (1 << pos_to); + bucket_from->presence &= ~(1 << pos_from); +} + +/* Converts a full bucket b to a chained bucket and adds a new child bucket. */ +static void bucketConvertToChained(hashtable *ht, bucket *b) { + assert(!b->chained); + /* We'll move the last entry from the bucket to the new child bucket. */ + int pos = ENTRIES_PER_BUCKET - 1; + assert(isPositionFilled(b, pos)); + bucket *child = zcalloc(sizeof(bucket)); + if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, sizeof(bucket)); + moveEntry(child, 0, b, pos); + b->chained = 1; + b->entries[pos] = child; +} + +/* Converts a bucket with a next-bucket pointer to one without one. */ +static void bucketConvertToUnchained(bucket *b) { + assert(b->chained); + b->chained = 0; + assert(!isPositionFilled(b, ENTRIES_PER_BUCKET - 1)); +} + +/* If the last bucket is empty, free it. The before-last bucket is converted + * back to an "unchained" bucket, becoming the new last bucket in the chain. If + * there's only one entry left in the last bucket, it's moved to the + * before-last bucket's last position, to take the place of the next-bucket + * link. + * + * This function needs the penultimate 'before_last' bucket in the chain, to be + * able to update it when the last bucket is freed. 
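+ *
+ * For example, given a chain A -> B where the last bucket B holds a single
+ * entry, that entry is moved into A's last slot (the old child pointer slot)
+ * and B is freed, leaving A as an ordinary unchained bucket.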
+ */
+static void pruneLastBucket(hashtable *ht, bucket *before_last, bucket *last, int table_index) {
+    assert(before_last->chained && bucketNext(before_last) == last);
+    assert(!last->chained);
+    assert(last->presence == 0 || __builtin_popcount(last->presence) == 1);
+    bucketConvertToUnchained(before_last);
+    if (last->presence != 0) {
+        /* Move the last remaining entry to the new last position in the
+         * before-last bucket. */
+        int pos_in_last = __builtin_ctz(last->presence);
+        moveEntry(before_last, ENTRIES_PER_BUCKET - 1, last, pos_in_last);
+    }
+    zfree(last);
+    if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket));
+    ht->child_buckets[table_index]--;
+}
+
+/* After removing an entry in a bucket with children, we can fill the hole
+ * with an entry from the end of the bucket chain and potentially free the
+ * last bucket in the chain. */
+static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int table_index) {
+    assert(b->chained && !isPositionFilled(b, pos_in_bucket));
+    /* Find the last bucket */
+    bucket *before_last = b;
+    bucket *last = bucketNext(b);
+    while (last->chained) {
+        before_last = last;
+        last = bucketNext(last);
+    }
+    /* Unless the last bucket is empty, find an entry in the last bucket and
+     * move it to the hole in b. */
+    if (last->presence != 0) {
+        int pos_in_last = __builtin_ctz(last->presence);
+        assert(pos_in_last < ENTRIES_PER_BUCKET && isPositionFilled(last, pos_in_last));
+        moveEntry(b, pos_in_bucket, last, pos_in_last);
+    }
+    /* Free the last bucket if it becomes empty. */
+    if (last->presence == 0 || __builtin_popcount(last->presence) == 1) {
+        pruneLastBucket(ht, before_last, last, table_index);
+    }
+}
+
+/* When entries are deleted while rehashing is paused, they leave empty holes in
+ * the buckets. This function attempts to fill the holes by moving entries from
+ * the end of the bucket chain to fill the holes and free any empty buckets at
+ * the end of the chain. */
+static void compactBucketChain(hashtable *ht, size_t bucket_index, int table_index) {
+    bucket *b = &ht->tables[table_index][bucket_index];
+    while (b->chained) {
+        bucket *next = bucketNext(b);
+        if (next->chained && next->presence == 0) {
+            /* Empty bucket in the middle of the chain. Remove it from the chain. */
+            bucket *next_next = bucketNext(next);
+            b->entries[ENTRIES_PER_BUCKET - 1] = next_next;
+            zfree(next);
+            if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket));
+            ht->child_buckets[table_index]--;
+            continue;
+        }
+
+        if (!next->chained && (next->presence == 0 || __builtin_popcount(next->presence) == 1)) {
+            /* Next is the last bucket and it's empty or has only one entry.
+             * Delete it and turn b into an "unchained" bucket. */
+            pruneLastBucket(ht, b, next, table_index);
+            return;
+        }
+
+        if (__builtin_popcount(b->presence) < ENTRIES_PER_BUCKET - 1) {
+            /* Fill the holes in the bucket. */
+            for (int pos = 0; pos < ENTRIES_PER_BUCKET - 1; pos++) {
+                if (!isPositionFilled(b, pos)) {
+                    fillBucketHole(ht, b, pos, table_index);
+                    if (!b->chained) return;
+                }
+            }
+        }
+
+        /* Bucket is full. Move forward to next bucket. */
+        b = next;
+    }
+}
+
+/* Find an empty position in the table for inserting an entry with the given hash. */
+static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_bucket, int *table_index) {
+    int table = hashtableIsRehashing(ht) ?
1 : 0; + assert(ht->tables[table]); + size_t mask = expToMask(ht->bucket_exp[table]); + size_t bucket_idx = hash & mask; + bucket *b = &ht->tables[table][bucket_idx]; + /* Find bucket that's not full, or create one. */ + while (bucketIsFull(b)) { + if (!b->chained) { + bucketConvertToChained(ht, b); + ht->child_buckets[table]++; + } + b = bucketNext(b); + } + /* Find a free slot in the bucket. There must be at least one. */ + int pos; + for (pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (!isPositionFilled(b, pos)) break; + } + assert(pos < ENTRIES_PER_BUCKET); + assert(pos_in_bucket != NULL); + *pos_in_bucket = pos; + if (table_index) *table_index = table; + return b; +} + +/* Helper to insert an entry. Doesn't check if an entry with a matching key + * already exists. This must be ensured by the caller. */ +static void insert(hashtable *ht, uint64_t hash, void *entry) { + hashtableExpandIfNeeded(ht); + rehashStepOnWriteIfNeeded(ht); + int pos_in_bucket; + int table_index; + bucket *b = findBucketForInsert(ht, hash, &pos_in_bucket, &table_index); + b->entries[pos_in_bucket] = entry; + b->presence |= (1 << pos_in_bucket); + b->hashes[pos_in_bucket] = highBits(hash); + ht->used[table_index]++; +} + +/* A 64-bit fingerprint of some of the state of the hash table. */ +static uint64_t hashtableFingerprint(hashtable *ht) { + uint64_t integers[6], hash = 0; + integers[0] = (uintptr_t)ht->tables[0]; + integers[1] = ht->bucket_exp[0]; + integers[2] = ht->used[0]; + integers[3] = (uintptr_t)ht->tables[1]; + integers[4] = ht->bucket_exp[1]; + integers[5] = ht->used[1]; + + /* Result = hash(hash(hash(int1)+int2)+int3) */ + for (int j = 0; j < 6; j++) { + hash += integers[j]; + /* Tomas Wang's 64 bit integer hash. */ + hash = (~hash) + (hash << 21); /* hash = (hash << 21) - hash - 1; */ + hash = hash ^ (hash >> 24); + hash = (hash + (hash << 3)) + (hash << 8); /* hash * 265 */ + hash = hash ^ (hash >> 14); + hash = (hash + (hash << 2)) + (hash << 4); /* hash * 21 */ + hash = hash ^ (hash >> 28); + hash = hash + (hash << 31); + } + return hash; +} + +/* Scan callback function used by hashtableGetSomeEntries() for sampling entries + * using scan. */ +static void sampleEntriesScanFn(void *privdata, void *entry) { + scan_samples *samples = privdata; + if (samples->seen < samples->size) { + samples->entries[samples->seen++] = entry; + } else { + /* More entries than we wanted. This can happen if there are long + * bucket chains. Replace random entries using reservoir sampling. */ + samples->seen++; + unsigned idx = random() % samples->seen; + if (idx < samples->size) samples->entries[idx] = entry; + } +} + +/* Conversion from internal iterator struct to user-facing opaque type. */ +static inline hashtableIterator *iteratorToOpaque(iter *iterator) { + return (hashtableIterator *)(void *)iterator; +} + +/* Conversion from user-facing opaque iterator type to internal struct. */ +static inline iter *iteratorFromOpaque(hashtableIterator *iterator) { + return (iter *)(void *)iterator; +} + +/* Conversion from user-facing opaque type to internal struct. */ +static inline position *positionFromOpaque(hashtablePosition *p) { + return (position *)(void *)p; +} + +/* Conversion from user-facing opaque type to internal struct. */ +static inline incrementalFind *incrementalFindFromOpaque(hashtableIncrementalFindState *state) { + return (incrementalFind *)(void *)state; +} + +/* --- API functions --- */ + +/* Allocates and initializes a new hashtable specified by the given type. 
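+ *
+ * A minimal creation sketch (hedged; the callbacks shown are hypothetical
+ * application code, not part of this file):
+ *
+ *     static hashtableType my_type = {.hashFunction = myHash,
+ *                                     .keyCompare = myKeyCompare,
+ *                                     .entryDestructor = myFreeEntry};
+ *     hashtable *ht = hashtableCreate(&my_type);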
*/ +hashtable *hashtableCreate(hashtableType *type) { + size_t metasize = type->getMetadataSize ? type->getMetadataSize() : 0; + size_t alloc_size = sizeof(hashtable) + metasize; + hashtable *ht = zmalloc(alloc_size); + if (metasize > 0) { + memset(&ht->metadata, 0, metasize); + } + ht->type = type; + ht->rehash_idx = -1; + ht->pause_rehash = 0; + ht->pause_auto_shrink = 0; + resetTable(ht, 0); + resetTable(ht, 1); + if (type->trackMemUsage) type->trackMemUsage(ht, alloc_size); + return ht; +} + +/* Deletes all the entries. If a callback is provided, it is called from time + * to time to indicate progress. */ +void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)) { + if (hashtableIsRehashing(ht)) { + /* Pretend rehashing completed. */ + if (ht->type->rehashingCompleted) ht->type->rehashingCompleted(ht); + ht->rehash_idx = -1; + } + for (int table_index = 0; table_index <= 1; table_index++) { + if (ht->bucket_exp[table_index] < 0) { + continue; + } + if (ht->used[table_index] > 0) { + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table_index]); idx++) { + if (callback && (idx & 65535) == 0) callback(ht); + bucket *b = &ht->tables[table_index][idx]; + do { + /* Call the destructor with each entry. */ + if (ht->type->entryDestructor != NULL && b->presence != 0) { + for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (isPositionFilled(b, pos)) { + ht->type->entryDestructor(b->entries[pos]); + } + } + } + bucket *next = bucketNext(b); + + /* Free allocated bucket. */ + if (b != &ht->tables[table_index][idx]) { + zfree(b); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket)); + } + } + b = next; + } while (b != NULL); + } + } + zfree(ht->tables[table_index]); + if (ht->type->trackMemUsage) { + ht->type->trackMemUsage(ht, -sizeof(bucket) * numBuckets(ht->bucket_exp[table_index])); + } + resetTable(ht, table_index); + } +} + +/* Deletes all the entries and frees the table. */ +void hashtableRelease(hashtable *ht) { + hashtableEmpty(ht, NULL); + /* Call trackMemUsage before zfree, so trackMemUsage can access ht. */ + if (ht->type->trackMemUsage) { + size_t alloc_size = sizeof(hashtable); + if (ht->type->getMetadataSize) alloc_size += ht->type->getMetadataSize(); + ht->type->trackMemUsage(ht, -alloc_size); + } + zfree(ht); +} + +/* Returns the type of the hashtable. */ +hashtableType *hashtableGetType(hashtable *ht) { + return ht->type; +} + +/* Returns a pointer to the table's metadata (userdata) section. */ +void *hashtableMetadata(hashtable *ht) { + return &ht->metadata; +} + +/* Returns the number of entries stored. */ +size_t hashtableSize(hashtable *ht) { + return ht->used[0] + ht->used[1]; +} + +/* Returns the number of buckets in the hash table itself. */ +size_t hashtableBuckets(hashtable *ht) { + return numBuckets(ht->bucket_exp[0]) + numBuckets(ht->bucket_exp[1]); +} + +/* Returns the number of buckets that have a child bucket. Equivalently, the + * number of allocated buckets, outside of the hash table itself. */ +size_t hashtableChainedBuckets(hashtable *ht, int table) { + return ht->child_buckets[table]; +} + +/* Returns the size of the hashtable structures, in bytes (not including the sizes + * of the entries, if the entries are pointers to allocated objects). */ +size_t hashtableMemUsage(hashtable *ht) { + size_t num_buckets = numBuckets(ht->bucket_exp[0]) + numBuckets(ht->bucket_exp[1]); + num_buckets += ht->child_buckets[0] + ht->child_buckets[1]; + size_t metasize = ht->type->getMetadataSize ? 
ht->type->getMetadataSize() : 0;
+    return sizeof(hashtable) + metasize + sizeof(bucket) * num_buckets;
+}
+
+/* Pauses automatic shrinking. This can be called before deleting a lot of
+ * entries, to prevent automatic shrinking from being triggered multiple times.
+ * Call hashtableResumeAutoShrink afterwards to restore automatic shrinking. */
+void hashtablePauseAutoShrink(hashtable *ht) {
+    ht->pause_auto_shrink++;
+}
+
+/* Re-enables automatic shrinking, after it has been paused. If you have deleted
+ * many entries while automatic shrinking was paused, you may want to call
+ * hashtableShrinkIfNeeded. */
+void hashtableResumeAutoShrink(hashtable *ht) {
+    ht->pause_auto_shrink--;
+    if (ht->pause_auto_shrink == 0) {
+        hashtableShrinkIfNeeded(ht);
+    }
+}
+
+/* Pauses incremental rehashing. When rehashing is paused, bucket chains are not
+ * automatically compacted when entries are deleted. Doing so may leave empty
+ * spaces, "holes", in the bucket chains, which wastes memory. */
+static void hashtablePauseRehashing(hashtable *ht) {
+    ht->pause_rehash++;
+}
+
+/* Resumes incremental rehashing, after pausing it. */
+static void hashtableResumeRehashing(hashtable *ht) {
+    ht->pause_rehash--;
+}
+
+/* Returns 1 if incremental rehashing is paused, 0 if it isn't. */
+int hashtableIsRehashingPaused(hashtable *ht) {
+    return ht->pause_rehash > 0;
+}
+
+/* Returns 1 if incremental rehashing is in progress, 0 otherwise. */
+int hashtableIsRehashing(hashtable *ht) {
+    return ht->rehash_idx != -1;
+}
+
+/* Provides the number of buckets in the old and new tables during rehashing. To
+ * get the sizes in bytes, multiply by HASHTABLE_BUCKET_SIZE. This function can
+ * only be used when rehashing is in progress, and from the rehashingStarted and
+ * rehashingCompleted callbacks. */
+void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size) {
+    assert(hashtableIsRehashing(ht));
+    *from_size = numBuckets(ht->bucket_exp[0]);
+    *to_size = numBuckets(ht->bucket_exp[1]);
+}
+
+/* Performs incremental rehashing for the specified number of microseconds.
+ * Returns the number of rehashed bucket chains. */
+int hashtableRehashMicroseconds(hashtable *ht, uint64_t us) {
+    if (ht->pause_rehash > 0) return 0;
+    if (resize_policy != HASHTABLE_RESIZE_ALLOW) return 0;
+
+    monotime timer;
+    elapsedStart(&timer);
+    int rehashes = 0;
+
+    while (hashtableIsRehashing(ht)) {
+        rehashStep(ht);
+        rehashes++;
+        if (rehashes % 128 == 0 && elapsedUs(timer) >= us) break;
+    }
+    return rehashes;
+}
+
+/* Return 1 if expand was performed; 0 otherwise. */
+int hashtableExpand(hashtable *ht, size_t size) {
+    return expand(ht, size, NULL);
+}
+
+/* Returns 1 if expand was performed or if expand is not needed. Returns 0 if
+ * expand failed due to memory allocation failure. */
+int hashtableTryExpand(hashtable *ht, size_t size) {
+    int malloc_failed = 0;
+    return expand(ht, size, &malloc_failed) || !malloc_failed;
+}
+
+/* Expanding is done automatically on insertion, but less eagerly if resize
+ * policy is set to AVOID or FORBID. After restoring resize policy to ALLOW, you
+ * may want to call hashtableExpandIfNeeded. Returns 1 if expanding, 0 if not
+ * expanding. */
+int hashtableExpandIfNeeded(hashtable *ht) {
+    size_t min_capacity = ht->used[0] + ht->used[1] + 1;
+    size_t num_buckets = numBuckets(ht->bucket_exp[hashtableIsRehashing(ht) ? 1 : 0]);
+    size_t current_capacity = num_buckets * ENTRIES_PER_BUCKET;
+    unsigned max_fill_percent = resize_policy == HASHTABLE_RESIZE_AVOID ? MAX_FILL_PERCENT_HARD : MAX_FILL_PERCENT_SOFT;
+    if (min_capacity * 100 <= current_capacity * max_fill_percent) {
+        return 0;
+    }
+    return resize(ht, min_capacity, NULL);
+}
+
+/* Shrinking is done automatically on deletion, but less eagerly if resize
+ * policy is set to AVOID and not at all if set to FORBID. After restoring
+ * resize policy to ALLOW, you may want to call hashtableShrinkIfNeeded. */
+int hashtableShrinkIfNeeded(hashtable *ht) {
+    /* Don't shrink if rehashing is already in progress. */
+    if (hashtableIsRehashing(ht) || resize_policy == HASHTABLE_RESIZE_FORBID) {
+        return 0;
+    }
+    size_t current_capacity = numBuckets(ht->bucket_exp[0]) * ENTRIES_PER_BUCKET;
+    unsigned min_fill_percent = resize_policy == HASHTABLE_RESIZE_AVOID ? MIN_FILL_PERCENT_HARD : MIN_FILL_PERCENT_SOFT;
+    if (ht->used[0] * 100 > current_capacity * min_fill_percent) {
+        return 0;
+    }
+    return resize(ht, ht->used[0], NULL);
+}
+
+/* Defragment the main allocations of the hashtable by reallocating them. The
+ * provided defragfn callback should either return NULL (if reallocation is not
+ * necessary) or reallocate the memory like realloc() would do.
+ *
+ * Note that this doesn't cover allocated chained buckets. To defragment them,
+ * you need to do a scan using hashtableScanDefrag with the same 'defragfn'.
+ *
+ * Returns NULL if the hashtable's top-level struct hasn't been reallocated.
+ * Returns non-NULL if the top-level allocation has been reallocated, making
+ * the 'ht' pointer invalid. */
+hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) {
+    /* The hashtable struct */
+    hashtable *ht1 = defragfn(ht);
+    if (ht1 != NULL) ht = ht1;
+    /* The tables */
+    for (int i = 0; i <= 1; i++) {
+        if (ht->tables[i] == NULL) continue;
+        void *table = defragfn(ht->tables[i]);
+        if (table != NULL) ht->tables[i] = table;
+    }
+    return ht1;
+}
+
+/* Returns 1 if an entry was found matching the key. Also points *found to it,
+ * if found is provided. Returns 0 if no matching entry was found. */
+int hashtableFind(hashtable *ht, const void *key, void **found) {
+    if (hashtableSize(ht) == 0) return 0;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    if (b) {
+        if (found) *found = b->entries[pos_in_bucket];
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/* Returns a pointer to where an entry is stored within the hash table, or
+ * NULL if not found. To get the entry, dereference the returned pointer. The
+ * pointer can be used to replace the entry with an equivalent entry (same
+ * key, same hash value), but note that the pointer may be invalidated by future
+ * accesses to the hash table due to incremental rehashing, so use with care. */
+void **hashtableFindRef(hashtable *ht, const void *key) {
+    if (hashtableSize(ht) == 0) return NULL;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    return b ? &b->entries[pos_in_bucket] : NULL;
+}
+
+/* Adds an entry. Returns 1 on success. Returns 0 if there was already an entry
+ * with the same key. */
+int hashtableAdd(hashtable *ht, void *entry) {
+    return hashtableAddOrFind(ht, entry, NULL);
+}
+
+/* Adds an entry and returns 1 on success. Returns 0 if there was already an
+ * entry with the same key and, if an 'existing' pointer is provided, it is
+ * pointed to the existing entry.
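+ *
+ * A minimal usage sketch (hedged; 'makeEntry' is a hypothetical
+ * application-side helper, not part of this file):
+ *
+ *     void *existing;
+ *     void *entry = makeEntry(key, value);
+ *     if (!hashtableAddOrFind(ht, entry, &existing)) {
+ *         // Duplicate key: 'existing' points to the stored entry and the
+ *         // caller still owns (and must free) its own 'entry'.
+ *     }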
+ */
+int hashtableAddOrFind(hashtable *ht, void *entry, void **existing) {
+    const void *key = entryGetKey(ht, entry);
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    if (b != NULL) {
+        if (existing) *existing = b->entries[pos_in_bucket];
+        return 0;
+    } else {
+        insert(ht, hash, entry);
+        return 1;
+    }
+}
+
+/* Finds a position within the hashtable where an entry with the
+ * given key should be inserted using hashtableInsertAtPosition. This is the first
+ * phase in a two-phase insert operation and it can be used if you want to avoid
+ * creating an entry before you know if it already exists in the table or not,
+ * and without a separate lookup to the table.
+ *
+ * The function returns 1 if a position was found where an entry with the
+ * given key can be inserted. The position is stored in the provided 'position'
+ * argument, which can be stack-allocated. This position should then be used in
+ * a call to hashtableInsertAtPosition.
+ *
+ * If the function returns 0, it means that an entry with the given key
+ * already exists in the table. If an 'existing' pointer is provided, it is
+ * pointed to the existing entry with the matching key.
+ *
+ * Example:
+ *
+ *     hashtablePosition position;
+ *     void *existing;
+ *     if (hashtableFindPositionForInsert(ht, key, &position, &existing)) {
+ *         // Position found where we can insert an entry with this key.
+ *         void *entry = createNewEntryWithKeyAndValue(key, some_value);
+ *         hashtableInsertAtPosition(ht, entry, &position);
+ *     } else {
+ *         // Existing entry found with the matching key.
+ *         doSomethingWithExistingEntry(existing);
+ *     }
+ */
+int hashtableFindPositionForInsert(hashtable *ht, void *key, hashtablePosition *pos, void **existing) {
+    position *p = positionFromOpaque(pos);
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket, table_index;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, NULL);
+    if (b != NULL) {
+        if (existing) *existing = b->entries[pos_in_bucket];
+        return 0;
+    } else {
+        hashtableExpandIfNeeded(ht);
+        rehashStepOnWriteIfNeeded(ht);
+        b = findBucketForInsert(ht, hash, &pos_in_bucket, &table_index);
+        assert(!isPositionFilled(b, pos_in_bucket));
+
+        /* Store the hash bits now, so we don't need to compute the hash again
+         * when hashtableInsertAtPosition() is called. */
+        b->hashes[pos_in_bucket] = highBits(hash);
+
+        /* Populate position struct. */
+        assert(p != NULL);
+        p->bucket = b;
+        p->pos_in_bucket = pos_in_bucket;
+        p->table_index = table_index;
+        return 1;
+    }
+}
+
+/* Inserts an entry at the position previously acquired using
+ * hashtableFindPositionForInsert(). The entry must match the key provided when
+ * finding the position. You must not access the hashtable in any way between
+ * hashtableFindPositionForInsert() and hashtableInsertAtPosition(), since even a
+ * hashtableFind() may cause incremental rehashing to move entries in memory. */
+void hashtableInsertAtPosition(hashtable *ht, void *entry, hashtablePosition *pos) {
+    position *p = positionFromOpaque(pos);
+    bucket *b = p->bucket;
+    int pos_in_bucket = p->pos_in_bucket;
+    int table_index = p->table_index;
+    assert(!isPositionFilled(b, pos_in_bucket));
+    b->presence |= (1 << pos_in_bucket);
+    b->entries[pos_in_bucket] = entry;
+    ht->used[table_index]++;
+    /* Hash bits are already set by hashtableFindPositionForInsert. */
+}
+
+/* Removes the entry with the matching key and returns it. The entry
+ * destructor is not called. Returns 1 and points 'popped' to the entry if a
+ * matching entry was found. Returns 0 if no matching entry was found. */
+int hashtablePop(hashtable *ht, const void *key, void **popped) {
+    if (hashtableSize(ht) == 0) return 0;
+    uint64_t hash = hashKey(ht, key);
+    int pos_in_bucket = 0;
+    int table_index = 0;
+    bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table_index);
+    if (b) {
+        if (popped) *popped = b->entries[pos_in_bucket];
+        b->presence &= ~(1 << pos_in_bucket);
+        ht->used[table_index]--;
+        if (b->chained && !hashtableIsRehashingPaused(ht)) {
+            /* Rehashing is paused while iterating and when a scan callback is
+             * running. In those cases, we do the compaction in the scan and
+             * iterator code instead. */
+            fillBucketHole(ht, b, pos_in_bucket, table_index);
+        }
+        hashtableShrinkIfNeeded(ht);
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/* Deletes the entry with the matching key. Returns 1 if an entry was
+ * deleted, 0 if no matching entry was found. */
+int hashtableDelete(hashtable *ht, const void *key) {
+    void *entry;
+    if (hashtablePop(ht, key, &entry)) {
+        freeEntry(ht, entry);
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/* When an entry has been reallocated, it can be replaced in a hash table
+ * without dereferencing the old pointer which may no longer be valid. The new
+ * entry with the same key and hash is used for finding the old entry and
+ * replacing it with the new entry. Returns 1 if the entry was replaced and 0 if
+ * the entry wasn't found. */
+int hashtableReplaceReallocatedEntry(hashtable *ht, const void *old_entry, void *new_entry) {
+    const void *key = entryGetKey(ht, new_entry);
+    uint64_t hash = hashKey(ht, key);
+    uint8_t h2 = highBits(hash);
+    for (int table = 0; table <= 1; table++) {
+        if (ht->used[table] == 0) continue;
+        size_t mask = expToMask(ht->bucket_exp[table]);
+        size_t bucket_idx = hash & mask;
+        /* Skip already rehashed buckets. */
+        if (table == 0 && ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) {
+            continue;
+        }
+        bucket *b = &ht->tables[table][bucket_idx];
+        do {
+            for (int pos = 0; pos < numBucketPositions(b); pos++) {
+                if (isPositionFilled(b, pos) && b->hashes[pos] == h2 && b->entries[pos] == old_entry) {
+                    /* It's a match. */
+                    b->entries[pos] = new_entry;
+                    return 1;
+                }
+            }
+            b = bucketNext(b);
+        } while (b != NULL);
+    }
+    return 0;
+}
+
+/* Two-phase pop: Look up an entry, do something with it, then delete it
+ * without searching the hash table again.
+ *
+ * hashtableTwoPhasePopFindRef finds an entry in the table and also the position
+ * of the entry within the table, so that it can be deleted without looking it
+ * up in the table again. The function returns a pointer to the entry pointer
+ * within the hash table, if an entry with a matching key is found, and NULL
+ * otherwise.
+ *
+ * If non-NULL is returned, call 'hashtableTwoPhasePopDelete' with the returned
+ * 'position' afterwards to actually delete the entry from the table. These two
+ * functions are designed to be used as a pair. `hashtableTwoPhasePopFindRef`
+ * pauses rehashing and `hashtableTwoPhasePopDelete` resumes rehashing.
+ *
+ * While hashtablePop finds and returns an entry, the purpose of two-phase pop
+ * is to provide an optimized equivalent of hashtableFindRef followed by
+ * hashtableDelete, where the first call finds the entry but doesn't delete it
+ * from the hash table and the latter doesn't need to look up the entry in the
+ * hash table again.
+ * + * Example: + * + * hashtablePosition position; + * void **ref = hashtableTwoPhasePopFindRef(ht, key, &position) + * if (ref != NULL) { + * void *entry = *ref; + * // do something with the entry, then... + * hashtableTwoPhasePopDelete(ht, &position); + * } + */ + +/* Like hashtableTwoPhasePopFind, but returns a pointer to where the entry is + * stored in the table, or NULL if no matching entry is found. The 'position' + * argument is populated with a representation of where the entry is stored. + * This must be provided to hashtableTwoPhasePopDelete to complete the + * operation. */ +void **hashtableTwoPhasePopFindRef(hashtable *ht, const void *key, hashtablePosition *pos) { + position *p = positionFromOpaque(pos); + if (hashtableSize(ht) == 0) return NULL; + uint64_t hash = hashKey(ht, key); + int pos_in_bucket = 0; + int table_index = 0; + bucket *b = findBucket(ht, hash, key, &pos_in_bucket, &table_index); + if (b) { + hashtablePauseRehashing(ht); + + /* Store position. */ + assert(p != NULL); + p->bucket = b; + p->pos_in_bucket = pos_in_bucket; + p->table_index = table_index; + return &b->entries[pos_in_bucket]; + } else { + return NULL; + } +} + +/* Clears the position of the entry in the hashtable and resumes rehashing. The + * entry destructor is NOT called. The position is acquired using a preceding + * call to hashtableTwoPhasePopFindRef(). */ +void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *pos) { + /* Read position. */ + position *p = positionFromOpaque(pos); + bucket *b = p->bucket; + int pos_in_bucket = p->pos_in_bucket; + int table_index = p->table_index; + + /* Delete the entry and resume rehashing. */ + assert(isPositionFilled(b, pos_in_bucket)); + b->presence &= ~(1 << pos_in_bucket); + ht->used[table_index]--; + hashtableShrinkIfNeeded(ht); + hashtableResumeRehashing(ht); + if (b->chained && !hashtableIsRehashingPaused(ht)) { + /* Rehashing paused also means bucket chain compaction paused. It is + * paused while iterating and when a scan callback is running, to be + * able to live up to the scan and iterator guarantees. In those cases, + * we do the compaction in the scan and iterator code instead. */ + fillBucketHole(ht, b, pos_in_bucket, table_index); + } +} + +/* Initializes the state for an incremental find operation. + * + * Incremental find can be used to speed up the loading of multiple objects by + * utilizing CPU branch predictions to parallelize memory accesses. Initialize + * the data for a number of incremental find operations. Then call + * hashtableIncrementalFindStep on them in a round-robin order until all of them + * are complete. Finally, if necessary, call hashtableIncrementalFindGetResult. + */ +void hashtableIncrementalFindInit(hashtableIncrementalFindState *state, hashtable *ht, const void *key) { + incrementalFind *data = incrementalFindFromOpaque(state); + if (hashtableSize(ht) == 0) { + data->state = HASHTABLE_NOT_FOUND; + } else { + data->state = HASHTABLE_NEXT_BUCKET; + data->bucket = NULL; + data->hashtable = ht; + data->key = key; + data->hash = hashKey(ht, key); + } +} + +/* Returns 1 if more work is needed, 0 when done. Call this function repeatedly + * until it returns 0. Then use hashtableIncrementalFindGetResult to fetch the + * result. */ +int hashtableIncrementalFindStep(hashtableIncrementalFindState *state) { + incrementalFind *data = incrementalFindFromOpaque(state); + switch (data->state) { + case HASHTABLE_CHECK_ENTRY: + /* Current entry is prefetched. Now check if it's a match. 
*/ + { + hashtable *ht = data->hashtable; + void *entry = data->bucket->entries[data->pos]; + const void *elem_key = entryGetKey(ht, entry); + if (compareKeys(ht, data->key, elem_key) == 0) { + /* It's a match. */ + data->state = HASHTABLE_FOUND; + return 0; + } + /* No match. Look for next candidate entry in the bucket. */ + data->pos++; + } + /* fall through */ + case HASHTABLE_NEXT_ENTRY: + /* Current bucket is prefetched. Prefetch next potential + * matching entry in the current bucket. */ + if (data->bucket->presence != 0 && data->pos < numBucketPositions(data->bucket)) { + bucket *b = data->bucket; + uint8_t h2 = highBits(data->hash); + for (int pos = data->pos; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos) && b->hashes[pos] == h2) { + /* It's a candidate. */ + valkey_prefetch(b->entries[pos]); + data->pos = pos; + data->state = HASHTABLE_CHECK_ENTRY; + return 1; + } + } + } + /* fall through */ + case HASHTABLE_NEXT_BUCKET: + /* Current bucket is prefetched, if any. Find the next bucket in the + * chain, or in next table, and prefetch it. */ + { + hashtable *ht = data->hashtable; + if (data->bucket == NULL) { + data->table = 0; + size_t mask = expToMask(ht->bucket_exp[0]); + size_t bucket_idx = data->hash & mask; + if (ht->rehash_idx >= 0 && bucket_idx < (size_t)ht->rehash_idx) { + /* Skip already rehashed bucket in table 0. */ + data->table = 1; + mask = expToMask(ht->bucket_exp[1]); + bucket_idx = data->hash & mask; + } + data->bucket = &ht->tables[data->table][bucket_idx]; + } else if (bucketNext(data->bucket) != NULL) { + data->bucket = bucketNext(data->bucket); + } else if (data->table == 0 && ht->rehash_idx >= 0) { + data->table = 1; + size_t mask = expToMask(ht->bucket_exp[1]); + size_t bucket_idx = data->hash & mask; + data->bucket = &ht->tables[data->table][bucket_idx]; + } else { + /* No more tables. */ + data->state = HASHTABLE_NOT_FOUND; + return 0; + } + valkey_prefetch(data->bucket); + data->state = HASHTABLE_NEXT_ENTRY; + data->pos = 0; + } + return 1; + case HASHTABLE_FOUND: + return 0; + case HASHTABLE_NOT_FOUND: + return 0; + } + assert(0); +} + +/* Call only when hashtableIncrementalFindStep has returned 0. + * + * Returns 1 and points 'found' to the entry if an entry was found, 0 if it + * was not found. */ +int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void **found) { + incrementalFind *data = incrementalFindFromOpaque(state); + if (data->state == HASHTABLE_FOUND) { + if (found) *found = data->bucket->entries[data->pos]; + return 1; + } else { + assert(data->state == HASHTABLE_NOT_FOUND); + return 0; + } +} + +/* --- Scan --- */ + +/* Scan is a stateless iterator. It works with a cursor that is returned to the + * caller and which should be provided to the next call to continue scanning. + * The hash table can be modified in any way between two scan calls. The scan + * still continues iterating where it was. + * + * A full scan is performed like this: Start with a cursor of 0. The scan + * callback is invoked for each entry scanned and a new cursor is returned. Next + * time, call this function with the new cursor. Continue until the function + * returns 0. + * + * We say that an entry is *emitted* when it's passed to the scan callback. + * + * Scan guarantees: + * + * - An entry that is present in the hash table during an entire full scan will + * be returned (emitted) at least once. (Most of the time exactly once, but + * sometimes twice.) 
+ * + * - An entry that is inserted or deleted during a full scan may or may not be + * returned during the scan. + * + * Scan callback rules: + * + * - The scan callback may delete the entry that was passed to it. + * + * - It may not delete other entries, because that may lead to internal + * fragmentation in the form of "holes" in the bucket chains. + * + * - The scan callback may insert or replace any entry. + */ +size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata) { + return hashtableScanDefrag(ht, cursor, fn, privdata, NULL, 0); +} + +/* Like hashtableScan, but additionally reallocates the memory used by the dict + * entries using the provided allocation function. This feature was added for + * the active defrag feature. + * + * The 'defragfn' callback is called with a pointer to memory that callback can + * reallocate. The callbacks should return a new memory address or NULL, where + * NULL means that no reallocation happened and the old memory is still valid. + * The 'defragfn' can be NULL if you don't need defrag reallocation. + * + * The 'flags' argument can be used to tweak the behaviour. It's a bitwise-or + * (zero means no flags) of the following: + * + * - HASHTABLE_SCAN_EMIT_REF: Emit a pointer to the entry's location in the + * table to the scan function instead of the actual entry. This can be used + * for advanced things like reallocating the memory of an entry (for the + * purpose of defragmentation) and updating the pointer to the entry inside + * the hash table. + */ +size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags) { + if (hashtableSize(ht) == 0) return 0; + + /* Prevent entries from being moved around during the scan call, as a + * side-effect of the scan callback. */ + hashtablePauseRehashing(ht); + + /* Flags. */ + int emit_ref = (flags & HASHTABLE_SCAN_EMIT_REF); + + if (!hashtableIsRehashing(ht)) { + /* Emit entries at the cursor index. */ + size_t mask = expToMask(ht->bucket_exp[0]); + bucket *b = &ht->tables[0][cursor & mask]; + do { + if (b->presence != 0) { + int pos; + for (pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (isPositionFilled(b, pos)) { + void *emit = emit_ref ? &b->entries[pos] : b->entries[pos]; + fn(privdata, emit); + } + } + } + bucket *next = bucketNext(b); + if (next != NULL && defragfn != NULL) { + next = bucketDefrag(b, next, defragfn); + } + b = next; + } while (b != NULL); + + /* Advance cursor. */ + cursor = nextCursor(cursor, mask); + } else { + int table_small, table_large; + if (ht->bucket_exp[0] <= ht->bucket_exp[1]) { + table_small = 0; + table_large = 1; + } else { + table_small = 1; + table_large = 0; + } + + size_t mask_small = expToMask(ht->bucket_exp[table_small]); + size_t mask_large = expToMask(ht->bucket_exp[table_large]); + + /* Emit entries in the smaller table, if this index hasn't already been + * rehashed. */ + size_t idx = cursor & mask_small; + if (table_small == 1 || ht->rehash_idx == -1 || idx >= (size_t)ht->rehash_idx) { + size_t used_before = ht->used[table_small]; + bucket *b = &ht->tables[table_small][idx]; + do { + if (b->presence) { + for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + if (isPositionFilled(b, pos)) { + void *emit = emit_ref ? 
&b->entries[pos] : b->entries[pos];
+                            fn(privdata, emit);
+                        }
+                    }
+                }
+                bucket *next = bucketNext(b);
+                if (next != NULL && defragfn != NULL) {
+                    next = bucketDefrag(b, next, defragfn);
+                }
+                b = next;
+            } while (b != NULL);
+            /* If any entries were deleted, fill the holes. */
+            if (ht->used[table_small] < used_before) {
+                compactBucketChain(ht, idx, table_small);
+            }
+        }
+
+        /* Iterate over indices in larger table that are the expansion of the
+         * index pointed to by the cursor in the smaller table. */
+        do {
+            /* Emit entries in the larger table at this cursor, if this index
+             * hasn't already been rehashed. */
+            idx = cursor & mask_large;
+            if (table_large == 1 || ht->rehash_idx == -1 || idx >= (size_t)ht->rehash_idx) {
+                size_t used_before = ht->used[table_large];
+                bucket *b = &ht->tables[table_large][idx];
+                do {
+                    if (b->presence) {
+                        for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) {
+                            if (isPositionFilled(b, pos)) {
+                                void *emit = emit_ref ? &b->entries[pos] : b->entries[pos];
+                                fn(privdata, emit);
+                            }
+                        }
+                    }
+                    bucket *next = bucketNext(b);
+                    if (next != NULL && defragfn != NULL) {
+                        next = bucketDefrag(b, next, defragfn);
+                    }
+                    b = next;
+                } while (b != NULL);
+                /* If any entries were deleted, fill the holes. */
+                if (ht->used[table_large] < used_before) {
+                    compactBucketChain(ht, idx, table_large);
+                }
+            }
+
+            /* Increment the reverse cursor not covered by the smaller mask. */
+            cursor = nextCursor(cursor, mask_large);
+
+            /* Continue while bits covered by mask difference is non-zero. */
+        } while (cursor & (mask_small ^ mask_large));
+    }
+    hashtableResumeRehashing(ht);
+    return cursor;
+}
+
+/* --- Iterator --- */
+
+/* Initialize an iterator that is not allowed to insert, delete or even look up
+ * entries in the hashtable, because such operations can trigger incremental
+ * rehashing which moves entries around and confuses the iterator. Only
+ * hashtableNext is allowed. Each entry is returned exactly once. Call
+ * hashtableResetIterator when you are done. See also
+ * hashtableInitSafeIterator. */
+void hashtableInitIterator(hashtableIterator *iterator, hashtable *ht) {
+    iter *iter;
+    iter = iteratorFromOpaque(iterator);
+    iter->hashtable = ht;
+    iter->table = 0;
+    iter->index = -1;
+    iter->safe = 0;
+}
+
+/* Initialize a safe iterator, which is allowed to modify the hash table while
+ * iterating. It pauses incremental rehashing to prevent entries from moving
+ * around. Call hashtableNext to fetch each entry. You must call
+ * hashtableResetIterator when you are done with a safe iterator.
+ *
+ * It's allowed to insert and replace entries. Deleting entries is only allowed
+ * for the entry that was just returned by hashtableNext. Deleting other entries
+ * is possible, but doing so can cause internal fragmentation, so don't.
+ *
+ * Guarantees:
+ *
+ * - Entries that are in the hash table for the entire iteration are returned
+ *   exactly once.
+ *
+ * - Entries that are deleted or replaced after they have been returned are not
+ *   returned again.
+ *
+ * - Entries that are replaced before they've been returned by the iterator will
+ *   be returned.
+ *
+ * - Entries that are inserted during the iteration may or may not be returned
+ *   by the iterator.
+ */
+void hashtableInitSafeIterator(hashtableIterator *iterator, hashtable *ht) {
+    hashtableInitIterator(iterator, ht);
+    iter *iter = iteratorFromOpaque(iterator);
+    iter->safe = 1;
+}
+
+/* Resets a stack-allocated iterator.
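+ *
+ * A typical iteration sketch (hedged; assumes 'ht' was created earlier):
+ *
+ *     hashtableIterator it;
+ *     hashtableInitIterator(&it, ht);
+ *     void *entry;
+ *     while (hashtableNext(&it, &entry)) {
+ *         // Use 'entry'. Don't modify the table with an unsafe iterator.
+ *     }
+ *     hashtableResetIterator(&it);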
*/ +void hashtableResetIterator(hashtableIterator *iterator) { + iter *iter = iteratorFromOpaque(iterator); + if (!(iter->index == -1 && iter->table == 0)) { + if (iter->safe) { + hashtableResumeRehashing(iter->hashtable); + assert(iter->hashtable->pause_rehash >= 0); + } else { + assert(iter->fingerprint == hashtableFingerprint(iter->hashtable)); + } + } +} + +/* Allocates and initializes an iterator. */ +hashtableIterator *hashtableCreateIterator(hashtable *ht) { + iter *iter = zmalloc(sizeof(*iter)); + hashtableIterator *opaque = iteratorToOpaque(iter); + hashtableInitIterator(opaque, ht); + return opaque; +} + +/* Allocates and initializes a safe iterator. */ +hashtableIterator *hashtableCreateSafeIterator(hashtable *ht) { + hashtableIterator *iterator = hashtableCreateIterator(ht); + iter *iter = iteratorFromOpaque(iterator); + iter->safe = 1; + return iterator; +} + +/* Resets and frees the memory of an allocated iterator, i.e. one created using + * hashtableCreate(Safe)Iterator. */ +void hashtableReleaseIterator(hashtableIterator *iterator) { + hashtableResetIterator(iterator); + iter *iter = iteratorFromOpaque(iterator); + zfree(iter); +} + +/* Points elemptr to the next entry and returns 1 if there is a next entry. + * Returns 0 if there are no more entries. */ +int hashtableNext(hashtableIterator *iterator, void **elemptr) { + iter *iter = iteratorFromOpaque(iterator); + while (1) { + if (iter->index == -1 && iter->table == 0) { + /* It's the first call to next. */ + if (iter->safe) { + hashtablePauseRehashing(iter->hashtable); + iter->last_seen_size = iter->hashtable->used[iter->table]; + } else { + iter->fingerprint = hashtableFingerprint(iter->hashtable); + } + if (iter->hashtable->tables[0] == NULL) { + /* Empty hashtable. We're done. */ + break; + } + iter->index = 0; + /* Skip already rehashed buckets. */ + if (hashtableIsRehashing(iter->hashtable)) { + iter->index = iter->hashtable->rehash_idx; + } + iter->bucket = &iter->hashtable->tables[iter->table][iter->index]; + iter->pos_in_bucket = 0; + } else { + /* Advance to the next position within the bucket, or to the next + * child bucket in a chain, or to the next bucket index, or to the + * next table. */ + iter->pos_in_bucket++; + if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { + iter->pos_in_bucket = 0; + iter->bucket = bucketNext(iter->bucket); + } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { + /* Bucket index done. */ + if (iter->safe) { + /* If entries in this bucket chain have been deleted, + * they've left empty spaces in the buckets. The chain is + * not automatically compacted when rehashing is paused. If + * this iterator is the only reason for pausing rehashing, + * we can do the compaction now when we're done with a + * bucket chain, before we move on to the next index. */ + if (iter->hashtable->pause_rehash == 1 && + iter->hashtable->used[iter->table] < iter->last_seen_size) { + compactBucketChain(iter->hashtable, iter->index, iter->table); + } + iter->last_seen_size = iter->hashtable->used[iter->table]; + } + iter->pos_in_bucket = 0; + iter->index++; + if ((size_t)iter->index >= numBuckets(iter->hashtable->bucket_exp[iter->table])) { + if (hashtableIsRehashing(iter->hashtable) && iter->table == 0) { + iter->index = 0; + iter->table++; + } else { + /* Done. */ + break; + } + } + iter->bucket = &iter->hashtable->tables[iter->table][iter->index]; + } + } + bucket *b = iter->bucket; + if (!isPositionFilled(b, iter->pos_in_bucket)) { + /* No entry here. 
*/ + continue; + } + /* Return the entry at this position. */ + if (elemptr) { + *elemptr = b->entries[iter->pos_in_bucket]; + } + return 1; + } + return 0; +} + +/* --- Random entries --- */ + +/* Points 'found' to a random entry in the hash table and returns 1. Returns 0 + * if the table is empty. */ +int hashtableRandomEntry(hashtable *ht, void **found) { + void *samples[WEAK_RANDOM_SAMPLE_SIZE]; + unsigned count = hashtableSampleEntries(ht, &samples[0], WEAK_RANDOM_SAMPLE_SIZE); + if (count == 0) return 0; + unsigned idx = random() % count; + *found = samples[idx]; + return 1; +} + +/* Points 'found' to a random entry in the hash table and returns 1. Returns 0 + * if the table is empty. This one is more fair than hashtableRandomEntry(). */ +int hashtableFairRandomEntry(hashtable *ht, void **found) { + void *samples[FAIR_RANDOM_SAMPLE_SIZE]; + unsigned count = hashtableSampleEntries(ht, &samples[0], FAIR_RANDOM_SAMPLE_SIZE); + if (count == 0) return 0; + unsigned idx = random() % count; + *found = samples[idx]; + return 1; +} + +/* This function samples a sequence of entries starting at a random location in + * the hash table. + * + * The sampled entries are stored in the array 'dst' which must have space for + * at least 'count' entries. + * + * The function returns the number of sampled entries, which is 'count' except + * if 'count' is greater than the total number of entries in the hash table. */ +unsigned hashtableSampleEntries(hashtable *ht, void **dst, unsigned count) { + /* Adjust count. */ + if (count > hashtableSize(ht)) count = hashtableSize(ht); + scan_samples samples; + samples.size = count; + samples.seen = 0; + samples.entries = dst; + size_t cursor = randomSizeT(); + while (samples.seen < count) { + cursor = hashtableScan(ht, cursor, sampleEntriesScanFn, &samples); + } + rehashStepOnReadIfNeeded(ht); + /* samples.seen is the number of entries scanned. It may be greater than + * the requested count and the size of the dst array. */ + return samples.seen <= count ? samples.seen : count; +} + +/* --- Stats --- */ + +#define HASHTABLE_STATS_VECTLEN 50 +void hashtableFreeStats(hashtableStats *stats) { + zfree(stats->clvector); + zfree(stats); +} + +void hashtableCombineStats(hashtableStats *from, hashtableStats *into) { + into->toplevel_buckets += from->toplevel_buckets; + into->child_buckets += from->child_buckets; + into->max_chain_len = (from->max_chain_len > into->max_chain_len) ? from->max_chain_len : into->max_chain_len; + into->size += from->size; + into->used += from->used; + for (int i = 0; i < HASHTABLE_STATS_VECTLEN; i++) { + into->clvector[i] += from->clvector[i]; + } +} + +hashtableStats *hashtableGetStatsHt(hashtable *ht, int table_index, int full) { + unsigned long *clvector = zcalloc(sizeof(unsigned long) * HASHTABLE_STATS_VECTLEN); + hashtableStats *stats = zcalloc(sizeof(hashtableStats)); + stats->table_index = table_index; + stats->clvector = clvector; + stats->toplevel_buckets = numBuckets(ht->bucket_exp[table_index]); + stats->child_buckets = ht->child_buckets[table_index]; + stats->size = numBuckets(ht->bucket_exp[table_index]) * ENTRIES_PER_BUCKET; + stats->used = ht->used[table_index]; + if (!full) return stats; + /* Compute stats about bucket chain lengths. 
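+     * Each top-level bucket contributes one sample to the chain length
+     * distribution; chains of HASHTABLE_STATS_VECTLEN or longer are
+     * clamped into the last slot of the vector, as the loop below does.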
*/ + stats->max_chain_len = 0; + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table_index]); idx++) { + bucket *b = &ht->tables[table_index][idx]; + unsigned long chainlen = 0; + while (b->chained) { + chainlen++; + b = bucketNext(b); + } + if (chainlen > stats->max_chain_len) { + stats->max_chain_len = chainlen; + } + if (chainlen >= HASHTABLE_STATS_VECTLEN) { + chainlen = HASHTABLE_STATS_VECTLEN - 1; + } + clvector[chainlen]++; + } + return stats; +} + +/* Generates human readable stats. */ +size_t hashtableGetStatsMsg(char *buf, size_t bufsize, hashtableStats *stats, int full) { + if (stats->used == 0) { + return snprintf(buf, bufsize, + "Hash table %d stats (%s):\n" + "No stats available for empty hash tables\n", + stats->table_index, + (stats->table_index == 0) ? "main hash table" : "rehashing target"); + } + size_t l = 0; + l += snprintf(buf + l, bufsize - l, + "Hash table %d stats (%s):\n" + " table size: %lu\n" + " number of entries: %lu\n", + stats->table_index, + (stats->table_index == 0) ? "main hash table" : "rehashing target", stats->size, + stats->used); + if (full) { + l += snprintf(buf + l, bufsize - l, + " top-level buckets: %lu\n" + " child buckets: %lu\n" + " max chain length: %lu\n" + " avg chain length: %.02f\n" + " chain length distribution:\n", + stats->toplevel_buckets, + stats->child_buckets, + stats->max_chain_len, + (float)stats->child_buckets / stats->toplevel_buckets); + for (unsigned long i = 0; i < HASHTABLE_STATS_VECTLEN - 1; i++) { + if (stats->clvector[i] == 0) continue; + if (l >= bufsize) break; + l += snprintf(buf + l, bufsize - l, " %ld: %ld (%.02f%%)\n", i, stats->clvector[i], + ((float)stats->clvector[i] / stats->toplevel_buckets) * 100); + } + } + + /* Make sure there is a NULL term at the end. */ + buf[bufsize - 1] = '\0'; + /* Unlike snprintf(), return the number of characters actually written. */ + return strlen(buf); +} + +void hashtableGetStats(char *buf, size_t bufsize, hashtable *ht, int full) { + size_t l; + char *orig_buf = buf; + size_t orig_bufsize = bufsize; + + hashtableStats *mainHtStats = hashtableGetStatsHt(ht, 0, full); + l = hashtableGetStatsMsg(buf, bufsize, mainHtStats, full); + hashtableFreeStats(mainHtStats); + buf += l; + bufsize -= l; + if (hashtableIsRehashing(ht) && bufsize > 0) { + hashtableStats *rehashHtStats = hashtableGetStatsHt(ht, 1, full); + hashtableGetStatsMsg(buf, bufsize, rehashHtStats, full); + hashtableFreeStats(rehashHtStats); + } + /* Make sure there is a NULL term at the end. */ + orig_buf[orig_bufsize - 1] = '\0'; +} + +/* --- DEBUG --- */ + +void hashtableDump(hashtable *ht) { + for (int table = 0; table <= 1; table++) { + printf("Table %d, used %zu, exp %d, top-level buckets %zu, child buckets %zu\n", + table, ht->used[table], ht->bucket_exp[table], + numBuckets(ht->bucket_exp[table]), ht->child_buckets[table]); + for (size_t idx = 0; idx < numBuckets(ht->bucket_exp[table]); idx++) { + bucket *b = &ht->tables[table][idx]; + int level = 0; + do { + printf("Bucket %d:%zu level:%d\n", table, idx, level); + for (int pos = 0; pos < ENTRIES_PER_BUCKET; pos++) { + printf(" %d ", pos); + if (isPositionFilled(b, pos)) { + printf("h2 %02x, key \"%s\"\n", b->hashes[pos], + (const char *)entryGetKey(ht, b->entries[pos])); + } else { + printf("(empty)\n"); + } + } + b = bucketNext(b); + level++; + } while (b != NULL); + } + } +} + +/* Prints a histogram-like view of the number of entries in each bucket and + * sub-bucket. 
Example:
+ *
+ * Bucket fill table=0 size=32 children=9 used=200:
+ *     67453462673764475436556656776756
+ *     2 3 2 3 3 45 5 3
+ */
+void hashtableHistogram(hashtable *ht) {
+    for (int table = 0; table <= 1; table++) {
+        if (ht->bucket_exp[table] < 0) continue;
+        size_t size = numBuckets(ht->bucket_exp[table]);
+        bucket *buckets[size];
+        for (size_t idx = 0; idx < size; idx++) {
+            buckets[idx] = &ht->tables[table][idx];
+        }
+        size_t chains_left = size;
+        printf("Bucket fill table=%d size=%zu children=%zu used=%zu:\n",
+               table, size, ht->child_buckets[table], ht->used[table]);
+        do {
+            printf(" ");
+            for (size_t idx = 0; idx < size; idx++) {
+                bucket *b = buckets[idx];
+                if (b == NULL) {
+                    printf(" ");
+                    continue;
+                }
+                printf("%X", __builtin_popcount(b->presence));
+                buckets[idx] = bucketNext(b);
+                if (buckets[idx] == NULL) chains_left--;
+            }
+            printf("\n");
+        } while (chains_left > 0);
+    }
+}
+
+int hashtableLongestBucketChain(hashtable *ht) {
+    int maxlen = 0;
+    for (int table = 0; table <= 1; table++) {
+        if (ht->bucket_exp[table] < 0) {
+            continue; /* table not used */
+        }
+        for (size_t i = 0; i < numBuckets(ht->bucket_exp[table]); i++) {
+            int chainlen = 0;
+            bucket *b = &ht->tables[table][i];
+            while (b->chained) {
+                if (++chainlen > maxlen) {
+                    maxlen = chainlen;
+                }
+                b = bucketNext(b);
+            }
+        }
+    }
+    return maxlen;
+}
diff --git a/src/hashtable.h b/src/hashtable.h
new file mode 100644
index 0000000000..242531df8f
--- /dev/null
+++ b/src/hashtable.h
@@ -0,0 +1,167 @@
+#ifndef HASHTABLE_H
+#define HASHTABLE_H
+
+/* Hash table implementation.
+ *
+ * This is a cache-friendly hash table implementation. For details about the
+ * implementation and documentation of functions, see comments in hashtable.c.
+ *
+ * The entries in a hashtable are of a user-defined type, but an entry needs to
+ * contain a key. It can represent a key-value entry, or it can be just a key,
+ * if set semantics are desired.
+ *
+ * Terminology:
+ *
+ * hashtable
+ *         An instance of the data structure.
+ *
+ * entry
+ *         An entry in the hashtable. This may be of the same type as the key,
+ *         or a struct containing a key and other fields.
+ *
+ * key
+ *         The part of the entry used for looking up the entry in the hashtable.
+ *         May be the entire entry or a struct field within the entry.
+ *
+ * type
+ *         A struct containing callbacks, such as hash function, key comparison
+ *         function and how to get the key in an entry.
+ */
+
+#include "fmacros.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+/* --- Opaque types --- */
+
+typedef struct hashtable hashtable;
+typedef struct hashtableStats hashtableStats;
+
+/* Opaque types that can be stack allocated. */
+typedef uint64_t hashtableIterator[5];
+typedef uint64_t hashtablePosition[2];
+typedef uint64_t hashtableIncrementalFindState[5];
+
+/* --- Non-opaque types --- */
+
+/* The hashtableType is a set of callbacks for a hashtable. All callbacks are
+ * optional. With all callbacks omitted, the hashtable is effectively a set of
+ * pointer-sized integers. */
+typedef struct {
+    /* If the type of an entry is not the same as the type of a key used for
+     * lookup, this callback needs to return the key within an entry. */
+    const void *(*entryGetKey)(const void *entry);
+    /* Hash function. Defaults to hashing the bits in the pointer, effectively
+     * treating the pointer as an integer. */
+    uint64_t (*hashFunction)(const void *key);
+    /* Compare function, returns 0 if the keys are equal. Defaults to just
+     * comparing the pointers for equality. 
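+     * As a sketch (hypothetical caller code), a set of C strings could
+     * plug in strcmp together with the provided string hash:
+     *
+     *     static uint64_t hash(const void *key) {
+     *         return hashtableGenHashFunction(key, strlen(key));
+     *     }
+     *     static int cmp(const void *a, const void *b) { return strcmp(a, b); }
+     *     hashtableType stringSetType = {.hashFunction = hash, .keyCompare = cmp};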
*/ + int (*keyCompare)(const void *key1, const void *key2); + /* Callback to free an entry when it's overwritten or deleted. + * Optional. */ + void (*entryDestructor)(void *entry); + /* Callback to control when resizing should be allowed. */ + int (*resizeAllowed)(size_t moreMem, double usedRatio); + /* Invoked at the start of rehashing. */ + void (*rehashingStarted)(hashtable *ht); + /* Invoked at the end of rehashing. */ + void (*rehashingCompleted)(hashtable *ht); + /* Track memory usage using this callback. It is called with a positive + * number when the hashtable allocates some memory and with a negative number + * when freeing. */ + void (*trackMemUsage)(hashtable *ht, ssize_t delta); + /* Allow a hashtable to carry extra caller-defined metadata. The extra memory + * is initialized to 0. */ + size_t (*getMetadataSize)(void); + /* Flag to disable incremental rehashing */ + unsigned instant_rehashing : 1; +} hashtableType; + +typedef enum { + HASHTABLE_RESIZE_ALLOW = 0, + HASHTABLE_RESIZE_AVOID, + HASHTABLE_RESIZE_FORBID, +} hashtableResizePolicy; + +typedef void (*hashtableScanFunction)(void *privdata, void *entry); + +/* Constants */ +#define HASHTABLE_BUCKET_SIZE 64 /* bytes, the most common cache line size */ + +/* Scan flags */ +#define HASHTABLE_SCAN_EMIT_REF (1 << 0) + +/* --- Prototypes --- */ + +/* Hash function (global seed) */ +void hashtableSetHashFunctionSeed(const uint8_t *seed); +uint8_t *hashtableGetHashFunctionSeed(void); +uint64_t hashtableGenHashFunction(const char *buf, size_t len); +uint64_t hashtableGenCaseHashFunction(const char *buf, size_t len); + +/* Global resize policy */ +void hashtableSetResizePolicy(hashtableResizePolicy policy); + +/* Hashtable instance */ +hashtable *hashtableCreate(hashtableType *type); +void hashtableRelease(hashtable *ht); +void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); +hashtableType *hashtableGetType(hashtable *ht); +void *hashtableMetadata(hashtable *ht); +size_t hashtableSize(hashtable *ht); +size_t hashtableBuckets(hashtable *ht); +size_t hashtableChainedBuckets(hashtable *ht, int table); +size_t hashtableMemUsage(hashtable *ht); +void hashtablePauseAutoShrink(hashtable *ht); +void hashtableResumeAutoShrink(hashtable *ht); +int hashtableIsRehashing(hashtable *ht); +int hashtableIsRehashingPaused(hashtable *ht); +void hashtableRehashingInfo(hashtable *ht, size_t *from_size, size_t *to_size); +int hashtableRehashMicroseconds(hashtable *ht, uint64_t us); +int hashtableExpand(hashtable *ht, size_t size); +int hashtableTryExpand(hashtable *ht, size_t size); +int hashtableExpandIfNeeded(hashtable *ht); +int hashtableShrinkIfNeeded(hashtable *ht); +hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); + +/* Entries */ +int hashtableFind(hashtable *ht, const void *key, void **found); +void **hashtableFindRef(hashtable *ht, const void *key); +int hashtableAdd(hashtable *ht, void *entry); +int hashtableAddOrFind(hashtable *ht, void *entry, void **existing); +int hashtableFindPositionForInsert(hashtable *ht, void *key, hashtablePosition *position, void **existing); +void hashtableInsertAtPosition(hashtable *ht, void *entry, hashtablePosition *position); +int hashtablePop(hashtable *ht, const void *key, void **popped); +int hashtableDelete(hashtable *ht, const void *key); +void **hashtableTwoPhasePopFindRef(hashtable *ht, const void *key, hashtablePosition *position); +void hashtableTwoPhasePopDelete(hashtable *ht, hashtablePosition *position); +int hashtableReplaceReallocatedEntry(hashtable *ht, 
const void *old_entry, void *new_entry); +void hashtableIncrementalFindInit(hashtableIncrementalFindState *state, hashtable *ht, const void *key); +int hashtableIncrementalFindStep(hashtableIncrementalFindState *state); +int hashtableIncrementalFindGetResult(hashtableIncrementalFindState *state, void **found); + +/* Iteration & scan */ +size_t hashtableScan(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata); +size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction fn, void *privdata, void *(*defragfn)(void *), int flags); +void hashtableInitIterator(hashtableIterator *iter, hashtable *ht); +void hashtableInitSafeIterator(hashtableIterator *iter, hashtable *ht); +void hashtableResetIterator(hashtableIterator *iter); +hashtableIterator *hashtableCreateIterator(hashtable *ht); +hashtableIterator *hashtableCreateSafeIterator(hashtable *ht); +void hashtableReleaseIterator(hashtableIterator *iter); +int hashtableNext(hashtableIterator *iter, void **elemptr); + +/* Random entries */ +int hashtableRandomEntry(hashtable *ht, void **found); +int hashtableFairRandomEntry(hashtable *ht, void **found); +unsigned hashtableSampleEntries(hashtable *ht, void **dst, unsigned count); + +/* Debug & stats */ + +void hashtableFreeStats(hashtableStats *stats); +void hashtableCombineStats(hashtableStats *from, hashtableStats *into); +hashtableStats *hashtableGetStatsHt(hashtable *ht, int htidx, int full); +size_t hashtableGetStatsMsg(char *buf, size_t bufsize, hashtableStats *stats, int full); +void hashtableGetStats(char *buf, size_t bufsize, hashtable *ht, int full); + +#endif /* HASHTABLE_H */ diff --git a/src/unit/test_files.h b/src/unit/test_files.h index bc3eac4222..1de84b344f 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -19,6 +19,22 @@ int test_dictDisableResizeReduceTo3(int argc, char **argv, int flags); int test_dictDeleteOneKeyTriggerResizeAgain(int argc, char **argv, int flags); int test_dictBenchmark(int argc, char **argv, int flags); int test_endianconv(int argc, char *argv[], int flags); +int test_cursor(int argc, char **argv, int flags); +int test_set_hash_function_seed(int argc, char **argv, int flags); +int test_add_find_delete(int argc, char **argv, int flags); +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags); +int test_instant_rehashing(int argc, char **argv, int flags); +int test_bucket_chain_length(int argc, char **argv, int flags); +int test_two_phase_insert_and_pop(int argc, char **argv, int flags); +int test_replace_reallocated_entry(int argc, char **argv, int flags); +int test_incremental_find(int argc, char **argv, int flags); +int test_scan(int argc, char **argv, int flags); +int test_iterator(int argc, char **argv, int flags); +int test_safe_iterator(int argc, char **argv, int flags); +int test_compact_bucket_chain(int argc, char **argv, int flags); +int test_random_entry(int argc, char **argv, int flags); +int test_random_entry_with_long_chain(int argc, char **argv, int flags); +int test_all_memory_freed(int argc, char **argv, int flags); int test_intsetValueEncodings(int argc, char **argv, int flags); int test_intsetBasicAdding(int argc, char **argv, int flags); int test_intsetLargeNumberRandomAdd(int argc, char **argv, int flags); @@ -215,6 +231,7 @@ unitTest __test_crc64_c[] = {{"test_crc64", test_crc64}, {NULL, NULL}}; unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}}; unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, 
{"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}}; unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; +unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_function_seed", test_set_hash_function_seed}, {"test_add_find_delete", test_add_find_delete}, {"test_add_find_delete_avoid_resize", test_add_find_delete_avoid_resize}, {"test_instant_rehashing", test_instant_rehashing}, {"test_bucket_chain_length", test_bucket_chain_length}, {"test_two_phase_insert_and_pop", test_two_phase_insert_and_pop}, {"test_replace_reallocated_entry", test_replace_reallocated_entry}, {"test_incremental_find", test_incremental_find}, {"test_scan", test_scan}, {"test_iterator", test_iterator}, {"test_safe_iterator", test_safe_iterator}, {"test_compact_bucket_chain", test_compact_bucket_chain}, {"test_random_entry", test_random_entry}, {"test_random_entry_with_long_chain", test_random_entry_with_long_chain}, {"test_all_memory_freed", test_all_memory_freed}, {NULL, NULL}}; unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", 
test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; @@ -237,6 +254,7 @@ struct unitTestSuite { {"test_crc64combine.c", __test_crc64combine_c}, {"test_dict.c", __test_dict_c}, {"test_endianconv.c", __test_endianconv_c}, + {"test_hashtable.c", __test_hashtable_c}, {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, diff --git a/src/unit/test_hashtable.c b/src/unit/test_hashtable.c new file mode 100644 index 0000000000..782fa0ee6a --- /dev/null +++ b/src/unit/test_hashtable.c @@ -0,0 +1,869 @@ +#include "../hashtable.h" +#include "test_help.h" +#include 
"../mt19937-64.h" +#include "../zmalloc.h" +#include "../monotonic.h" + +#include +#include +#include +#include + +/* Global variable to test the memory tracking callback. */ +static size_t mem_usage; + +/* From util.c: getRandomBytes to seed hash function. */ +void getRandomBytes(unsigned char *p, size_t len); + +/* Init hash function salt and seed random generator. */ +static void randomSeed(void) { + unsigned long long seed; + getRandomBytes((void *)&seed, sizeof(seed)); + init_genrand64(seed); + srandom((unsigned)seed); +} + +/* An entry holding a string key and a string value in one allocation. */ +typedef struct { + unsigned int keysize; /* Sizes, including null-terminator */ + unsigned int valsize; + char data[]; /* key and value */ +} keyval; + +static keyval *create_keyval(const char *key, const char *val) { + size_t keysize = strlen(key) + 1; + size_t valsize = strlen(val) + 1; + keyval *e = malloc(sizeof(keyval) + keysize + valsize); + e->keysize = keysize; + e->valsize = valsize; + memcpy(e->data, key, keysize); + memcpy(e->data + keysize, val, valsize); + return e; +} + +static const void *getkey(const void *entry) { + const keyval *e = entry; + return e->data; +} + +static const void *getval(const void *entry) { + const keyval *e = entry; + return e->data + e->keysize; +} + +static uint64_t hashfunc(const void *key) { + return hashtableGenHashFunction(key, strlen(key)); +} + +static int keycmp(const void *key1, const void *key2) { + return strcmp(key1, key2); +} + +static void freekeyval(void *keyval) { + free(keyval); +} + +static void trackmemusage(hashtable *ht, ssize_t delta) { + UNUSED(ht); + mem_usage += delta; +} + +/* Hashtable type used for some of the tests. */ +static hashtableType keyval_type = { + .entryGetKey = getkey, + .hashFunction = hashfunc, + .keyCompare = keycmp, + .entryDestructor = freekeyval, + .trackMemUsage = trackmemusage, +}; + +/* Callback for testing hashtableEmpty(). */ +static long empty_callback_call_counter; +void emptyCallback(hashtable *ht) { + UNUSED(ht); + empty_callback_call_counter++; +} + +/* Prototypes for debugging */ +void hashtableDump(hashtable *ht); +void hashtableHistogram(hashtable *ht); +int hashtableLongestBucketChain(hashtable *ht); +size_t nextCursor(size_t v, size_t mask); + +int test_cursor(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST_ASSERT(nextCursor(0x0000, 0xffff) == 0x8000); + TEST_ASSERT(nextCursor(0x8000, 0xffff) == 0x4000); + TEST_ASSERT(nextCursor(0x4001, 0xffff) == 0xc001); + TEST_ASSERT(nextCursor(0xffff, 0xffff) == 0x0000); + return 0; +} + +int test_set_hash_function_seed(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + randomSeed(); + return 0; +} + +static int add_find_delete_test_helper(int flags) { + int count = (flags & UNIT_TEST_ACCURATE) ? 
1000000 : 200; + TEST_ASSERT(mem_usage == 0); + hashtable *ht = hashtableCreate(&keyval_type); + int j; + + /* Add */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + keyval *e = create_keyval(key, val); + TEST_ASSERT(hashtableAdd(ht, e)); + } + TEST_ASSERT(hashtableMemUsage(ht) == mem_usage); + + if (count < 1000) { + hashtableHistogram(ht); + printf("Mem usage: %zu\n", hashtableMemUsage(ht)); + } + + /* Find */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + /* Delete half of them */ + for (j = 0; j < count / 2; j++) { + char key[32]; + snprintf(key, sizeof(key), "%d", j); + if (j % 3 == 0) { + /* Test hashtablePop */ + char val[32]; + snprintf(val, sizeof(val), "%d", count - j + 42); + void *popped; + TEST_ASSERT(hashtablePop(ht, key, &popped)); + keyval *e = popped; + TEST_ASSERT(!strcmp(val, getval(e))); + free(e); + } else { + TEST_ASSERT(hashtableDelete(ht, key)); + } + } + TEST_ASSERT(hashtableMemUsage(ht) == mem_usage); + + /* Empty, i.e. delete remaining entries, with progress callback. */ + empty_callback_call_counter = 0; + hashtableEmpty(ht, emptyCallback); + TEST_ASSERT(empty_callback_call_counter > 0); + + /* Release memory */ + hashtableRelease(ht); + TEST_ASSERT(mem_usage == 0); + return 0; +} + +int test_add_find_delete(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST_ASSERT(add_find_delete_test_helper(flags) == 0); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_add_find_delete_avoid_resize(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + TEST_ASSERT(add_find_delete_test_helper(flags) == 0); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_instant_rehashing(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + long count = 200; + + /* A set of longs, i.e. pointer-sized values. */ + hashtableType type = {.instant_rehashing = 1}; + hashtable *ht = hashtableCreate(&type); + long j; + + /* Populate and check that rehashing is never ongoing. */ + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + TEST_ASSERT(!hashtableIsRehashing(ht)); + } + + /* Delete and check that rehashing is never ongoing. */ + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableDelete(ht, (void *)j)); + TEST_ASSERT(!hashtableIsRehashing(ht)); + } + + hashtableRelease(ht); + return 0; +} + +int test_bucket_chain_length(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + unsigned long count = 1000000; + + /* A set of longs, i.e. pointer-sized integer values. */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + unsigned long j; + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + /* If it's rehashing, add a few more until rehashing is complete. */ + while (hashtableIsRehashing(ht)) { + j++; + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(j < count * 2); + int max_chainlen_not_rehashing = hashtableLongestBucketChain(ht); + TEST_ASSERT(max_chainlen_not_rehashing < 10); + + /* Add more until rehashing starts again. 
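+     * Growing the table past its fill-factor threshold is what kicks off
+     * a new incremental rehash, so inserting entries is enough to trigger it.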
*/ + while (!hashtableIsRehashing(ht)) { + j++; + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(j < count * 2); + int max_chainlen_rehashing = hashtableLongestBucketChain(ht); + TEST_ASSERT(max_chainlen_rehashing < 10); + + hashtableRelease(ht); + return 0; +} + +int test_two_phase_insert_and_pop(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int count = (flags & UNIT_TEST_ACCURATE) ? 1000000 : 200; + hashtable *ht = hashtableCreate(&keyval_type); + int j; + + /* hashtableFindPositionForInsert + hashtableInsertAtPosition */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + hashtablePosition position; + int ret = hashtableFindPositionForInsert(ht, key, &position, NULL); + TEST_ASSERT(ret == 1); + keyval *e = create_keyval(key, val); + hashtableInsertAtPosition(ht, e, &position); + } + + if (count < 1000) { + hashtableHistogram(ht); + } + + /* Check that all entries were inserted. */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + /* Test two-phase pop. */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + hashtablePosition position; + size_t size_before_find = hashtableSize(ht); + void **ref = hashtableTwoPhasePopFindRef(ht, key, &position); + TEST_ASSERT(ref != NULL); + keyval *e = *ref; + TEST_ASSERT(!strcmp(val, getval(e))); + TEST_ASSERT(hashtableSize(ht) == size_before_find); + hashtableTwoPhasePopDelete(ht, &position); + TEST_ASSERT(hashtableSize(ht) == size_before_find - 1); + } + TEST_ASSERT(hashtableSize(ht) == 0); + + hashtableRelease(ht); + return 0; +} + +int test_replace_reallocated_entry(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int count = 100, j; + hashtable *ht = hashtableCreate(&keyval_type); + + /* Add */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + keyval *e = create_keyval(key, val); + TEST_ASSERT(hashtableAdd(ht, e)); + } + + /* Find and replace */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", count - j + 42); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *old = found; + TEST_ASSERT(strcmp(getkey(old), key) == 0); + TEST_ASSERT(strcmp(getval(old), val) == 0); + snprintf(val, sizeof(val), "%d", j + 1234); + keyval *new = create_keyval(key, val); + /* If we free 'old' before the call to hashtableReplaceReallocatedEntry, + * we get a use-after-free warning, so instead we just overwrite it with + * junk. The purpose is to verify that the function doesn't use the + * memory it points to. 
*/ + memset(old->data, 'x', old->keysize + old->valsize); + TEST_ASSERT(hashtableReplaceReallocatedEntry(ht, old, new)); + free(old); + } + + /* Check */ + for (j = 0; j < count; j++) { + char key[32], val[32]; + snprintf(key, sizeof(key), "%d", j); + snprintf(val, sizeof(val), "%d", j + 1234); + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found)); + keyval *e = found; + TEST_ASSERT(!strcmp(val, getval(e))); + } + + hashtableRelease(ht); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_incremental_find(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 2000000; + uint8_t element_array[count]; + memset(element_array, 0, sizeof element_array); + + /* A set of uint8_t pointers */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, element_array + j)); + } + + monotime timer; + monotonicInit(); + + /* Compare to looking up one by one. */ + elapsedStart(&timer); + for (size_t i = 0; i < count; i++) { + uint8_t *key = &element_array[i]; + void *found; + TEST_ASSERT(hashtableFind(ht, key, &found) == 1); + TEST_ASSERT(found == key); + } + uint64_t us2 = elapsedUs(timer); + TEST_PRINT_INFO("Lookup %zu elements one by one took %lu microseconds.", + count, (unsigned long)us2); + + /* Lookup elements in batches. */ + for (size_t batch_size = 1; batch_size <= 64; batch_size *= 2) { + elapsedStart(&timer); + for (size_t batch = 0; batch < count / batch_size; batch++) { + /* Init batches. */ + hashtableIncrementalFindState states[batch_size]; + for (size_t i = 0; i < batch_size; i++) { + void *key = &element_array[batch * batch_size + i]; + hashtableIncrementalFindInit(&states[i], ht, key); + } + /* Work on batches in round-robin order until all are done. */ + size_t num_left; + do { + num_left = batch_size; + for (size_t i = 0; i < batch_size; i++) { + if (hashtableIncrementalFindStep(&states[i]) == 0) { + num_left--; + } + } + } while (num_left > 0); + + /* Fetch results. */ + for (size_t i = 0; i < batch_size; i++) { + void *found; + TEST_ASSERT(hashtableIncrementalFindGetResult(&states[i], &found) == 1); + TEST_ASSERT(found == &element_array[batch * batch_size + i]); + } + } + uint64_t us1 = elapsedUs(timer); + TEST_PRINT_INFO("Lookup %zu elements in batches of %zu took %lu microseconds.", + count, batch_size, (unsigned long)us1); + } + + hashtableRelease(ht); + return 0; +} + +typedef struct { + long count; + uint8_t entry_seen[]; +} scandata; + +void scanfn(void *privdata, void *entry) { + scandata *data = (scandata *)privdata; + unsigned long j = (unsigned long)entry; + data->entry_seen[j]++; + data->count++; +} + +int test_scan(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + long num_entries = (flags & UNIT_TEST_LARGE_MEMORY) ? 1000000 : 200000; + int num_rounds = (flags & UNIT_TEST_ACCURATE) ? 20 : 5; + + /* A set of longs, i.e. pointer-sized values. */ + hashtableType type = {0}; + long j; + + for (int round = 0; round < num_rounds; round++) { + /* First round count = num_entries, then some more. */ + long count = num_entries * (1 + 2 * (double)round / num_rounds); + + /* Seed, to make sure each round is different. 
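+     * The checks below lean on the scan guarantee: every entry present
+     * for the whole scan is emitted at least once, and at most twice
+     * when a resize is in progress.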
*/ + randomSeed(); + + /* Populate */ + hashtable *ht = hashtableCreate(&type); + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + + /* Scan */ + scandata *data = calloc(1, sizeof(scandata) + count); + long max_entries_per_cycle = 0; + unsigned num_cycles = 0; + long scanned_count = 0; + size_t cursor = 0; + do { + data->count = 0; + cursor = hashtableScan(ht, cursor, scanfn, data); + if (data->count > max_entries_per_cycle) { + max_entries_per_cycle = data->count; + } + scanned_count += data->count; + data->count = 0; + num_cycles++; + } while (cursor != 0); + + /* Verify that every entry was returned exactly once. */ + TEST_ASSERT(scanned_count == count); + for (j = 0; j < count; j++) { + TEST_ASSERT(data->entry_seen[j] >= 1); + TEST_ASSERT(data->entry_seen[j] <= 2); + } + + /* Print some information for curious readers. */ + TEST_PRINT_INFO("Scanned %ld; max emitted per call: %ld; avg emitted per call: %.2lf", + count, max_entries_per_cycle, (double)count / num_cycles); + + /* Cleanup */ + hashtableRelease(ht); + free(data); + } + return 0; +} + +typedef struct { + uint64_t value; + uint64_t hash; +} mock_hash_entry; + +static mock_hash_entry *mock_hash_entry_create(uint64_t value, uint64_t hash) { + mock_hash_entry *entry = malloc(sizeof(mock_hash_entry)); + entry->value = value; + entry->hash = hash; + return entry; +} + +static uint64_t mock_hash_entry_get_hash(const void *entry) { + if (entry == NULL) return 0UL; + mock_hash_entry *mock = (mock_hash_entry *)entry; + return (mock->hash != 0) ? mock->hash : mock->value; +} + +int test_iterator(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 2000000; + uint8_t entry_array[count]; + memset(entry_array, 0, sizeof entry_array); + + /* A set of uint8_t pointers */ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, entry_array + j)); + } + + /* Iterate */ + size_t num_returned = 0; + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, ht); + while (hashtableNext(&iter, &next)) { + uint8_t *entry = next; + num_returned++; + TEST_ASSERT(entry >= entry_array && entry < entry_array + count); + /* increment entry at this position as a counter */ + (*entry)++; + } + hashtableResetIterator(&iter); + + /* Check that all entries were returned exactly once. */ + TEST_ASSERT(num_returned == count); + for (size_t j = 0; j < count; j++) { + if (entry_array[j] != 1) { + printf("Entry %zu returned %d times\n", j, entry_array[j]); + return 0; + } + } + + hashtableRelease(ht); + return 0; +} + +int test_safe_iterator(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + size_t count = 1000; + uint8_t entry_counts[count * 2]; + memset(entry_counts, 0, sizeof entry_counts); + + /* A set of pointers into the uint8_t array. 
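+     * This exercises the safe-iterator guarantees: entries present for
+     * the whole iteration are returned exactly once, while entries
+     * inserted during the iteration are returned at most once.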
*/ + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + for (size_t j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, entry_counts + j)); + } + + /* Iterate */ + size_t num_returned = 0; + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, ht); + while (hashtableNext(&iter, &next)) { + uint8_t *entry = next; + size_t index = entry - entry_counts; + num_returned++; + TEST_ASSERT(entry >= entry_counts && entry < entry_counts + count * 2); + /* increment entry at this position as a counter */ + (*entry)++; + if (index % 4 == 0) { + TEST_ASSERT(hashtableDelete(ht, entry)); + } + /* Add new item each time we see one of the original items */ + if (index < count) { + TEST_ASSERT(hashtableAdd(ht, entry + count)); + } + } + hashtableResetIterator(&iter); + + /* Check that all entries present during the whole iteration were returned + * exactly once. (Some are deleted after being returned.) */ + TEST_ASSERT(num_returned >= count); + for (size_t j = 0; j < count; j++) { + if (entry_counts[j] != 1) { + printf("Entry %zu returned %d times\n", j, entry_counts[j]); + return 0; + } + } + /* Check that entries inserted during the iteration were returned at most + * once. */ + unsigned long num_optional_returned = 0; + for (size_t j = count; j < count * 2; j++) { + TEST_ASSERT(entry_counts[j] <= 1); + num_optional_returned += entry_counts[j]; + } + printf("Safe iterator returned %lu of the %zu entries inserted while iterating.\n", num_optional_returned, count); + + hashtableRelease(ht); + return 0; +} + +int test_compact_bucket_chain(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + /* Create a table with only one bucket chain. */ + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + unsigned long count = 30; + + hashtableType type = {0}; + hashtable *ht = hashtableCreate(&type); + + /* Populate */ + unsigned long j; + for (j = 0; j < count; j++) { + TEST_ASSERT(hashtableAdd(ht, (void *)j)); + } + TEST_ASSERT(hashtableBuckets(ht) == 1); + printf("Populated a single bucket chain, avoiding resize.\n"); + hashtableHistogram(ht); + + /* Delete half of the entries while iterating. */ + size_t num_chained_buckets = hashtableChainedBuckets(ht, 0); + size_t num_returned = 0; + hashtableIterator iter; + hashtableInitSafeIterator(&iter, ht); + void *entry; + while (hashtableNext(&iter, &entry)) { + /* As long as the iterator is still returning entries from the same + * bucket chain, the bucket chain is not compacted, so it still has the + * same number of buckets. */ + TEST_ASSERT(hashtableChainedBuckets(ht, 0) == num_chained_buckets); + num_returned++; + if (num_returned % 2 == 0) { + TEST_ASSERT(hashtableDelete(ht, entry)); + } + if (num_returned == count) { + printf("Last iteration. Half of them have been deleted.\n"); + hashtableHistogram(ht); + } + } + hashtableResetIterator(&iter); + + /* Verify that the bucket chain has been compacted by filling the holes and + * freeing empty child buckets. */ + printf("When the iterator leaves the bucket chain, compaction should happen.\n"); + hashtableHistogram(ht); + TEST_ASSERT(hashtableChainedBuckets(ht, 0) < num_chained_buckets); + + hashtableRelease(ht); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + TEST_ASSERT(zmalloc_used_memory() == 0); + return 0; +} + +int test_random_entry(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + randomSeed(); + + size_t count = (flags & UNIT_TEST_LARGE_MEMORY) ? 
7000 : 400;
+    long num_rounds = (flags & UNIT_TEST_ACCURATE) ? 1000000 : 10000;
+
+    /* A set of ints */
+    hashtableType type = {0};
+    hashtable *ht = hashtableCreate(&type);
+
+    /* Populate */
+    unsigned times_picked[count];
+    memset(times_picked, 0, sizeof(times_picked));
+    for (size_t j = 0; j < count; j++) {
+        TEST_ASSERT(hashtableAdd(ht, times_picked + j));
+    }
+
+    /* Pick entries, and count how many times each entry is picked. */
+    for (long i = 0; i < num_rounds; i++) {
+        /* Using void* variable to avoid a cast that violates strict aliasing */
+        void *entry;
+        TEST_ASSERT(hashtableFairRandomEntry(ht, &entry));
+        unsigned *picked = entry;
+        TEST_ASSERT(picked >= times_picked && picked < times_picked + count);
+        /* increment entry at this position as a counter */
+        (*picked)++;
+    }
+    hashtableRelease(ht);
+
+    /* Fairness measurement
+     * --------------------
+     *
+     * Selecting a single random entry: For any entry in the hash table, let
+     * X=1 if we selected the entry (success) and X=0 otherwise. With m
+     * entries, our entry is selected with probability p = 1/m, the expected
+     * value is E(X) = 1/m, E(X^2) = 1/m and the variance:
+     *
+     *     Var(X) = E(X^2) - (E(X))^2 = 1/m - 1/(m^2) = (1/m) * (1 - 1/m).
+     *
+     * Repeating the selection of a random entry: Let's repeat the experiment
+     * n times and let Y be the number of times our entry was selected. This
+     * is a binomial distribution.
+     *
+     *     Y = X_1 + X_2 + ... + X_n
+     *     E(Y) = n/m
+     *
+     * The variance of a sum of independent random variables is the sum of the
+     * variances, so Y has variance np(1−p).
+     *
+     *     Var(Y) = npq = np(1 - p) = (n/m) * (1 - 1/m) = n * (m - 1) / (m * m)
+     */
+    double m = (double)count, n = (double)num_rounds;
+    double expected = n / m;                 /* E(Y) */
+    double variance = n * (m - 1) / (m * m); /* Var(Y) */
+    double std_dev = sqrt(variance);
+
+    /* With large n, the distribution approaches a normal distribution and we
+     * can use p68 = within 1 std dev, p95 = within 2 std dev, p99.7 = within 3
+     * std dev. */
+    long p68 = 0, p95 = 0, p99 = 0, p4dev = 0, p5dev = 0;
+    for (size_t j = 0; j < count; j++) {
+        double dev = expected - times_picked[j];
+        p68 += (dev >= -std_dev && dev <= std_dev);
+        p95 += (dev >= -std_dev * 2 && dev <= std_dev * 2);
+        p99 += (dev >= -std_dev * 3 && dev <= std_dev * 3);
+        p4dev += (dev >= -std_dev * 4 && dev <= std_dev * 4);
+        p5dev += (dev >= -std_dev * 5 && dev <= std_dev * 5);
+    }
+    printf("Random entry fairness test\n");
+    printf("  Pick one of %zu entries, %ld times.\n", count, num_rounds);
+    printf("  Expecting each entry to be picked %.2lf times, std dev %.3lf.\n", expected, std_dev);
+    printf("  Within 1 std dev (p68) = %.2lf%%\n", 100 * p68 / m);
+    printf("  Within 2 std dev (p95) = %.2lf%%\n", 100 * p95 / m);
+    printf("  Within 3 std dev (p99) = %.2lf%%\n", 100 * p99 / m);
+    printf("  Within 4 std dev       = %.2lf%%\n", 100 * p4dev / m);
+    printf("  Within 5 std dev       = %.2lf%%\n", 100 * p5dev / m);
+
+    /* Conclusion? The number of trials (n) relative to the probabilities (p and
+     * 1 − p) must be sufficiently large (n * p ≥ 5 and n * (1 − p) ≥ 5) to
+     * approximate a binomial distribution with a normal distribution. 
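+     * As a worked check with the defaults above (m = 400 entries,
+     * n = 10000 picks): n * p = n / m = 25 and n * (1 - p) = 9975, both
+     * comfortably above 5, so the assertion applies.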
     */
+    if (n / m >= 5 && n * (1 - 1 / m) >= 5) {
+        TEST_ASSERT_MESSAGE("Too unfair randomness", 100 * p99 / m >= 60.0);
+    } else {
+        printf("Numbers are too uncertain to draw any conclusions about fairness.\n");
+    }
+    return 0;
+}
+
+int test_random_entry_with_long_chain(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+
+    /* We use an estimator of true probability.
+     * We determine how many samples to take based on how precise of a
+     * measurement we want to take, and how certain we want to be that the
+     * measurement is correct.
+     * https://en.wikipedia.org/wiki/Checking_whether_a_coin_is_fair#Estimator_of_true_probability
+     */
+
+    /* In a thousand runs the worst deviation seen was 0.018 +/- 0.01.
+     * This means the true deviation was at least 0.008 or 0.8%.
+     * Accept a deviation of 5% to be on the safe side so we don't get
+     * a flaky test case. */
+    const double acceptable_probability_deviation = 0.05;
+
+    const size_t num_chained_entries = 64;
+    const size_t num_random_entries = 448;
+    const double p_fair = (double)num_chained_entries / (num_chained_entries + num_random_entries);
+
+    /* Precision of our measurement */
+    const double precision = (flags & UNIT_TEST_ACCURATE) ? 0.001 : 0.01;
+
+    /* This is the confidence level for our measurement, as the Z value of a
+     * normal distribution. 5 sigma corresponds to 0.00002% probability that
+     * our measurement is farther than 'precision' from the truth. This value
+     * is used in particle physics. */
+    const double z = 5;
+
+    const double n = p_fair * (1 - p_fair) * z * z / (precision * precision);
+    const size_t num_samples = (size_t)n + 1;
+
+    hashtableType type = {
+        .hashFunction = mock_hash_entry_get_hash,
+        .entryDestructor = freekeyval,
+    };
+
+    hashtable *ht = hashtableCreate(&type);
+    hashtableExpand(ht, num_random_entries + num_chained_entries);
+    uint64_t chain_hash = (uint64_t)genrand64_int64();
+    if (chain_hash == 0) chain_hash++;
+
+    /* add random entries */
+    for (size_t i = 0; i < num_random_entries; i++) {
+        uint64_t random_hash = (uint64_t)genrand64_int64();
+        if (random_hash == chain_hash) random_hash++;
+        hashtableAdd(ht, mock_hash_entry_create(random_hash, 0));
+    }
+
+    /* create long chain */
+    for (size_t i = 0; i < num_chained_entries; i++) {
+        hashtableAdd(ht, mock_hash_entry_create(i, chain_hash));
+    }
+
+    TEST_ASSERT(!hashtableIsRehashing(ht));
+
+    printf("Created a table with a long bucket chain.\n");
+    hashtableHistogram(ht);
+
+    printf("Taking %zu random samples\n", num_samples);
+    size_t count_chain_entry_picked = 0;
+    for (size_t i = 0; i < num_samples; i++) {
+        void *entry;
+        TEST_ASSERT(hashtableFairRandomEntry(ht, &entry));
+        mock_hash_entry *mock_entry = entry;
+        if (mock_entry->hash == chain_hash) {
+            count_chain_entry_picked++;
+        }
+    }
+    const double measured_probability = (double)count_chain_entry_picked / num_samples;
+    const double deviation = fabs(measured_probability - p_fair);
+    printf("Measured probability: %.1f%%\n", measured_probability * 100);
+    printf("Expected probability: %.1f%%\n", p_fair * 100);
+    printf("Measured probability deviated %1.1f%% +/- %1.1f%% from expected probability\n",
+           deviation * 100, precision * 100);
+    TEST_ASSERT(deviation <= precision + acceptable_probability_deviation);
+
+    hashtableRelease(ht);
+    return 0;
+}
+
+int test_all_memory_freed(int argc, char **argv, int flags) {
+    UNUSED(argc);
+    UNUSED(argv);
+    UNUSED(flags);
+    TEST_ASSERT(zmalloc_used_memory() == 0);
+    return 0;
+}
From 4efff42f041733d786a73f1cdfa65f6e04c3c4b8 Mon Sep 17 
00:00:00 2001 From: Rain Valentine Date: Mon, 18 Nov 2024 10:36:56 +0100 Subject: [PATCH 34/73] Replace dict with hashtable in command tables (#1065) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the type of command tables from dict to hashtable. Command table lookup takes ~3% of overall CPU time in benchmarks, so it is a good candidate for optimization. My initial SET benchmark comparison suggests that hashtable is about 4.5 times faster than dict and this replacement reduced overall CPU time by 2.79% 🥳 --------- Signed-off-by: Rain Valentine Signed-off-by: Rain Valentine Signed-off-by: Viktor Söderqvist Co-authored-by: Rain Valentine --- src/acl.c | 67 +++++++------- src/config.c | 12 +-- src/latency.c | 31 ++++--- src/module.c | 40 +++++---- src/server.c | 238 +++++++++++++++++++++++++++----------------------- src/server.h | 19 ++-- 6 files changed, 215 insertions(+), 192 deletions(-) diff --git a/src/acl.c b/src/acl.c index 688820fd89..cfcf102887 100644 --- a/src/acl.c +++ b/src/acl.c @@ -652,14 +652,15 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int unsigned long id = cmd->id; ACLSetSelectorCommandBit(selector, id, allow); ACLResetFirstArgsForCommand(selector, id); - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = (struct serverCommand *)dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; ACLSetSelectorCommandBit(selector, sub->id, allow); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } } @@ -669,19 +670,20 @@ void ACLChangeSelectorPerm(aclSelector *selector, struct serverCommand *cmd, int * value. Since the category passed by the user may be non existing, the * function returns C_ERR if the category was not found, or C_OK if it was * found and the operation was performed. */ -void ACLSetSelectorCommandBitsForCategory(dict *commands, aclSelector *selector, uint64_t cflag, int value) { - dictIterator *di = dictGetIterator(commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void ACLSetSelectorCommandBitsForCategory(hashtable *commands, aclSelector *selector, uint64_t cflag, int value) { + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { ACLChangeSelectorPerm(selector, cmd, value); } - if (cmd->subcommands_dict) { - ACLSetSelectorCommandBitsForCategory(cmd->subcommands_dict, selector, cflag, value); + if (cmd->subcommands_ht) { + ACLSetSelectorCommandBitsForCategory(cmd->subcommands_ht, selector, cflag, value); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* This function is responsible for recomputing the command bits for all selectors of the existing users. 
@@ -732,26 +734,27 @@ int ACLSetSelectorCategory(aclSelector *selector, const char *category, int allo return C_OK; } -void ACLCountCategoryBitsForCommands(dict *commands, +void ACLCountCategoryBitsForCommands(hashtable *commands, aclSelector *selector, unsigned long *on, unsigned long *off, uint64_t cflag) { - dictIterator *di = dictGetIterator(commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { if (ACLGetSelectorCommandBit(selector, cmd->id)) (*on)++; else (*off)++; } - if (cmd->subcommands_dict) { - ACLCountCategoryBitsForCommands(cmd->subcommands_dict, selector, on, off, cflag); + if (cmd->subcommands_ht) { + ACLCountCategoryBitsForCommands(cmd->subcommands_ht, selector, on, off, cflag); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Return the number of commands allowed (on) and denied (off) for the user 'u' @@ -1163,7 +1166,7 @@ int ACLSetSelector(aclSelector *selector, const char *op, size_t oplen) { return C_ERR; } - if (cmd->subcommands_dict) { + if (cmd->subcommands_ht) { /* If user is trying to allow a valid subcommand we can just add its unique ID */ cmd = ACLLookupCommand(op + 1); if (cmd == NULL) { @@ -2754,22 +2757,22 @@ sds getAclErrorMessage(int acl_res, user *user, struct serverCommand *cmd, sds e * ==========================================================================*/ /* ACL CAT category */ -void aclCatWithFlags(client *c, dict *commands, uint64_t cflag, int *arraylen) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void aclCatWithFlags(client *c, hashtable *commands, uint64_t cflag, int *arraylen) { + hashtableIterator iter; + hashtableInitIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->acl_categories & cflag) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*arraylen)++; } - if (cmd->subcommands_dict) { - aclCatWithFlags(c, cmd->subcommands_dict, cflag, arraylen); + if (cmd->subcommands_ht) { + aclCatWithFlags(c, cmd->subcommands_ht, cflag, arraylen); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Add the formatted response from a single selector to the ACL GETUSER diff --git a/src/config.c b/src/config.c index 9ea28298d7..cc0f8d2dd8 100644 --- a/src/config.c +++ b/src/config.c @@ -539,7 +539,6 @@ void loadServerConfigFromString(char *config) { loadServerConfig(argv[1], 0, NULL); } else if (!strcasecmp(argv[0], "rename-command") && argc == 3) { struct serverCommand *cmd = lookupCommandBySds(argv[1]); - int retval; if (!cmd) { err = "No such command in rename-command"; @@ -548,16 +547,13 @@ void loadServerConfigFromString(char *config) { /* If the target command name is the empty string we just * remove it from the command table. */ - retval = dictDelete(server.commands, argv[1]); - serverAssert(retval == DICT_OK); + serverAssert(hashtableDelete(server.commands, argv[1])); /* Otherwise we re-add the command under a different name. 
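  * With the hashtable, the key is derived from the entry itself (via
  * entryGetKey, which returns cmd->fullname), so the rename below
  * mutates cmd->fullname in place before re-adding the same struct.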
*/ if (sdslen(argv[2]) != 0) { - sds copy = sdsdup(argv[2]); - - retval = dictAdd(server.commands, copy, cmd); - if (retval != DICT_OK) { - sdsfree(copy); + sdsfree(cmd->fullname); + cmd->fullname = sdsdup(argv[2]); + if (!hashtableAdd(server.commands, cmd)) { err = "Target command name already exists"; goto loaderr; } diff --git a/src/latency.c b/src/latency.c index 783f04b197..2beb4859d1 100644 --- a/src/latency.c +++ b/src/latency.c @@ -526,13 +526,12 @@ void fillCommandCDF(client *c, struct hdr_histogram *histogram) { /* latencyCommand() helper to produce for all commands, * a per command cumulative distribution of latencies. */ -void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data) { - dictIterator *di = dictGetSafeIterator(commands); - dictEntry *de; - struct serverCommand *cmd; - - while ((de = dictNext(di)) != NULL) { - cmd = (struct serverCommand *)dictGetVal(de); +void latencyAllCommandsFillCDF(client *c, hashtable *commands, int *command_with_data) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, commands); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (cmd->latency_histogram) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); fillCommandCDF(c, cmd->latency_histogram); @@ -540,10 +539,10 @@ void latencyAllCommandsFillCDF(client *c, dict *commands, int *command_with_data } if (cmd->subcommands) { - latencyAllCommandsFillCDF(c, cmd->subcommands_dict, command_with_data); + latencyAllCommandsFillCDF(c, cmd->subcommands_ht, command_with_data); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* latencyCommand() helper to produce for a specific command set, @@ -564,19 +563,19 @@ void latencySpecificCommandsFillCDF(client *c) { command_with_data++; } - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + void *next; + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (sub->latency_histogram) { addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname)); fillCommandCDF(c, sub->latency_histogram); command_with_data++; } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } } setDeferredMapLen(c, replylen, command_with_data); diff --git a/src/module.c b/src/module.c index 5f9dff0402..05ab032800 100644 --- a/src/module.c +++ b/src/module.c @@ -1298,8 +1298,8 @@ int VM_CreateCommand(ValkeyModuleCtx *ctx, cp->serverCmd->arity = cmdfunc ? -1 : -2; /* Default value, can be changed later via dedicated API */ /* Drain IO queue before modifying commands dictionary to prevent concurrent access while modifying it. */ drainIOThreadsQueue(); - serverAssert(dictAdd(server.commands, sdsdup(declared_name), cp->serverCmd) == DICT_OK); - serverAssert(dictAdd(server.orig_commands, sdsdup(declared_name), cp->serverCmd) == DICT_OK); + serverAssert(hashtableAdd(server.commands, cp->serverCmd)); + serverAssert(hashtableAdd(server.orig_commands, cp->serverCmd)); cp->serverCmd->id = ACLGetCommandID(declared_name); /* ID used for ACL. */ return VALKEYMODULE_OK; } @@ -1431,7 +1431,7 @@ int VM_CreateSubcommand(ValkeyModuleCommand *parent, /* Check if the command name is busy within the parent command. 
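  * Subcommand entries are keyed by their declared name (see
  * hashtableSubcommandGetKey), so uniqueness only needs to be checked
  * within the parent's subcommands_ht.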
*/ sds declared_name = sdsnew(name); - if (parent_cmd->subcommands_dict && lookupSubcommand(parent_cmd, declared_name) != NULL) { + if (parent_cmd->subcommands_ht && lookupSubcommand(parent_cmd, declared_name) != NULL) { sdsfree(declared_name); return VALKEYMODULE_ERR; } @@ -1441,7 +1441,7 @@ int VM_CreateSubcommand(ValkeyModuleCommand *parent, moduleCreateCommandProxy(parent->module, declared_name, fullname, cmdfunc, flags, firstkey, lastkey, keystep); cp->serverCmd->arity = -2; - commandAddSubcommand(parent_cmd, cp->serverCmd, name); + commandAddSubcommand(parent_cmd, cp->serverCmd); return VALKEYMODULE_OK; } @@ -12080,20 +12080,21 @@ int moduleFreeCommand(struct ValkeyModule *module, struct serverCommand *cmd) { moduleFreeArgs(cmd->args, cmd->num_args); zfree(cp); - if (cmd->subcommands_dict) { - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = dictGetVal(de); + if (cmd->subcommands_ht) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (moduleFreeCommand(module, sub) != C_OK) continue; - serverAssert(dictDelete(cmd->subcommands_dict, sub->declared_name) == DICT_OK); + serverAssert(hashtableDelete(cmd->subcommands_ht, sub->declared_name)); sdsfree((sds)sub->declared_name); sdsfree(sub->fullname); zfree(sub); } - dictReleaseIterator(di); - dictRelease(cmd->subcommands_dict); + hashtableResetIterator(&iter); + hashtableRelease(cmd->subcommands_ht); } return C_OK; @@ -12103,19 +12104,20 @@ void moduleUnregisterCommands(struct ValkeyModule *module) { /* Drain IO queue before modifying commands dictionary to prevent concurrent access while modifying it. */ drainIOThreadsQueue(); /* Unregister all the commands registered by this module. */ - dictIterator *di = dictGetSafeIterator(server.commands); - dictEntry *de; - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (moduleFreeCommand(module, cmd) != C_OK) continue; - serverAssert(dictDelete(server.commands, cmd->fullname) == DICT_OK); - serverAssert(dictDelete(server.orig_commands, cmd->fullname) == DICT_OK); + serverAssert(hashtableDelete(server.commands, cmd->fullname)); + serverAssert(hashtableDelete(server.orig_commands, cmd->fullname)); sdsfree((sds)cmd->declared_name); sdsfree(cmd->fullname); zfree(cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* We parse argv to add sds "NAME VALUE" pairs to the server.module_configs_queue list of configs. diff --git a/src/server.c b/src/server.c index 21dca85067..6a1309e68f 100644 --- a/src/server.c +++ b/src/server.c @@ -390,6 +390,11 @@ int dictSdsKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } +/* Case insensitive key comparison */ +int hashtableStringKeyCaseCompare(const void *key1, const void *key2) { + return strcasecmp(key1, key2); +} + void dictObjectDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. 
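 * That is why the NULL check above returns early instead of passing NULL to decrRefCount().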
*/ decrRefCount(val); @@ -506,6 +511,16 @@ int dictResizeAllowed(size_t moreMem, double usedRatio) { } } +const void *hashtableCommandGetKey(const void *element) { + struct serverCommand *command = (struct serverCommand *)element; + return command->fullname; +} + +const void *hashtableSubcommandGetKey(const void *element) { + struct serverCommand *command = (struct serverCommand *)element; + return command->declared_name; +} + /* Generic hash table type where keys are Objects, Values * dummy pointers. */ dictType objectKeyPointerValueDictType = { @@ -578,16 +593,17 @@ dictType kvstoreExpiresDictType = { kvstoreDictMetadataSize, }; -/* Command table. sds string -> command struct pointer. */ -dictType commandTableDictType = { - dictSdsCaseHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCaseCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_incremental_rehash = 1, /* no incremental rehash as the command table may be accessed from IO threads. */ -}; +/* Command set, hashed by sds string, stores serverCommand structs. */ +hashtableType commandSetType = {.entryGetKey = hashtableCommandGetKey, + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableStringKeyCaseCompare, + .instant_rehashing = 1}; + +/* Sub-command set, hashed by char* string, stores serverCommand structs. */ +hashtableType subcommandSetType = {.entryGetKey = hashtableSubcommandGetKey, + .hashFunction = dictCStrCaseHash, + .keyCompare = hashtableStringKeyCaseCompare, + .instant_rehashing = 1}; /* Hash type hash table (note that small hashes are represented with listpacks) */ dictType hashDictType = { @@ -2177,8 +2193,8 @@ void initServerConfig(void) { /* Command table -- we initialize it here as it is part of the * initial configuration, since command names may be changed via * valkey.conf using the rename-command directive. */ - server.commands = dictCreate(&commandTableDictType); - server.orig_commands = dictCreate(&commandTableDictType); + server.commands = hashtableCreate(&commandSetType); + server.orig_commands = hashtableCreate(&commandSetType); populateCommandTable(); /* Debugging */ @@ -3017,13 +3033,13 @@ sds catSubCommandFullname(const char *parent_name, const char *sub_name) { return sdscatfmt(sdsempty(), "%s|%s", parent_name, sub_name); } -void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand, const char *declared_name) { - if (!parent->subcommands_dict) parent->subcommands_dict = dictCreate(&commandTableDictType); +void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand) { + if (!parent->subcommands_ht) parent->subcommands_ht = hashtableCreate(&subcommandSetType); subcommand->parent = parent; /* Assign the parent command */ subcommand->id = ACLGetCommandID(subcommand->fullname); /* Assign the ID used for ACL. 
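 * The hashtableAdd() below takes no separate key argument: subcommandSetType derives the key from the entry itself via hashtableSubcommandGetKey(), i.e. the subcommand's declared_name.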
*/ - serverAssert(dictAdd(parent->subcommands_dict, sdsnew(declared_name), subcommand) == DICT_OK); + serverAssert(hashtableAdd(parent->subcommands_ht, subcommand)); } /* Set implicit ACl categories (see comment above the definition of @@ -3075,7 +3091,7 @@ int populateCommandStructure(struct serverCommand *c) { sub->fullname = catSubCommandFullname(c->declared_name, sub->declared_name); if (populateCommandStructure(sub) == C_ERR) continue; - commandAddSubcommand(c, sub, sub->declared_name); + commandAddSubcommand(c, sub); } } @@ -3099,22 +3115,20 @@ void populateCommandTable(void) { c->fullname = sdsnew(c->declared_name); if (populateCommandStructure(c) == C_ERR) continue; - retval1 = dictAdd(server.commands, sdsdup(c->fullname), c); + retval1 = hashtableAdd(server.commands, c); /* Populate an additional dictionary that will be unaffected * by rename-command statements in valkey.conf. */ - retval2 = dictAdd(server.orig_commands, sdsdup(c->fullname), c); - serverAssert(retval1 == DICT_OK && retval2 == DICT_OK); + retval2 = hashtableAdd(server.orig_commands, c); + serverAssert(retval1 && retval2); } } -void resetCommandTableStats(dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { - c = (struct serverCommand *)dictGetVal(de); +void resetCommandTableStats(hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; c->microseconds = 0; c->calls = 0; c->rejected_calls = 0; @@ -3123,9 +3137,9 @@ void resetCommandTableStats(dict *commands) { hdr_close(c->latency_histogram); c->latency_histogram = NULL; } - if (c->subcommands_dict) resetCommandTableStats(c->subcommands_dict); + if (c->subcommands_ht) resetCommandTableStats(c->subcommands_ht); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } void resetErrorTableStats(void) { @@ -3172,13 +3186,18 @@ void serverOpArrayFree(serverOpArray *oa) { /* ====================== Commands lookup and execution ===================== */ int isContainerCommandBySds(sds s) { - struct serverCommand *base_cmd = dictFetchValue(server.commands, s); - int has_subcommands = base_cmd && base_cmd->subcommands_dict; + void *entry; + int found_command = hashtableFind(server.commands, s, &entry); + struct serverCommand *base_cmd = entry; + int has_subcommands = found_command && base_cmd->subcommands_ht; return has_subcommands; } struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name) { - return dictFetchValue(container->subcommands_dict, sub_name); + void *entry = NULL; + hashtableFind(container->subcommands_ht, sub_name, &entry); + struct serverCommand *subcommand = entry; + return subcommand; } /* Look up a command by argv and argc @@ -3189,9 +3208,11 @@ struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_ * name (e.g. in COMMAND INFO) rather than to find the command * a user requested to execute (in processCommand). 
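 * A short illustration (hypothetical lookups): argv = {"CONFIG", "GET"} with argc = 2 resolves the subcommand against the container's subcommands_ht, while argv = {"GET", "key"} with argc = 2 and strict != 0 returns NULL, because GET has no subcommands and argc is not 1.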
*/ -struct serverCommand *lookupCommandLogic(dict *commands, robj **argv, int argc, int strict) { - struct serverCommand *base_cmd = dictFetchValue(commands, argv[0]->ptr); - int has_subcommands = base_cmd && base_cmd->subcommands_dict; +struct serverCommand *lookupCommandLogic(hashtable *commands, robj **argv, int argc, int strict) { + void *entry = NULL; + int found_command = hashtableFind(commands, argv[0]->ptr, &entry); + struct serverCommand *base_cmd = entry; + int has_subcommands = found_command && base_cmd->subcommands_ht; if (argc == 1 || !has_subcommands) { if (strict && argc != 1) return NULL; /* Note: It is possible that base_cmd->proc==NULL (e.g. CONFIG) */ @@ -3207,7 +3228,7 @@ struct serverCommand *lookupCommand(robj **argv, int argc) { return lookupCommandLogic(server.commands, argv, argc, 0); } -struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s) { +struct serverCommand *lookupCommandBySdsLogic(hashtable *commands, sds s) { int argc, j; sds *strings = sdssplitlen(s, sdslen(s), "|", 1, &argc); if (strings == NULL) return NULL; @@ -3234,7 +3255,7 @@ struct serverCommand *lookupCommandBySds(sds s) { return lookupCommandBySdsLogic(server.commands, s); } -struct serverCommand *lookupCommandByCStringLogic(dict *commands, const char *s) { +struct serverCommand *lookupCommandByCStringLogic(hashtable *commands, const char *s) { struct serverCommand *cmd; sds name = sdsnew(s); @@ -4877,23 +4898,25 @@ void addReplyCommandSubCommands(client *c, struct serverCommand *cmd, void (*reply_function)(client *, struct serverCommand *), int use_map) { - if (!cmd->subcommands_dict) { + if (!cmd->subcommands_ht) { addReplySetLen(c, 0); return; } if (use_map) - addReplyMapLen(c, dictSize(cmd->subcommands_dict)); + addReplyMapLen(c, hashtableSize(cmd->subcommands_ht)); else - addReplyArrayLen(c, dictSize(cmd->subcommands_dict)); - dictEntry *de; - dictIterator *di = dictGetSafeIterator(cmd->subcommands_dict); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *sub = (struct serverCommand *)dictGetVal(de); + addReplyArrayLen(c, hashtableSize(cmd->subcommands_ht)); + + void *next; + hashtableIterator iter; + hashtableInitSafeIterator(&iter, cmd->subcommands_ht); + while (hashtableNext(&iter, &next)) { + struct serverCommand *sub = next; if (use_map) addReplyBulkCBuffer(c, sub->fullname, sdslen(sub->fullname)); reply_function(c, sub); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* Output the representation of a server command. Used by the COMMAND command and COMMAND INFO. 
*/ @@ -4939,7 +4962,7 @@ void addReplyCommandDocs(client *c, struct serverCommand *cmd) { if (cmd->reply_schema) maplen++; #endif if (cmd->args) maplen++; - if (cmd->subcommands_dict) maplen++; + if (cmd->subcommands_ht) maplen++; addReplyMapLen(c, maplen); if (cmd->summary) { @@ -4989,7 +5012,7 @@ void addReplyCommandDocs(client *c, struct serverCommand *cmd) { addReplyBulkCString(c, "arguments"); addReplyCommandArgList(c, cmd->args, cmd->num_args); } - if (cmd->subcommands_dict) { + if (cmd->subcommands_ht) { addReplyBulkCString(c, "subcommands"); addReplyCommandSubCommands(c, cmd, addReplyCommandDocs, 1); } @@ -5046,20 +5069,20 @@ void getKeysSubcommand(client *c) { /* COMMAND (no args) */ void commandCommand(client *c) { - dictIterator *di; - dictEntry *de; - - addReplyArrayLen(c, dictSize(server.commands)); - di = dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - addReplyCommandInfo(c, dictGetVal(de)); + hashtableIterator iter; + void *next; + addReplyArrayLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; + addReplyCommandInfo(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND COUNT */ void commandCountCommand(client *c) { - addReplyLongLong(c, dictSize(server.commands)); + addReplyLongLong(c, hashtableSize(server.commands)); } typedef enum { @@ -5105,39 +5128,39 @@ int shouldFilterFromCommandList(struct serverCommand *cmd, commandListFilter *fi } /* COMMAND LIST FILTERBY (MODULE |ACLCAT |PATTERN ) */ -void commandListWithFilter(client *c, dict *commands, commandListFilter filter, int *numcmds) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void commandListWithFilter(client *c, hashtable *commands, commandListFilter filter, int *numcmds) { + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; if (!shouldFilterFromCommandList(cmd, &filter)) { addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*numcmds)++; } - if (cmd->subcommands_dict) { - commandListWithFilter(c, cmd->subcommands_dict, filter, numcmds); + if (cmd->subcommands_ht) { + commandListWithFilter(c, cmd->subcommands_ht, filter, numcmds); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND LIST */ -void commandListWithoutFilter(client *c, dict *commands, int *numcmds) { - dictEntry *de; - dictIterator *di = dictGetIterator(commands); - - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); +void commandListWithoutFilter(client *c, hashtable *commands, int *numcmds) { + hashtableIterator iter; + void *next; + hashtableInitIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); (*numcmds)++; - if (cmd->subcommands_dict) { - commandListWithoutFilter(c, cmd->subcommands_dict, numcmds); + if (cmd->subcommands_ht) { + commandListWithoutFilter(c, cmd->subcommands_ht, numcmds); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } /* COMMAND LIST [FILTERBY (MODULE |ACLCAT |PATTERN )] */ @@ -5186,14 +5209,15 @@ void commandInfoCommand(client *c) { int i; if (c->argc == 2) { - dictIterator *di; - dictEntry *de; - addReplyArrayLen(c, dictSize(server.commands)); - di = 
dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - addReplyCommandInfo(c, dictGetVal(de)); + hashtableIterator iter; + void *next; + addReplyArrayLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; + addReplyCommandInfo(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { addReplyArrayLen(c, c->argc - 2); for (i = 2; i < c->argc; i++) { @@ -5207,16 +5231,16 @@ void commandDocsCommand(client *c) { int i; if (c->argc == 2) { /* Reply with an array of all commands */ - dictIterator *di; - dictEntry *de; - addReplyMapLen(c, dictSize(server.commands)); - di = dictGetIterator(server.commands); - while ((de = dictNext(di)) != NULL) { - struct serverCommand *cmd = dictGetVal(de); + hashtableIterator iter; + void *next; + addReplyMapLen(c, hashtableSize(server.commands)); + hashtableInitIterator(&iter, server.commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *cmd = next; addReplyBulkCBuffer(c, cmd->fullname, sdslen(cmd->fullname)); addReplyCommandDocs(c, cmd); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { /* Reply with an array of the requested commands (if we find them) */ int numcmds = 0; @@ -5336,14 +5360,13 @@ const char *getSafeInfoString(const char *s, size_t len, char **tmp) { return memmapchars(new, len, unsafe_info_chars, unsafe_info_chars_substs, sizeof(unsafe_info_chars) - 1); } -sds genValkeyInfoStringCommandStats(sds info, dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { +sds genValkeyInfoStringCommandStats(sds info, hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; char *tmpsafe; - c = (struct serverCommand *)dictGetVal(de); if (c->calls || c->failed_calls || c->rejected_calls) { info = sdscatprintf(info, "cmdstat_%s:calls=%lld,usec=%lld,usec_per_call=%.2f" @@ -5353,11 +5376,11 @@ sds genValkeyInfoStringCommandStats(sds info, dict *commands) { c->rejected_calls, c->failed_calls); if (tmpsafe != NULL) zfree(tmpsafe); } - if (c->subcommands_dict) { - info = genValkeyInfoStringCommandStats(info, c->subcommands_dict); + if (c->subcommands_ht) { + info = genValkeyInfoStringCommandStats(info, c->subcommands_ht); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); return info; } @@ -5374,24 +5397,23 @@ sds genValkeyInfoStringACLStats(sds info) { return info; } -sds genValkeyInfoStringLatencyStats(sds info, dict *commands) { - struct serverCommand *c; - dictEntry *de; - dictIterator *di; - di = dictGetSafeIterator(commands); - while ((de = dictNext(di)) != NULL) { +sds genValkeyInfoStringLatencyStats(sds info, hashtable *commands) { + hashtableIterator iter; + void *next; + hashtableInitSafeIterator(&iter, commands); + while (hashtableNext(&iter, &next)) { + struct serverCommand *c = next; char *tmpsafe; - c = (struct serverCommand *)dictGetVal(de); if (c->latency_histogram) { info = fillPercentileDistributionLatencies( info, getSafeInfoString(c->fullname, sdslen(c->fullname), &tmpsafe), c->latency_histogram); if (tmpsafe != NULL) zfree(tmpsafe); } - if (c->subcommands_dict) { - info = genValkeyInfoStringLatencyStats(info, c->subcommands_dict); + if (c->subcommands_ht) { + info = genValkeyInfoStringLatencyStats(info, 
c->subcommands_ht); } } - dictReleaseIterator(di); + hashtableResetIterator(&iter); return info; } diff --git a/src/server.h b/src/server.h index 44de6eada1..4a7d4777c8 100644 --- a/src/server.h +++ b/src/server.h @@ -67,7 +67,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "ae.h" /* Event driven programming library */ #include "sds.h" /* Dynamic safe strings */ -#include "dict.h" /* Hash tables */ +#include "dict.h" /* Hash tables (old implementation) */ +#include "hashtable.h" /* Hash tables (new implementation) */ #include "kvstore.h" /* Slot-based hash table */ #include "adlist.h" /* Linked lists */ #include "zmalloc.h" /* total memory usage aware version of malloc/free */ @@ -1693,8 +1694,8 @@ struct valkeyServer { int hz; /* serverCron() calls frequency in hertz */ int in_fork_child; /* indication that this is a fork child */ serverDb *db; - dict *commands; /* Command table */ - dict *orig_commands; /* Command table before command renaming. */ + hashtable *commands; /* Command table */ + hashtable *orig_commands; /* Command table before command renaming. */ aeEventLoop *el; _Atomic AeIoState io_poll_state; /* Indicates the state of the IO polling. */ int io_ae_fired_events; /* Number of poll events received by the IO thread. */ @@ -2577,12 +2578,12 @@ struct serverCommand { bit set in the bitmap of allowed commands. */ sds fullname; /* A SDS string representing the command fullname. */ struct hdr_histogram - *latency_histogram; /*points to the command latency command histogram (unit of time nanosecond) */ + *latency_histogram; /* Points to the command latency command histogram (unit of time nanosecond). */ keySpec legacy_range_key_spec; /* The legacy (first,last,step) key spec is * still maintained (if applicable) so that * we can still support the reply format of * COMMAND INFO and COMMAND GETKEYS */ - dict *subcommands_dict; /* A dictionary that holds the subcommands, the key is the subcommand sds name + hashtable *subcommands_ht; /* Subcommands hash table. The key is the subcommand sds name * (not the fullname), and the value is the serverCommand structure pointer. 
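 * The entry stored in the table is the serverCommand pointer itself; the lookup key is derived from it by hashtableSubcommandGetKey(), so no per-subcommand key allocation is kept.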
*/ struct serverCommand *parent; struct ValkeyModuleCommand *module_cmd; /* A pointer to the module command data (NULL if native command) */ @@ -3311,9 +3312,9 @@ connListener *listenerByType(const char *typename); int changeListener(connListener *listener); struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name); struct serverCommand *lookupCommand(robj **argv, int argc); -struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s); +struct serverCommand *lookupCommandBySdsLogic(hashtable *commands, sds s); struct serverCommand *lookupCommandBySds(sds s); -struct serverCommand *lookupCommandByCStringLogic(dict *commands, const char *s); +struct serverCommand *lookupCommandByCStringLogic(hashtable *commands, const char *s); struct serverCommand *lookupCommandByCString(const char *s); struct serverCommand *lookupCommandOrOriginal(robj **argv, int argc); int commandCheckExistence(client *c, sds *err); @@ -3347,7 +3348,7 @@ void serverLogRawFromHandler(int level, const char *msg); void usage(void); void updateDictResizePolicy(void); void populateCommandTable(void); -void resetCommandTableStats(dict *commands); +void resetCommandTableStats(hashtable *commands); void resetErrorTableStats(void); void adjustOpenFilesLimit(void); void incrementErrorCount(const char *fullerr, size_t namelen); @@ -4045,7 +4046,7 @@ int memtest_preserving_test(unsigned long *m, size_t bytes, int passes); void mixDigest(unsigned char *digest, const void *ptr, size_t len); void xorDigest(unsigned char *digest, const void *ptr, size_t len); sds catSubCommandFullname(const char *parent_name, const char *sub_name); -void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand, const char *declared_name); +void commandAddSubcommand(struct serverCommand *parent, struct serverCommand *subcommand); void debugDelay(int usec); void killThreads(void); void makeThreadKillable(void); From 3eb8314be6af0777e69f852b65f933dd9186d30b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Wed, 11 Sep 2024 16:24:26 +0200 Subject: [PATCH 35/73] Replace dict with hashtable for keys, expires and pubsub channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of a dictEntry with pointers to key and value, the hashtable has a pointer directly to the value (robj) which can hold an embedded key and acts as a key-value pair in the hashtable. This minimizes the number of pointers to follow and thus the number of memory accesses needed to look up a key-value pair. Keys robj hashtable +-------+ +-----------------------+ | 0 | | type, encoding, LRU | | 1 ------->| refcount, expire | | 2 | | ptr | | ... | | optional embedded key | +-------+ | optional embedded val | +-----------------------+ The expire timestamp (TTL) is also stored in the robj, if any. The expire hash table points to the same robj. Overview of changes: * Replace dict with hashtable in kvstore (kvstore.c) * Add functions for embedding key and expire in robj (object.c) * When there's unused space, reserve an expire field to avoid reallocating it later if expire is added. * Always reserve space for expire for large key names to avoid realloc if it's set later. * Update db functions (db.c) * dbAdd, setKey and setExpire reallocate the object when embedding a key * setKey does not increment the reference counter, since it would require duplicating the object. This responsibility is moved to the caller. * Remove logic for shared integer objects as values in the database. 
The keys are now embedded in the objects, so all objects in the database need to be unique. Thus, we can't use shared objects as values. Also delete test cases for shared integers. * Adjust various commands to the changes mentioned above. * Adjust defrag code * Improvement: Don't access the expires table before defrag has actually reallocated the object. * Adjust test cases that were using hard-coded sizes for dict when realloc would happen, and some other adjustments in test cases. * Adjust memory prefetch for new hash table implementation in IO-threading, using new `hashtableIncrementalFind` API * Adjust offloading of free() to IO threads: Object free to be done in main thread while keeping obj->ptr offloading in IO-thread since the DB object is now allocated by the main-thread and not by the IO-thread as it used to be. * Let expireIfNeeded take an optional value, to avoid looking up the expires table when possible. --------- Signed-off-by: Uri Yagelnik Signed-off-by: uriyage <78144248+uriyage@users.noreply.github.com> Signed-off-by: Viktor Söderqvist Co-authored-by: Uri Yagelnik --- src/aof.c | 12 +- src/bitops.c | 5 +- src/cluster.c | 20 +- src/cluster_legacy.c | 15 +- src/db.c | 514 ++++++++++++-------- src/debug.c | 30 +- src/defrag.c | 192 +++----- src/evict.c | 44 +- src/expire.c | 40 +- src/geo.c | 3 +- src/hyperloglog.c | 4 +- src/io_threads.c | 9 +- src/kvstore.c | 802 ++++++++++++++++--------------- src/kvstore.h | 115 ++--- src/lazyfree.c | 8 +- src/memory_prefetch.c | 210 ++------ src/module.c | 22 +- src/object.c | 310 +++++++++--- src/pubsub.c | 73 +-- src/rdb.c | 26 +- src/replication.c | 2 +- src/sds.c | 11 +- src/sds.h | 1 + src/server.c | 166 ++++--- src/server.h | 54 ++- src/sort.c | 7 +- src/t_hash.c | 2 +- src/t_list.c | 4 +- src/t_set.c | 14 +- src/t_stream.c | 4 +- src/t_string.c | 40 +- src/t_zset.c | 10 +- src/unit/test_files.h | 13 +- src/unit/test_kvstore.c | 133 +++-- src/unit/test_object.c | 50 ++ tests/integration/valkey-cli.tcl | 4 +- tests/support/util.tcl | 24 + tests/unit/expire.tcl | 6 +- tests/unit/info.tcl | 32 +- tests/unit/maxmemory.tcl | 67 +-- tests/unit/other.tcl | 30 +- tests/unit/type/incr.tcl | 11 - 42 files changed, 1704 insertions(+), 1435 deletions(-) create mode 100644 src/unit/test_object.c diff --git a/src/aof.c b/src/aof.c index e0ca6fbb61..0fd3cf5c26 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2190,7 +2190,6 @@ static int rewriteFunctions(rio *aof) { } int rewriteAppendOnlyFileRio(rio *aof) { - dictEntry *de; int j; long key_count = 0; long long updated_time = 0; @@ -2219,17 +2218,18 @@ int rewriteAppendOnlyFileRio(rio *aof) { kvs_it = kvstoreIteratorInit(db->keys); /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; sds keystr; - robj key, *o; + robj key; long long expiretime; size_t aof_bytes_before_key = aof->processed_bytes; - keystr = dictGetKey(de); - o = dictGetVal(de); + keystr = objectGetKey(o); initStaticStringObject(key, keystr); - expiretime = getExpire(db, &key); + expiretime = objectGetExpire(o); /* Save the key and associated value */ if (o->type == OBJ_STRING) { diff --git a/src/bitops.c b/src/bitops.c index 10c383b270..1457cd5322 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -486,7 +486,7 @@ robj *lookupStringForBitCommand(client *c, uint64_t maxbit, int *dirty) { if (o == NULL) { o = createObject(OBJ_STRING, sdsnewlen(NULL, byte + 1)); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], 
&o); if (dirty) *dirty = 1; } else { o = dbUnshareStringValue(c->db, c->argv[1], o); @@ -772,9 +772,8 @@ void bitopCommand(client *c) { /* Store the computed value into the target key */ if (maxlen) { o = createObject(OBJ_STRING, res); - setKey(c, c->db, targetkey, o, 0); + setKey(c, c->db, targetkey, &o, 0); notifyKeyspaceEvent(NOTIFY_STRING, "set", targetkey, c->db->id); - decrRefCount(o); server.dirty++; } else if (dbDelete(c->db, targetkey)) { signalModifiedKey(c, c->db, targetkey); diff --git a/src/cluster.c b/src/cluster.c index 9154ac3207..df6bb86454 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -276,9 +276,9 @@ void restoreCommand(client *c) { } /* Create the key and set the TTL if any */ - dbAdd(c->db, key, obj); + dbAdd(c->db, key, &obj); if (ttl) { - setExpire(c, c->db, key, ttl); + obj = setExpire(c, c->db, key, ttl); if (!absttl) { /* Propagate TTL as absolute timestamp */ robj *ttl_obj = createStringObjectFromLongLong(ttl); @@ -811,7 +811,7 @@ static int shouldReturnTlsInfo(void) { } unsigned int countKeysInSlot(unsigned int slot) { - return kvstoreDictSize(server.db->keys, slot); + return kvstoreHashtableSize(server.db->keys, slot); } void clusterCommandHelp(client *c) { @@ -908,16 +908,16 @@ void clusterCommand(client *c) { unsigned int keys_in_slot = countKeysInSlot(slot); unsigned int numkeys = maxkeys > keys_in_slot ? keys_in_slot : maxkeys; addReplyArrayLen(c, numkeys); - kvstoreDictIterator *kvs_di = NULL; - dictEntry *de = NULL; - kvs_di = kvstoreGetDictIterator(server.db->keys, slot); + kvstoreHashtableIterator *kvs_di = NULL; + kvs_di = kvstoreGetHashtableIterator(server.db->keys, slot); for (unsigned int i = 0; i < numkeys; i++) { - de = kvstoreDictIteratorNext(kvs_di); - serverAssert(de != NULL); - sds sdskey = dictGetKey(de); + void *next; + serverAssert(kvstoreHashtableIteratorNext(kvs_di, &next)); + robj *valkey = next; + sds sdskey = objectGetKey(valkey); addReplyBulkCBuffer(c, sdskey, sdslen(sdskey)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } else if ((!strcasecmp(c->argv[1]->ptr, "slaves") || !strcasecmp(c->argv[1]->ptr, "replicas")) && c->argc == 3) { /* CLUSTER REPLICAS */ clusterNode *n = clusterLookupNode(c->argv[2]->ptr, sdslen(c->argv[2]->ptr)); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 50a8ffca38..a273fe0d86 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6159,12 +6159,13 @@ unsigned int delKeysInSlot(unsigned int hashslot) { server.server_del_keys_in_slot = 1; unsigned int j = 0; - kvstoreDictIterator *kvs_di = NULL; - dictEntry *de = NULL; - kvs_di = kvstoreGetDictSafeIterator(server.db->keys, hashslot); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { + kvstoreHashtableIterator *kvs_di = NULL; + void *next; + kvs_di = kvstoreGetHashtableSafeIterator(server.db->keys, hashslot); + while (kvstoreHashtableIteratorNext(kvs_di, &next)) { + robj *valkey = next; enterExecutionUnit(1, 0); - sds sdskey = dictGetKey(de); + sds sdskey = objectGetKey(valkey); robj *key = createStringObject(sdskey, sdslen(sdskey)); dbDelete(&server.db[0], key); propagateDeletion(&server.db[0], key, server.lazyfree_lazy_server_del); @@ -6179,7 +6180,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { j++; server.dirty++; } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); server.server_del_keys_in_slot = 0; serverAssert(server.execution_nesting == 0); @@ -6188,7 +6189,7 @@ unsigned int delKeysInSlot(unsigned int hashslot) { /* Get the count of the 
channels for a given slot. */ unsigned int countChannelsInSlot(unsigned int hashslot) { - return kvstoreDictSize(server.pubsubshard_channels, hashslot); + return kvstoreHashtableSize(server.pubsubshard_channels, hashslot); } clusterNode *getMyClusterNode(void) { diff --git a/src/db.c b/src/db.c index 3c3ccb4899..2bd40ba74b 100644 --- a/src/db.c +++ b/src/db.c @@ -52,14 +52,14 @@ typedef enum { KEY_DELETED /* The key was deleted now. */ } keyStatus; -keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index); -keyStatus expireIfNeeded(serverDb *db, robj *key, int flags); -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index); -int keyIsExpired(serverDb *db, robj *key); -static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de); +static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, int flags, int dict_index); +static keyStatus expireIfNeeded(serverDb *db, robj *key, robj *val, int flags); +static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index); +static int objectIsExpired(robj *val); +static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, void **oldref); static int getKVStoreIndexForKey(sds key); -dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index); -dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index); +static robj *dbFindWithDictIndex(serverDb *db, sds key, int dict_index); +static robj *dbFindExpiresWithDictIndex(serverDb *db, sds key, int dict_index); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -99,10 +99,8 @@ void updateLFU(robj *val) { * in the replication link. */ robj *lookupKey(serverDb *db, robj *key, int flags) { int dict_index = getKVStoreIndexForKey(key->ptr); - dictEntry *de = dbFindWithDictIndex(db, key->ptr, dict_index); - robj *val = NULL; - if (de) { - val = dictGetVal(de); + robj *val = dbFindWithDictIndex(db, key->ptr, dict_index); + if (val) { /* Forcing deletion of expired keys on a replica makes the replica * inconsistent with the primary. We forbid it on readonly replicas, but * we have to allow it on writable replicas to make write commands @@ -115,7 +113,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; - if (expireIfNeededWithDictIndex(db, key, expire_flags, dict_index) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, key, val, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. */ val = NULL; } @@ -129,10 +127,8 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { server.current_client->cmd->proc != touchCommand) flags |= LOOKUP_NOTOUCH; if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { - if (!canUseSharedObject() && val->refcount == OBJ_SHARED_REFCOUNT) { - val = dupStringObject(val); - kvstoreDictSetVal(db->keys, dict_index, de, val); - } + /* Shared objects can't be stored in the database. */ + serverAssert(val->refcount != OBJ_SHARED_REFCOUNT); if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { updateLFU(val); } else { @@ -197,32 +193,47 @@ robj *lookupKeyWriteOrReply(client *c, robj *key, robj *reply) { return o; } -/* Add the key to the DB. +/* Add a key-value entry to the DB. + * + * A copy of 'key' is stored in the database. 
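 * (The key copy is embedded directly into the value robj by
 * objectSetKeyAndExpire(); see dbAddInternal() below.)
 *
 * A minimal caller sketch (hypothetical; 'db' is any serverDb *):
 *
 *   robj *k = createStringObject("foo", 3);
 *   robj *v = createStringObject("bar", 3);
 *   dbAdd(db, k, &v);    // v may be reallocated here
 *   decrRefCount(k);     // the db stored its own copy of the key
 *   // Do not decrRefCount(v): the database now owns that reference.
 *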
The caller must ensure the + * `key` is properly freed by calling decrRefcount(key). * - * In this case a copy of `key` is copied in kvstore, the caller must ensure the `key` is properly freed. + * The value may (if its reference counter == 1) be reallocated and become + * invalid after a call to this function. The (possibly reallocated) value is + * stored in the database and the 'valref' pointer is updated to point to the + * new allocation. * - * It's up to the caller to increment the reference - * counter of the value if needed. + * The reference counter of the value pointed to by valref is not incremented, + * so the caller should not free the value using decrRefcount after calling this + * function. * * If the update_if_existing argument is false, the program is aborted * if the key already exists, otherwise, it can fall back to dbOverwrite. */ -static void dbAddInternal(serverDb *db, robj *key, robj *val, int update_if_existing) { - dictEntry *existing; +static void dbAddInternal(serverDb *db, robj *key, robj **valref, int update_if_existing) { int dict_index = getKVStoreIndexForKey(key->ptr); - dictEntry *de = kvstoreDictAddRaw(db->keys, dict_index, key->ptr, &existing); - if (update_if_existing && existing) { - dbSetValue(db, key, val, 1, existing); - return; + void **oldref = NULL; + if (update_if_existing) { + oldref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr); + if (oldref != NULL) { + dbSetValue(db, key, valref, 1, oldref); + return; + } + } else { + debugServerAssertWithInfo(NULL, key, kvstoreHashtableFindRef(db->keys, dict_index, key->ptr) == NULL); } - serverAssertWithInfo(NULL, key, de != NULL); + + /* Not existing. Convert val to valkey object and insert. */ + robj *val = *valref; + val = objectSetKeyAndExpire(val, key->ptr, -1); initObjectLRUOrLFU(val); - kvstoreDictSetVal(db->keys, dict_index, de, val); + kvstoreHashtableAdd(db->keys, dict_index, val); signalKeyAsReady(db, key, val->type); notifyKeyspaceEvent(NOTIFY_NEW, "new", key, db->id); + *valref = val; } -void dbAdd(serverDb *db, robj *key, robj *val) { - dbAddInternal(db, key, val, 0); +void dbAdd(serverDb *db, robj *key, robj **valref) { + dbAddInternal(db, key, valref, 0); } /* Returns which dict index should be used with kvstore for a given key. */ @@ -270,20 +281,32 @@ int getKeySlot(sds key) { * since it is not useful in this context. * * The function returns 1 if the key was added to the database, otherwise 0 is returned. - * - * In this case a copy of `key` is copied in kvstore, the caller must ensure the `key` is properly freed. */ -int dbAddRDBLoad(serverDb *db, sds key, robj *val) { +int dbAddRDBLoad(serverDb *db, sds key, robj **valref) { int dict_index = getKVStoreIndexForKey(key); - dictEntry *de = kvstoreDictAddRaw(db->keys, dict_index, key, NULL); - if (de == NULL) return 0; + hashtablePosition pos; + if (!kvstoreHashtableFindPositionForInsert(db->keys, dict_index, key, &pos, NULL)) { + return 0; + } + robj *val = *valref; + val = objectSetKeyAndExpire(val, key, -1); + kvstoreHashtableInsertAtPosition(db->keys, dict_index, val, &pos); initObjectLRUOrLFU(val); - kvstoreDictSetVal(db->keys, dict_index, de, val); + *valref = val; return 1; } -/* Overwrite an existing key with a new value. Incrementing the reference - * count of the new value is up to the caller. +/* Overwrite an existing key with a new value. + * + * The value may (if its reference counter == 1) be reallocated and become + * invalid after a call to this function. 
The (possibly reallocated) value is + * stored in the database and the 'valref' pointer is updated to point to the + * new allocation. + * + * The reference counter of the value pointed to by valref is not incremented, + * so the caller should not free the value using decrRefCount after calling this + * function. + * * This function does not modify the expire time of the existing key. * * The 'overwrite' flag is an indication whether this is done as part of a @@ -291,19 +314,23 @@ int dbAddRDBLoad(serverDb *db, sds key, robj *val) { * replacement (in which case we need to emit deletion signals), or just an * update of a value of an existing key (when false). * - * The dictEntry input is optional, can be used if we already have one. + * The 'oldref' argument is optional. If provided, it is a pointer to the + * location within the hash table where the old value is stored and the new + * value should be stored. * * The program is aborted if the key was not already present. */ -static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de) { - int dict_index = getKVStoreIndexForKey(key->ptr); - if (!de) de = kvstoreDictFind(db->keys, dict_index, key->ptr); - serverAssertWithInfo(NULL, key, de != NULL); - robj *old = dictGetVal(de); - - val->lru = old->lru; +static void dbSetValue(serverDb *db, robj *key, robj **valref, int overwrite, void **oldref) { + robj *val = *valref; + if (oldref == NULL) { + int dict_index = getKVStoreIndexForKey(key->ptr); + oldref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr); + } + serverAssertWithInfo(NULL, key, oldref != NULL); + robj *old = *oldref; + robj *new; if (overwrite) { - /* RM_StringDMA may call dbUnshareStringValue which may free val, so we + /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain old */ incrRefCount(old); /* Although the key is not really deleted from the database, we regard @@ -313,10 +340,40 @@ static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEn /* We want to try to unblock any module clients or clients using a blocking XREADGROUP */ signalDeletedKeyAsReady(db, key, old->type); decrRefCount(old); - /* Because of RM_StringDMA, old may be changed, so we need get old again */ - old = dictGetVal(de); + /* Because of VM_StringDMA, old may be changed, so we need to get old again */ + old = *oldref; + } + + if ((old->refcount == 1 && old->encoding != OBJ_ENCODING_EMBSTR) && + (val->refcount == 1 && val->encoding != OBJ_ENCODING_EMBSTR)) { + /* Keep old object in the database. Just swap its ptr, type and + * encoding with the content of val. */ + int tmp_type = old->type; + int tmp_encoding = old->encoding; + void *tmp_ptr = old->ptr; + old->type = val->type; + old->encoding = val->encoding; + old->ptr = val->ptr; + val->type = tmp_type; + val->encoding = tmp_encoding; + val->ptr = tmp_ptr; + /* Set new to old to keep the old object. Set old to val to be freed below. */ + new = old; + old = val; + } else { + /* Replace the old value at its location in the key space. */ + val->lru = old->lru; + long long expire = objectGetExpire(old); + new = objectSetKeyAndExpire(val, key->ptr, expire); + *oldref = new; + /* Replace the old value at its location in the expire space. 
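 * Both tables point to the same robj, so after a reallocation the entry in the expires table must be repointed as well.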
*/ + if (expire >= 0) { + int dict_index = getKVStoreIndexForKey(key->ptr); + void **expireref = kvstoreHashtableFindRef(db->expires, dict_index, key->ptr); + serverAssert(expireref != NULL); + *expireref = new; + } } - kvstoreDictSetVal(db->keys, dict_index, de, val); /* For efficiency, let the I/O thread that allocated an object also deallocate it. */ if (tryOffloadFreeObjToIOThreads(old) == C_OK) { /* OK */ @@ -325,18 +382,21 @@ static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEn } else { decrRefCount(old); } + *valref = new; } /* Replace an existing key with a new value, we just replace value and don't * emit any events */ -void dbReplaceValue(serverDb *db, robj *key, robj *val) { - dbSetValue(db, key, val, 0, NULL); +void dbReplaceValue(serverDb *db, robj *key, robj **valref) { + dbSetValue(db, key, valref, 0, NULL); } /* High level Set operation. This function can be used in order to set * a key, whatever it was existing or not, to a new object. * - * 1) The ref count of the value object is incremented. + * 1) The value may be reallocated when adding it to the database. The value + * pointer 'valref' is updated to point to the reallocated object. The + * reference count of the value object is *not* incremented. * 2) clients WATCHing for the destination key notified. * 3) The expire time of the key is reset (the key is made persistent), * unless 'SETKEY_KEEPTTL' is enabled in flags. @@ -346,7 +406,7 @@ void dbReplaceValue(serverDb *db, robj *key, robj *val) { * All the new keys in the database should be created via this interface. * The client 'c' argument may be set to NULL if the operation is performed * in a context where there is no clear client performing the operation. */ -void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { +void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags) { int keyfound = 0; if (flags & SETKEY_ALREADY_EXIST) @@ -357,13 +417,12 @@ void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { keyfound = (lookupKeyWrite(db, key) != NULL); if (!keyfound) { - dbAdd(db, key, val); + dbAdd(db, key, valref); } else if (keyfound < 0) { - dbAddInternal(db, key, val, 1); + dbAddInternal(db, key, valref, 1); } else { - dbSetValue(db, key, val, 1, NULL); + dbSetValue(db, key, valref, 1, NULL); } - incrRefCount(val); if (!(flags & SETKEY_KEEPTTL)) removeExpire(db, key); if (!(flags & SETKEY_NO_SIGNAL)) signalModifiedKey(c, db, key); } @@ -373,20 +432,18 @@ void setKey(client *c, serverDb *db, robj *key, robj *val, int flags) { * * The function makes sure to return keys not already expired. 
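 * (Expiry is checked directly on the sampled robj with objectIsExpired(), so no separate lookup in the expires table is needed.)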
*/ robj *dbRandomKey(serverDb *db) { - dictEntry *de; int maxtries = 100; int allvolatile = kvstoreSize(db->keys) == kvstoreSize(db->expires); while (1) { - sds key; - robj *keyobj; - int randomDictIndex = kvstoreGetFairRandomDictIndex(db->keys); - de = kvstoreDictGetFairRandomKey(db->keys, randomDictIndex); - if (de == NULL) return NULL; - - key = dictGetKey(de); - keyobj = createStringObject(key, sdslen(key)); - if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) { + void *entry; + int randomDictIndex = kvstoreGetFairRandomHashtableIndex(db->keys); + int ok = kvstoreHashtableFairRandomEntry(db->keys, randomDictIndex, &entry); + if (!ok) return NULL; + robj *valkey = entry; + sds key = objectGetKey(valkey); + robj *keyobj = createStringObject(key, sdslen(key)); + if (objectIsExpired(valkey)) { if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically @@ -398,7 +455,7 @@ robj *dbRandomKey(serverDb *db) { * return a key name that may be already expired. */ return keyobj; } - if (expireIfNeededWithDictIndex(db, keyobj, 0, randomDictIndex) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, keyobj, valkey, 0, randomDictIndex) != KEY_VALID) { decrRefCount(keyobj); continue; /* search for another key. This expired. */ } @@ -408,31 +465,38 @@ robj *dbRandomKey(serverDb *db) { } int dbGenericDeleteWithDictIndex(serverDb *db, robj *key, int async, int flags, int dict_index) { - dictEntry **plink; - int table; - dictEntry *de = kvstoreDictTwoPhaseUnlinkFind(db->keys, dict_index, key->ptr, &plink, &table); - if (de) { - robj *val = dictGetVal(de); - /* RM_StringDMA may call dbUnshareStringValue which may free val, so we + hashtablePosition pos; + void **ref = kvstoreHashtableTwoPhasePopFindRef(db->keys, dict_index, key->ptr, &pos); + if (ref != NULL) { + robj *val = *ref; + /* VM_StringDMA may call dbUnshareStringValue which may free val, so we * need to incr to retain val */ incrRefCount(val); /* Tells the module that the key has been unlinked from the database. */ moduleNotifyKeyUnlink(key, val, db->id, flags); /* We want to try to unblock any module clients or clients using a blocking XREADGROUP */ signalDeletedKeyAsReady(db, key, val->type); - /* We should call decr before freeObjAsync. If not, the refcount may be - * greater than 1, so freeObjAsync doesn't work */ + /* Match the incrRefCount above. */ decrRefCount(val); + /* Because of dbUnshareStringValue, the val in de may change. */ + val = *ref; + + /* Delete from keys and expires tables. This will not free the object. + * (The expires table has no destructor callback.) */ + kvstoreHashtableTwoPhasePopDelete(db->keys, dict_index, &pos); + if (objectGetExpire(val) != -1) { + int deleted = kvstoreHashtableDelete(db->expires, dict_index, key->ptr); + serverAssert(deleted); + } else { + debugServerAssert(0 == kvstoreHashtableDelete(db->expires, dict_index, key->ptr)); + } + if (async) { - /* Because of dbUnshareStringValue, the val in de may change. */ - freeObjAsync(key, dictGetVal(de), db->id); - kvstoreDictSetVal(db->keys, dict_index, de, NULL); + freeObjAsync(key, val, db->id); + } else { + decrRefCount(val); } - /* Deleting an entry from the expires dict will not free the sds of - * the key, because it is shared with the main dictionary. 
*/ - kvstoreDictDelete(db->expires, dict_index, key->ptr); - kvstoreDictTwoPhaseUnlinkFree(db->keys, dict_index, de, plink, table); return 1; } else { return 0; @@ -495,7 +559,7 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o) { robj *decoded = getDecodedObject(o); o = createRawStringObject(decoded->ptr, sdslen(decoded->ptr)); decrRefCount(decoded); - dbReplaceValue(db, key, o); + dbReplaceValue(db, key, &o); } return o; } @@ -506,7 +570,7 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o) { * The dbnum can be -1 if all the DBs should be emptied, or the specified * DB index if we want to empty only a single database. * The function returns the number of keys removed from the database(s). */ -long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(dict *)) { +long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(hashtable *)) { long long removed = 0; int startdb, enddb; @@ -548,7 +612,7 @@ long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callbac * On success the function returns the number of keys removed from the * database(s). Otherwise -1 is returned in the specific case the * DB number is out of range, and errno is set to EINVAL. */ -long long emptyData(int dbnum, int flags, void(callback)(dict *)) { +long long emptyData(int dbnum, int flags, void(callback)(hashtable *)) { int async = (flags & EMPTYDB_ASYNC); int with_functions = !(flags & EMPTYDB_NOFUNCTIONS); ValkeyModuleFlushInfoV1 fi = {VALKEYMODULE_FLUSHINFO_VERSION, !async, dbnum}; @@ -574,7 +638,8 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { if (with_functions) { serverAssert(dbnum == -1); - functionsLibCtxClearCurrent(async, callback); + /* TODO: fix this callback incompatibility. The arg is not used. */ + functionsLibCtxClearCurrent(async, (void (*)(dict *))callback); } /* Also fire the end event. Note that this event will fire almost @@ -587,16 +652,16 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { /* Initialize temporary db on replica for use during diskless replication. */ serverDb *initTempDb(void) { int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } serverDb *tempDb = zcalloc(sizeof(serverDb) * server.dbnum); for (int i = 0; i < server.dbnum; i++) { tempDb[i].id = i; - tempDb[i].keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - tempDb[i].expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + tempDb[i].keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + tempDb[i].expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); } return tempDb; @@ -757,7 +822,7 @@ void delGenericCommand(client *c, int lazy) { int numdel = 0, j; for (j = 1; j < c->argc; j++) { - if (expireIfNeeded(c->db, c->argv[j], 0) == KEY_DELETED) continue; + if (expireIfNeeded(c->db, c->argv[j], NULL, 0) == KEY_DELETED) continue; int deleted = lazy ? 
dbAsyncDelete(c->db, c->argv[j]) : dbSyncDelete(c->db, c->argv[j]); if (deleted) { signalModifiedKey(c, c->db, c->argv[j]); @@ -818,7 +883,6 @@ void randomkeyCommand(client *c) { } void keysCommand(client *c) { - dictEntry *de; sds pattern = c->argv[1]->ptr; int plen = sdslen(pattern), allkeys, pslot = -1; unsigned long numkeys = 0; @@ -827,37 +891,26 @@ void keysCommand(client *c) { if (server.cluster_enabled && !allkeys) { pslot = patternHashSlot(pattern, plen); } - kvstoreDictIterator *kvs_di = NULL; + kvstoreHashtableIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - kvs_di = kvstoreGetDictSafeIterator(c->db->keys, pslot); + kvs_di = kvstoreGetHashtableSafeIterator(c->db->keys, pslot); } else { kvs_it = kvstoreIteratorInit(c->db->keys); } - while (1) { - robj keyobj; - int dict_index; - if (kvs_di) { - de = kvstoreDictIteratorNext(kvs_di); - dict_index = pslot; - } else { - de = kvstoreIteratorNext(kvs_it); - dict_index = kvstoreIteratorGetCurrentDictIndex(kvs_it); - } - if (de == NULL) break; - - sds key = dictGetKey(de); - + void *next; + while (kvs_di ? kvstoreHashtableIteratorNext(kvs_di, &next) : kvstoreIteratorNext(kvs_it, &next)) { + robj *val = next; + sds key = objectGetKey(val); if (allkeys || stringmatchlen(pattern, plen, key, sdslen(key), 0)) { - initStaticStringObject(keyobj, key); - if (!keyIsExpiredWithDictIndex(c->db, &keyobj, dict_index)) { + if (!objectIsExpired(val)) { addReplyBulkCBuffer(c, key, sdslen(key)); numkeys++; } } if (c->flag.close_asap) break; } - if (kvs_di) kvstoreReleaseDictIterator(kvs_di); + if (kvs_di) kvstoreReleaseHashtableIterator(kvs_di); if (kvs_it) kvstoreIteratorRelease(kvs_it); setDeferredArrayLen(c, replylen, numkeys); } @@ -866,6 +919,7 @@ void keysCommand(client *c) { typedef struct { list *keys; /* elements that collect from dict */ robj *o; /* o must be a hash/set/zset object, NULL means current db */ + serverDb *db; /* database currently being scanned */ long long type; /* the particular type when scan the db */ sds pattern; /* pattern string, NULL means no pattern */ long sampled; /* cumulative number of keys sampled */ @@ -887,6 +941,41 @@ int objectTypeCompare(robj *o, long long target) { else return 1; } + +/* Hashtable scan callback used by scanCallback when scanning the keyspace. */ +void keysScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *obj = entry; + data->sampled++; + + /* Filter an object if it isn't the type we want. */ + if (data->type != LLONG_MAX) { + if (!objectTypeCompare(obj, data->type)) return; + } + + sds key = objectGetKey(obj); + + /* Filter object if its key does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + /* Handle and skip expired key. */ + if (objectIsExpired(obj)) { + robj kobj; + initStaticStringObject(kobj, key); + if (expireIfNeeded(data->db, &kobj, obj, 0) != KEY_VALID) { + return; + } + } + + /* Keep this key. */ + list *keys = data->keys; + listAddNodeTail(keys, key); +} + /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ void scanCallback(void *privdata, const dictEntry *de) { @@ -897,14 +986,9 @@ void scanCallback(void *privdata, const dictEntry *de) { sds key = NULL; data->sampled++; - /* o and typename can not have values at the same time. 
*/ - serverAssert(!((data->type != LLONG_MAX) && o)); - - /* Filter an element if it isn't the type we want. */ - if (!o && data->type != LLONG_MAX) { - robj *rval = dictGetVal(de); - if (!objectTypeCompare(rval, data->type)) return; - } + /* This callback is only used for scanning elements within a key (hash + * fields, set elements, etc.) so o must be set here. */ + serverAssert(o != NULL); /* Filter element if it does not match the pattern. */ sds keysds = dictGetKey(de); @@ -914,9 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } } - if (o == NULL) { - key = keysds; - } else if (o->type == OBJ_SET) { + if (o->type == OBJ_SET) { key = keysds; } else if (o->type == OBJ_HASH) { key = keysds; @@ -1119,6 +1201,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * only keys are returned. */ scanData data = { .keys = keys, + .db = c->db, .o = o, .type = type, .pattern = use_pattern ? pat : NULL, @@ -1135,7 +1218,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* In cluster mode there is a separate dictionary for each slot. * If cursor is empty, we should try exploring next non-empty slot. */ if (o == NULL) { - cursor = kvstoreScan(c->db->keys, cursor, onlydidx, scanCallback, NULL, &data); + cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); } else { cursor = dictScan(ht, cursor, scanCallback, &data); } @@ -1187,22 +1270,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { serverPanic("Not handled encoding in SCAN."); } - /* Step 3: Filter the expired keys */ - if (o == NULL && listLength(keys)) { - robj kobj; - listIter li; - listNode *ln; - listRewind(keys, &li); - while ((ln = listNext(&li))) { - sds key = listNodeValue(ln); - initStaticStringObject(kobj, key); - if (expireIfNeeded(c->db, &kobj, 0) != KEY_VALID) { - listDelNode(keys, ln); - } - } - } - - /* Step 4: Reply to the client. */ + /* Step 3: Reply to the client. */ addReplyArrayLen(c, 2); addReplyBulkLongLong(c, cursor); @@ -1325,9 +1393,9 @@ void renameGenericCommand(client *c, int nx) { * with the same name. */ dbDelete(c->db, c->argv[2]); } - dbAdd(c->db, c->argv[2], o); - if (expire != -1) setExpire(c, c->db, c->argv[2], expire); dbDelete(c->db, c->argv[1]); + dbAdd(c->db, c->argv[2], &o); + if (expire != -1) o = setExpire(c, c->db, c->argv[2], expire); signalModifiedKey(c, c->db, c->argv[1]); signalModifiedKey(c, c->db, c->argv[2]); notifyKeyspaceEvent(NOTIFY_GENERIC, "rename_from", c->argv[1], c->db->id); @@ -1388,12 +1456,14 @@ void moveCommand(client *c) { addReply(c, shared.czero); return; } - dbAdd(dst, c->argv[1], o); - if (expire != -1) setExpire(c, dst, c->argv[1], expire); - incrRefCount(o); - /* OK! key moved, free the entry in the source DB */ - dbDelete(src, c->argv[1]); + incrRefCount(o); /* ref counter = 2 */ + dbDelete(src, c->argv[1]); /* ref counter = 1 */ + + dbAdd(dst, c->argv[1], &o); + if (expire != -1) o = setExpire(c, dst, c->argv[1], expire); + + /* OK! key moved */ signalModifiedKey(c, src, c->argv[1]); signalModifiedKey(c, dst, c->argv[1]); notifyKeyspaceEvent(NOTIFY_GENERIC, "move_from", c->argv[1], src->id); @@ -1491,8 +1561,8 @@ void copyCommand(client *c) { dbDelete(dst, newkey); } - dbAdd(dst, newkey, newobj); - if (expire != -1) setExpire(c, dst, newkey, expire); + dbAdd(dst, newkey, &newobj); + if (expire != -1) newobj = setExpire(c, dst, newkey, expire); /* OK! 
key copied */
     signalModifiedKey(c, dst, c->argv[2]);
@@ -1511,9 +1581,8 @@ void scanDatabaseForReadyKeys(serverDb *db) {
     dictIterator *di = dictGetSafeIterator(db->blocking_keys);
     while ((de = dictNext(di)) != NULL) {
         robj *key = dictGetKey(de);
-        dictEntry *kde = dbFind(db, key->ptr);
-        if (kde) {
-            robj *value = dictGetVal(kde);
+        robj *value = dbFind(db, key->ptr);
+        if (value) {
             signalKeyAsReady(db, key, value->type);
         }
     }
@@ -1531,17 +1600,15 @@ void scanDatabaseForDeletedKeys(serverDb *emptied, serverDb *replaced_with) {
         int existed = 0, exists = 0;
         int original_type = -1, curr_type = -1;
 
-        dictEntry *kde = dbFind(emptied, key->ptr);
-        if (kde) {
-            robj *value = dictGetVal(kde);
+        robj *value = dbFind(emptied, key->ptr);
+        if (value) {
             original_type = value->type;
             existed = 1;
         }
 
         if (replaced_with) {
-            kde = dbFind(replaced_with, key->ptr);
-            if (kde) {
-                robj *value = dictGetVal(kde);
+            value = dbFind(replaced_with, key->ptr);
+            if (value) {
                 curr_type = value->type;
                 exists = 1;
             }
@@ -1678,39 +1745,63 @@ void swapdbCommand(client *c) {
 *----------------------------------------------------------------------------*/
 
 int removeExpire(serverDb *db, robj *key) {
-    return kvstoreDictDelete(db->expires, getKVStoreIndexForKey(key->ptr), key->ptr) == DICT_OK;
+    int dict_index = getKVStoreIndexForKey(key->ptr);
+    void *popped;
+    if (kvstoreHashtablePop(db->expires, dict_index, key->ptr, &popped)) {
+        robj *val = popped;
+        robj *newval = objectSetExpire(val, -1);
+        serverAssert(newval == val);
+        debugServerAssert(getExpire(db, key) == -1);
+        return 1;
+    }
+    return 0;
 }
 
 /* Set an expire to the specified key. If the expire is set in the context
  * of a user calling a command 'c' is the client, otherwise 'c' is set
  * to NULL. The 'when' parameter is the absolute unix time in milliseconds
  * after which the key will no longer be considered valid. */
-void setExpire(client *c, serverDb *db, robj *key, long long when) {
-    dictEntry *kde, *de, *existing;
+robj *setExpire(client *c, serverDb *db, robj *key, long long when) {
+    /* TODO: Add val as a parameter to this function, to avoid looking it up. */
+    robj *val;
 
-    /* Reuse the sds from the main dict in the expire dict */
+    /* Reuse the object from the main dict in the expire dict. When setting
+     * expire in an robj, it's potentially reallocated. We need to update the
+     * pointer(s) to it. */
     int dict_index = getKVStoreIndexForKey(key->ptr);
-    kde = kvstoreDictFind(db->keys, dict_index, key->ptr);
-    serverAssertWithInfo(NULL, key, kde != NULL);
-    de = kvstoreDictAddRaw(db->expires, dict_index, dictGetKey(kde), &existing);
-    if (existing) {
-        dictSetSignedIntegerVal(existing, when);
+    void **valref = kvstoreHashtableFindRef(db->keys, dict_index, key->ptr);
+    serverAssertWithInfo(NULL, key, valref != NULL);
+    val = *valref;
+    long long old_when = objectGetExpire(val);
+    robj *newval = objectSetExpire(val, when);
+    if (old_when != -1) {
+        /* Val already had an expire field, so it was not reallocated. */
+        serverAssert(newval == val);
+        /* It already exists in the set of keys with expire. */
+        debugServerAssert(!kvstoreHashtableAdd(db->expires, dict_index, newval));
     } else {
-        dictSetSignedIntegerVal(de, when);
+        /* No old expire. Update the pointer in the keys hashtable, if needed,
+         * and add it to the expires hashtable.
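+         * Writing through 'valref' (obtained from kvstoreHashtableFindRef
+         * above) updates the keys table in place, so the key does not have
+         * to be re-inserted into db->keys.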
*/ + if (newval != val) { + val = *valref = newval; + } + int added = kvstoreHashtableAdd(db->expires, dict_index, newval); + serverAssert(added); } int writable_replica = server.primary_host && server.repl_replica_ro == 0; if (c && writable_replica && !c->flag.primary) rememberReplicaKeyWithExpire(db, key); + return val; } /* Return the expire time of the specified key, or -1 if no expire * is associated with this key (i.e. the key is non volatile) */ long long getExpireWithDictIndex(serverDb *db, robj *key, int dict_index) { - dictEntry *de; + robj *val; - if ((de = dbFindExpiresWithDictIndex(db, key->ptr, dict_index)) == NULL) return -1; + if ((val = dbFindExpiresWithDictIndex(db, key->ptr, dict_index)) == NULL) return -1; - return dictGetSignedIntegerVal(de); + return objectGetExpire(val); } /* Return the expire time of the specified key, or -1 if no expire @@ -1789,24 +1880,36 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) { - /* Don't expire anything while loading. It will be done later. */ - if (server.loading) return 0; - - mstime_t when = getExpireWithDictIndex(db, key, dict_index); - mstime_t now; - - if (when < 0) return 0; /* No expire for this key */ - - now = commandTimeSnapshot(); +/* Returns 1 if the expire value is expired, 0 otherwise. */ +static int timestampIsExpired(mstime_t when) { + if (when < 0) return 0; /* no expire */ + mstime_t now = commandTimeSnapshot(); /* The key expired if the current (virtual or real) time is greater * than the expire time of the key. */ return now > when; } +/* Use this instead of keyIsExpired if you already have the value object. */ +static int objectIsExpired(robj *val) { + /* Don't expire anything while loading. It will be done later. */ + if (server.loading) return 0; + if (!timestampIsExpired(objectGetExpire(val))) return 0; + if (server.primary_host == NULL && server.import_mode) { + if (server.current_client && server.current_client->flag.import_source) return 0; + } + return 1; +} + +static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) { + /* Don't expire anything while loading. It will be done later. */ + if (server.loading) return 0; + mstime_t when = getExpireWithDictIndex(db, key, dict_index); + return timestampIsExpired(when); +} + /* Check if the key is expired. */ -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { +static int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; /* See expireIfNeededWithDictIndex for more details. */ @@ -1822,9 +1925,14 @@ int keyIsExpired(serverDb *db, robj *key) { return keyIsExpiredWithDictIndex(db, key, dict_index); } -keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { +/* val is optional. Pass NULL if val is not yet fetched from the database. 
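+ *
+ * A minimal caller sketch (hypothetical; actual call sites may differ):
+ *
+ *     robj *val = dbFind(db, key->ptr);
+ *     if (val && expireIfNeeded(db, key, val, 0) != KEY_VALID) val = NULL;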
*/ +static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; - if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID; + if (val != NULL) { + if (!objectIsExpired(val)) return KEY_VALID; + } else { + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID; + } /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: @@ -1912,12 +2020,17 @@ keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int di * the actual key deletion and propagation of the deletion, use the * EXPIRE_AVOID_DELETE_EXPIRED flag. * + * Passing the value 'val' to this function is optional, as an optimization to + * avoid looking up the key. Pass NULL if it's not already fetched from the + * database. + * * The return value of the function is KEY_VALID if the key is still valid. * The function returns KEY_EXPIRED if the key is expired BUT not deleted, * or returns KEY_DELETED if the key is expired and deleted. */ -keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { +static keyStatus expireIfNeeded(serverDb *db, robj *key, robj *val, int flags) { + if (val != NULL && !objectIsExpired(val)) return KEY_VALID; /* shortcut */ int dict_index = getKVStoreIndexForKey(key->ptr); - return expireIfNeededWithDictIndex(db, key, flags, dict_index); + return expireIfNeededWithDictIndex(db, key, val, flags, dict_index); } /* CB passed to kvstoreExpand. @@ -1932,10 +2045,11 @@ static int dbExpandSkipSlot(int slot) { * In cluster mode resizes all individual dictionaries for slots that this node owns. * * Based on the parameter `try_expand`, appropriate dict expand API is invoked. - * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. - * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). - * `DICT_OK` response is for successful expansion. However ,`DICT_ERR` response signifies failure in allocation in - * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. + * if try_expand is non-zero, `hashtableTryExpand` is used else `hashtableExpand`. + * + * Returns C_OK or C_ERR. C_OK response is for successful expansion. C_ERR + * signifies failure in allocation if try_expand is non-zero. Otherwise it + * signifies that no expansion was performed. 
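+ *
+ * Example (sketch; handle_oom() is a hypothetical caller-specific handler):
+ *
+ *     if (dbExpandExpires(db, expected_keys, 1) != C_OK) handle_oom();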
*/ static int dbExpandGeneric(kvstore *kvs, uint64_t db_size, int try_expand) { int ret; @@ -1961,20 +2075,24 @@ int dbExpandExpires(serverDb *db, uint64_t db_size, int try_expand) { return dbExpandGeneric(db->expires, db_size, try_expand); } -dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index) { - return kvstoreDictFind(db->keys, dict_index, key); +static robj *dbFindWithDictIndex(serverDb *db, sds key, int dict_index) { + void *existing = NULL; + kvstoreHashtableFind(db->keys, dict_index, key, &existing); + return existing; } -dictEntry *dbFind(serverDb *db, void *key) { +robj *dbFind(serverDb *db, sds key) { int dict_index = getKVStoreIndexForKey(key); return dbFindWithDictIndex(db, key, dict_index); } -dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index) { - return kvstoreDictFind(db->expires, dict_index, key); +static robj *dbFindExpiresWithDictIndex(serverDb *db, sds key, int dict_index) { + void *existing = NULL; + kvstoreHashtableFind(db->expires, dict_index, key, &existing); + return existing; } -dictEntry *dbFindExpires(serverDb *db, void *key) { +robj *dbFindExpires(serverDb *db, sds key) { int dict_index = getKVStoreIndexForKey(key); return dbFindExpiresWithDictIndex(db, key, dict_index); } @@ -1983,7 +2101,7 @@ unsigned long long dbSize(serverDb *db) { return kvstoreSize(db->keys); } -unsigned long long dbScan(serverDb *db, unsigned long long cursor, dictScanFunction *scan_cb, void *privdata) { +unsigned long long dbScan(serverDb *db, unsigned long long cursor, hashtableScanFunction scan_cb, void *privdata) { return kvstoreScan(db->keys, cursor, -1, scan_cb, NULL, privdata); } diff --git a/src/debug.c b/src/debug.c index 7407af3514..d63d12f762 100644 --- a/src/debug.c +++ b/src/debug.c @@ -283,7 +283,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) * a different digest. 
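 * (Per-key digests are mixed into the final digest with xor, so the result
 * does not depend on the order in which keys are visited.)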
*/ void computeDatasetDigest(unsigned char *final) { unsigned char digest[20]; - dictEntry *de; + robj *o; int j; uint32_t aux; @@ -299,17 +299,16 @@ void computeDatasetDigest(unsigned char *final) { mixDigest(final, &aux, sizeof(aux)); /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { + while (kvstoreIteratorNext(kvs_it, (void **)&o)) { sds key; - robj *keyobj, *o; + robj *keyobj; memset(digest, 0, 20); /* This key-val digest */ - key = dictGetKey(de); + key = objectGetKey(o); keyobj = createStringObject(key, sdslen(key)); mixDigest(digest, key, sdslen(key)); - o = dictGetVal(de); xorObjectDigest(db, keyobj, digest, o); /* We can finally xor the key-val digest to the final digest */ @@ -615,18 +614,16 @@ void debugCommand(client *c) { server.debug_cluster_disable_random_ping = atoi(c->argv[2]->ptr); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "object") && (c->argc == 3 || c->argc == 4)) { - dictEntry *de; robj *val; char *strenc; int fast = 0; if (c->argc == 4 && !strcasecmp(c->argv[3]->ptr, "fast")) fast = 1; - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + if ((val = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c, shared.nokeyerr); return; } - val = dictGetVal(de); strenc = strEncoding(val->encoding); char extra[138] = {0}; @@ -674,16 +671,14 @@ void debugCommand(client *c) { addReplyStatusLength(c, s, sdslen(s)); sdsfree(s); } else if (!strcasecmp(c->argv[1]->ptr, "sdslen") && c->argc == 3) { - dictEntry *de; robj *val; sds key; - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + if ((val = dbFind(c->db, c->argv[2]->ptr)) == NULL) { addReplyErrorObject(c, shared.nokeyerr); return; } - val = dictGetVal(de); - key = dictGetKey(de); + key = objectGetKey(val); if (val->type != OBJ_STRING || !sdsEncodedObject(val)) { addReplyError(c, "Not an sds encoded string."); @@ -753,7 +748,7 @@ void debugCommand(client *c) { val = createStringObject(NULL, valsize); memcpy(val->ptr, buf, valsize <= buflen ? valsize : buflen); } - dbAdd(c->db, key, val); + dbAdd(c->db, key, &val); signalModifiedKey(c, c->db, key); decrRefCount(key); } @@ -776,8 +771,7 @@ void debugCommand(client *c) { /* We don't use lookupKey because a debug command should * work on logically expired keys */ - dictEntry *de; - robj *o = ((de = dbFind(c->db, c->argv[j]->ptr)) == NULL) ? NULL : dictGetVal(de); + robj *o = dbFind(c->db, c->argv[j]->ptr); if (o) xorObjectDigest(c->db, c->argv[j], digest, o); sds d = sdsempty(); @@ -1911,12 +1905,10 @@ void logCurrentClient(client *cc, const char *title) { * selected DB, and if so print info about the associated object. */ if (cc->argc > 1) { robj *val, *key; - dictEntry *de; key = getDecodedObject(cc->argv[1]); - de = dbFind(cc->db, key->ptr); - if (de) { - val = dictGetVal(de); + val = dbFind(cc->db, key->ptr); + if (val) { serverLog(LL_WARNING, "key '%s' found in DB containing the following object:", (char *)key->ptr); serverLogObjectDebugInfo(val); } diff --git a/src/defrag.c b/src/defrag.c index 9c195e8959..057fdd50de 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -103,7 +103,7 @@ typedef struct { unsigned long cursor; } kvstoreIterState; /* The kvstore helper uses this function to perform tasks before continuing the iteration. For the - * main dictionary, large items are set aside and processed by this function before continuing with + * main hash table, large items are set aside and processed by this function before continuing with * iteration over the kvstore. 
* endtime - This is the monotonic time that the function should end and return. * privdata - Private data for functions invoked by the helper. If provided in the call to @@ -121,7 +121,6 @@ typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdat typedef struct { kvstoreIterState kvstate; serverDb *db; - dictEntry *saved_expire_de; } defragKeysCtx; static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); @@ -188,38 +187,6 @@ void *activeDefragAlloc(void *ptr) { return newptr; } -/* This method captures the expiry db dict entry which refers to data stored in keys db dict entry. */ -static void defragEntryStartCbForKeys(void *ctx, void *oldptr) { - defragKeysCtx *defragctx = (defragKeysCtx *)ctx; - serverDb *db = defragctx->db; - sds oldsds = (sds)dictGetKey((dictEntry *)oldptr); - int slot = defragctx->kvstate.slot; - if (kvstoreDictSize(db->expires, slot)) { - dictEntry *expire_de = kvstoreDictFind(db->expires, slot, oldsds); - defragctx->saved_expire_de = expire_de; - } else { - defragctx->saved_expire_de = NULL; - } -} - -/* This method updates the key of expiry db dict entry. The key might be no longer valid - * as it could have been cleaned up during the defrag-realloc of the main dictionary. */ -static void defragEntryFinishCbForKeys(void *ctx, void *newptr) { - defragKeysCtx *defragctx = (defragKeysCtx *)ctx; - dictEntry *expire_de = defragctx->saved_expire_de; - /* Item doesn't have TTL associated to it. */ - if (!expire_de) return; - /* No reallocation happened. */ - if (!newptr) { - expire_de = NULL; - return; - } - serverDb *db = defragctx->db; - sds newsds = (sds)dictGetKey((dictEntry *)newptr); - int slot = defragctx->kvstate.slot; - kvstoreDictSetKey(db->expires, slot, expire_de, newsds); -} - /* Defrag helper for sds strings * * Returns NULL in case the allocation wasn't moved. @@ -440,13 +407,13 @@ static void activeDefragQuickListNodes(quicklist *ql) { /* when the value has lots of elements, we want to handle it later and not as * part of the main dictionary scan. this is needed in order to prevent latency * spikes when handling large items */ -static void defragLater(dictEntry *kde) { +static void defragLater(robj *obj) { if (!defrag_later) { defrag_later = listCreate(); listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); defrag_later_cursor = 0; } - sds key = sdsdup(dictGetKey(kde)); + sds key = sdsdup(objectGetKey(obj)); listAddNodeTail(defrag_later, key); } @@ -513,7 +480,15 @@ static void scanLaterZset(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, &defragfns, &data); } -/* Used as scan callback when all the work is done in the dictDefragFunctions. */ +/* Used as hashtable scan callback when all we need is to defrag the hashtable + * internals (the allocated buckets) and not the elements. */ +static void scanHashtableCallbackCountScanned(void *privdata, void *elemref) { + UNUSED(privdata); + UNUSED(elemref); + server.stat_active_defrag_scanned++; +} + +/* Used as dict scan callback when all the work is done in the dictDefragFunctions. 
*/ static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); @@ -537,19 +512,17 @@ static void scanLaterHash(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -static void defragQuicklist(dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragQuicklist(robj *ob) { quicklist *ql = ob->ptr, *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql; if (ql->len > server.active_defrag_max_scan_fields) - defragLater(kde); + defragLater(ob); else activeDefragQuickListNodes(ql); } -static void defragZsetSkiplist(dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragZsetSkiplist(robj *ob) { zset *zs = (zset *)ob->ptr; zset *newzs; zskiplist *newzsl; @@ -561,7 +534,7 @@ static void defragZsetSkiplist(dictEntry *kde) { if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) - defragLater(kde); + defragLater(ob); else { dictIterator *di = dictGetIterator(zs->dict); while ((de = dictNext(di)) != NULL) { @@ -573,26 +546,24 @@ static void defragZsetSkiplist(dictEntry *kde) { if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } -static void defragHash(dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragHash(robj *ob) { dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(kde); + defragLater(ob); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* defrag the dict struct and tables */ if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; } -static void defragSet(dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragSet(robj *ob) { dict *d, *newd; serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(kde); + defragLater(ob); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); /* defrag the dict struct and tables */ @@ -730,8 +701,7 @@ static void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { return NULL; } -static void defragStream(dictEntry *kde) { - robj *ob = dictGetVal(kde); +static void defragStream(robj *ob) { serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); stream *s = ob->ptr, *news; @@ -741,7 +711,7 @@ static void defragStream(dictEntry *kde) { if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { rax *newrax = activeDefragAlloc(s->rax); if (newrax) s->rax = newrax; - defragLater(kde); + defragLater(ob); } else defragRadixTree(&s->rax, 1, NULL, NULL); @@ -751,25 +721,36 @@ static void defragStream(dictEntry *kde) { /* Defrag a module key. This is either done immediately or scheduled * for later. Returns then number of pointers defragged. */ -static void defragModule(serverDb *db, dictEntry *kde) { - robj *obj = dictGetVal(kde); +static void defragModule(serverDb *db, robj *obj) { serverAssert(obj->type == OBJ_MODULE); - - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(kde); + /* Fun fact (and a bug since forever): The key is passed to + * moduleDefragValue as an sds string, but the parameter is declared to be + * an robj and it's passed as such to the module type defrag callbacks. 
+ * Nobody can ever have used this, i.e. accessed the key name in the defrag + * or free_effort module type callbacks. */ + void *sds_key_passed_as_robj = objectGetKey(obj); + if (!moduleDefragValue(sds_key_passed_as_robj, obj, db->id)) defragLater(obj); } /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ -static void defragKey(defragKeysCtx *ctx, dictEntry *de) { +static void defragKey(defragKeysCtx *ctx, robj **elemref) { serverDb *db = ctx->db; int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; + ob = *elemref; - /* Try to defrag robj and / or string value. */ - ob = dictGetVal(de); + /* Try to defrag robj and/or string value. */ if ((newob = activeDefragStringOb(ob))) { - kvstoreDictSetVal(ctx->kvstate.kvs, slot, de, newob); + *elemref = newob; + if (objectGetExpire(newob) >= 0) { + /* Replace the pointer in the expire table without accessing the old + * pointer. */ + hashtable *expires_ht = kvstoreGetHashtable(db->expires, slot); + int replaced = hashtableReplaceReallocatedEntry(expires_ht, ob, newob); + serverAssert(replaced); + } ob = newob; } @@ -777,7 +758,7 @@ static void defragKey(defragKeysCtx *ctx, dictEntry *de) { /* Already handled in activeDefragStringOb. */ } else if (ob->type == OBJ_LIST) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { - defragQuicklist(de); + defragQuicklist(ob); } else if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else { @@ -785,7 +766,7 @@ static void defragKey(defragKeysCtx *ctx, dictEntry *de) { } } else if (ob->type == OBJ_SET) { if (ob->encoding == OBJ_ENCODING_HT) { - defragSet(de); + defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; @@ -796,7 +777,7 @@ static void defragKey(defragKeysCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { - defragZsetSkiplist(de); + defragZsetSkiplist(ob); } else { serverPanic("Unknown sorted set encoding"); } @@ -804,23 +785,23 @@ static void defragKey(defragKeysCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { - defragHash(de); + defragHash(ob); } else { serverPanic("Unknown hash encoding"); } } else if (ob->type == OBJ_STREAM) { - defragStream(de); + defragStream(ob); } else if (ob->type == OBJ_MODULE) { - defragModule(db, de); + defragModule(db, ob); } else { serverPanic("Unknown object type"); } } /* Defrag scan callback for the main db dictionary. */ -static void dbKeysScanCallback(void *privdata, const dictEntry *de) { +static void dbKeysScanCallback(void *privdata, void *elemref) { long long hits_before = server.stat_active_defrag_hits; - defragKey((defragKeysCtx *)privdata, (dictEntry *)de); + defragKey((defragKeysCtx *)privdata, (robj **)elemref); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -851,19 +832,19 @@ static float getAllocatorFragmentation(size_t *out_frag_bytes) { return frag_pct; } -/* Defrag scan callback for the pubsub dictionary. */ -static void defragPubsubScanCallback(void *privdata, const dictEntry *de) { +/* Defrag scan callback for a pubsub channels hashtable. 
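+ * Each entry is the clients dict of one channel; the channel name robj is
+ * kept in that dict's metadata, so both the dict and the name may move.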
*/ +static void defragPubsubScanCallback(void *privdata, void *elemref) { defragPubSubCtx *ctx = privdata; - kvstore *pubsub_channels = ctx->kvstate.kvs; - robj *newchannel, *channel = dictGetKey(de); - dict *newclients, *clients = dictGetVal(de); + void **channel_dict_ref = (void **)elemref; + dict *newclients, *clients = *channel_dict_ref; + robj *newchannel, *channel = *(robj **)dictMetadata(clients); size_t allocation_size; /* Try to defrag the channel name. */ serverAssert(channel->refcount == (int)dictSize(clients) + 1); newchannel = activeDefragStringObWithoutFree(channel, &allocation_size); if (newchannel) { - kvstoreDictSetKey(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newchannel); + *(robj **)dictMetadata(clients) = newchannel; /* The channel name is shared by the client's pubsub(shard) and server's * pubsub(shard), after defraging the channel name, we need to update @@ -884,16 +865,15 @@ static void defragPubsubScanCallback(void *privdata, const dictEntry *de) { /* Try to defrag the dictionary of clients that is stored as the value part. */ if ((newclients = dictDefragTables(clients))) - kvstoreDictSetVal(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newclients); + *channel_dict_ref = newclients; server.stat_active_defrag_scanned++; } /* returns 0 more work may or may not be needed (see non-zero cursor), * and 1 if time is up and more work is needed. */ -static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtime, int dbid) { - if (de) { - robj *ob = dictGetVal(de); +static int defragLaterItem(robj *ob, unsigned long *cursor, monotime endtime, int dbid) { + if (ob) { if (ob->type == OBJ_LIST) { return scanLaterList(ob, cursor, endtime); } else if (ob->type == OBJ_SET) { @@ -905,7 +885,14 @@ static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtim } else if (ob->type == OBJ_STREAM) { return scanLaterStreamListpacks(ob, cursor, endtime); } else if (ob->type == OBJ_MODULE) { - return moduleLateDefrag(dictGetKey(de), ob, cursor, endtime, dbid); + /* Fun fact (and a bug since forever): The key is passed to + * moduleLateDefrag as an sds string, but the parameter is declared + * to be an robj and it's passed as such to the module type defrag + * callbacks. Nobody can ever have used this, i.e. accessed the key + * name in the defrag module type callback. */ + void *sds_key_passed_as_robj = objectGetKey(ob); + long long endtimeWallClock = ustime() + (endtime - getMonotonicUs()); + return moduleLateDefrag(sds_key_passed_as_robj, ob, cursor, endtimeWallClock, dbid); } else { *cursor = 0; /* object type may have changed since we schedule it for later */ } @@ -927,10 +914,12 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { while (defrag_later && listLength(defrag_later) > 0) { listNode *head = listFirst(defrag_later); sds key = head->value; - dictEntry *de = kvstoreDictFind(ctx->kvstate.kvs, ctx->kvstate.slot, key); + void *found = NULL; + kvstoreHashtableFind(ctx->kvstate.kvs, ctx->kvstate.slot, key, &found); + robj *ob = found; long long key_defragged = server.stat_active_defrag_hits; - bool timeout = (defragLaterItem(de, &defrag_later_cursor, endtime, ctx->db->id) == 1); + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->db->id) == 1); if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; } else { @@ -962,9 +951,8 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { * function during the iteration. 
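 * (The helper keeps its progress in static state, so an interrupted stage
 * resumes from the same slot and cursor on the next call.)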
*/ static doneStatus defragStageKvstoreHelper(monotime endtime, kvstore *kvs, - dictScanFunction scan_fn, + hashtableScanFunction scan_fn, kvstoreHelperPreContinueFn precontinue_fn, - const dictDefragFunctions *defragfns, void *privdata) { static kvstoreIterState state; // STATIC - this persists if (endtime == 0) { @@ -983,7 +971,7 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, if (state.slot == KVS_SLOT_DEFRAG_LUT) { // Before we start scanning the kvstore, handle the main structures do { - state.cursor = kvstoreDictLUTDefrag(kvs, state.cursor, dictDefragTables); + state.cursor = kvstoreHashtableDefragTables(kvs, state.cursor, activeDefragAlloc); if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; } while (state.cursor != 0); state.slot = KVS_SLOT_UNASSIGNED; @@ -1005,9 +993,9 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, if (!state.cursor) { // If there's no cursor, we're ready to begin a new kvstore slot. if (state.slot == KVS_SLOT_UNASSIGNED) { - state.slot = kvstoreGetFirstNonEmptyDictIndex(kvs); + state.slot = kvstoreGetFirstNonEmptyHashtableIndex(kvs); } else { - state.slot = kvstoreGetNextNonEmptyDictIndex(kvs, state.slot); + state.slot = kvstoreGetNextNonEmptyHashtableIndex(kvs, state.slot); } if (state.slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE; @@ -1015,8 +1003,9 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, // Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. if (privdata) *(kvstoreIterState *)privdata = state; - state.cursor = kvstoreDictScanDefrag(kvs, state.slot, state.cursor, - scan_fn, defragfns, privdata); + state.cursor = kvstoreHashtableScanDefrag(kvs, state.slot, state.cursor, + scan_fn, privdata, activeDefragAlloc, + HASHTABLE_SCAN_EMIT_REF); } return DEFRAG_NOT_DONE; @@ -1035,46 +1024,25 @@ static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privda } serverAssert(ctx.db == db); - /* Note: for DB keys, we use the start/finish callback to fix an expires table entry if - * the main DB entry has been moved. 
*/ - static const dictDefragFunctions defragfns = { - .defragAlloc = activeDefragAlloc, - .defragKey = NULL, // Handled by dbKeysScanCallback - .defragVal = NULL, // Handled by dbKeysScanCallback - .defragEntryStartCb = defragEntryStartCbForKeys, - .defragEntryFinishCb = defragEntryFinishCbForKeys}; - return defragStageKvstoreHelper(endtime, db->keys, - dbKeysScanCallback, defragLaterStep, &defragfns, &ctx); + dbKeysScanCallback, defragLaterStep, &ctx); } static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - static const dictDefragFunctions defragfns = { - .defragAlloc = activeDefragAlloc, - .defragKey = NULL, // Not needed for expires (just a ref) - .defragVal = NULL, // Not needed for expires (no value) - }; return defragStageKvstoreHelper(endtime, (kvstore *)target, - scanCallbackCountScanned, NULL, &defragfns, NULL); + scanHashtableCallbackCountScanned, NULL, NULL); } static doneStatus defragStagePubsubKvstore(monotime endtime, void *target, void *privdata) { // target is server.pubsub_channels or server.pubsubshard_channels getClientChannelsFnWrapper *fnWrapper = privdata; - - static const dictDefragFunctions defragfns = { - .defragAlloc = activeDefragAlloc, - .defragKey = NULL, // Handled by defragPubsubScanCallback - .defragVal = NULL, // Not needed for expires (no value) - }; defragPubSubCtx ctx; - ctx.getPubSubChannels = fnWrapper->fn; return defragStageKvstoreHelper(endtime, (kvstore *)target, - defragPubsubScanCallback, NULL, &defragfns, &ctx); + defragPubsubScanCallback, NULL, &ctx); } diff --git a/src/evict.c b/src/evict.c index 5208328b32..eecd000a4b 100644 --- a/src/evict.c +++ b/src/evict.c @@ -143,26 +143,14 @@ void evictionPoolAlloc(void) { * right. */ int evictionPoolPopulate(serverDb *db, kvstore *samplekvs, struct evictionPoolEntry *pool) { int j, k, count; - dictEntry *samples[server.maxmemory_samples]; + void *samples[server.maxmemory_samples]; - int slot = kvstoreGetFairRandomDictIndex(samplekvs); - count = kvstoreDictGetSomeKeys(samplekvs, slot, samples, server.maxmemory_samples); + int slot = kvstoreGetFairRandomHashtableIndex(samplekvs); + count = kvstoreHashtableSampleEntries(samplekvs, slot, &samples[0], server.maxmemory_samples); for (j = 0; j < count; j++) { unsigned long long idle; - sds key; - robj *o; - dictEntry *de; - - de = samples[j]; - key = dictGetKey(de); - - /* If the dictionary we are sampling from is not the main - * dictionary (but the expires one) we need to lookup the key - * again in the key dictionary to obtain the value object. */ - if (server.maxmemory_policy != MAXMEMORY_VOLATILE_TTL) { - if (samplekvs != db->keys) de = kvstoreDictFind(db->keys, slot, key); - o = dictGetVal(de); - } + robj *o = samples[j]; + sds key = objectGetKey(o); /* Calculate the idle time according to the policy. This is called * idle just because the code initially handled LRU, but is in fact @@ -180,7 +168,7 @@ int evictionPoolPopulate(serverDb *db, kvstore *samplekvs, struct evictionPoolEn idle = 255 - LFUDecrAndReturn(o); } else if (server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { /* In this case the sooner the expire the better. 
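                 * Subtracting the expire time from ULLONG_MAX makes keys with
                 * nearer deadlines score as idler, so they are evicted first.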
*/ - idle = ULLONG_MAX - (long)dictGetVal(de); + idle = ULLONG_MAX - objectGetExpire(o); } else { serverPanic("Unknown eviction policy in evictionPoolPopulate()"); } @@ -568,7 +556,7 @@ int performEvictions(void) { sds bestkey = NULL; int bestdbid; serverDb *db; - dictEntry *de; + robj *valkey; if (server.maxmemory_policy & (MAXMEMORY_FLAG_LRU | MAXMEMORY_FLAG_LFU) || server.maxmemory_policy == MAXMEMORY_VOLATILE_TTL) { @@ -592,7 +580,7 @@ int performEvictions(void) { if (current_db_keys == 0) continue; total_keys += current_db_keys; - int l = kvstoreNumNonEmptyDicts(kvs); + int l = kvstoreNumNonEmptyHashtables(kvs); /* Do not exceed the number of non-empty slots when looping. */ while (l--) { sampled_keys += evictionPoolPopulate(db, kvs, pool); @@ -617,7 +605,8 @@ int performEvictions(void) { } else { kvs = server.db[bestdbid].expires; } - de = kvstoreDictFind(kvs, pool[k].slot, pool[k].key); + void *entry = NULL; + int found = kvstoreHashtableFind(kvs, pool[k].slot, pool[k].key, &entry); /* Remove the entry from the pool. */ if (pool[k].key != pool[k].cached) sdsfree(pool[k].key); @@ -626,8 +615,9 @@ int performEvictions(void) { /* If the key exists, is our pick. Otherwise it is * a ghost and we need to try the next element. */ - if (de) { - bestkey = dictGetKey(de); + if (found) { + valkey = entry; + bestkey = objectGetKey(valkey); break; } else { /* Ghost... Iterate again. */ @@ -651,10 +641,10 @@ int performEvictions(void) { } else { kvs = db->expires; } - int slot = kvstoreGetFairRandomDictIndex(kvs); - de = kvstoreDictGetRandomKey(kvs, slot); - if (de) { - bestkey = dictGetKey(de); + int slot = kvstoreGetFairRandomHashtableIndex(kvs); + int found = kvstoreHashtableRandomEntry(kvs, slot, (void **)&valkey); + if (found) { + bestkey = objectGetKey(valkey); bestdbid = j; break; } diff --git a/src/expire.c b/src/expire.c index c22df1ef86..e4c3b0ec96 100644 --- a/src/expire.c +++ b/src/expire.c @@ -46,8 +46,7 @@ static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, 0.833748, 0.817073, 0.800731, 0.784717, 0.769022, 0.753642, 0.738569, 0.723798}; /* Helper function for the activeExpireCycle() function. - * This function will try to expire the key that is stored in the hash table - * entry 'de' of the 'expires' hash table of a database. + * This function will try to expire the key-value entry 'val'. * * If the key is found to be expired, it is removed from the database and * 1 is returned. Otherwise no operation is performed and 0 is returned. @@ -56,11 +55,12 @@ static double avg_ttl_factor[16] = {0.98, 0.9604, 0.941192, 0.922368, 0.903921, * * The parameter 'now' is the current time in milliseconds as is passed * to the function to avoid too many gettimeofday() syscalls. 
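 * (See expireScanCallback() below for a typical caller, which passes entries
 * sampled from the expires table together with a cached 'now'.)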
*/ -int activeExpireCycleTryExpire(serverDb *db, dictEntry *de, long long now) { - long long t = dictGetSignedIntegerVal(de); +int activeExpireCycleTryExpire(serverDb *db, robj *val, long long now) { + long long t = objectGetExpire(val); + serverAssert(t >= 0); if (now > t) { enterExecutionUnit(1, 0); - sds key = dictGetKey(de); + sds key = objectGetKey(val); robj *keyobj = createStringObject(key, sdslen(key)); deleteExpiredKeyAndPropagate(db, keyobj); decrRefCount(keyobj); @@ -127,11 +127,11 @@ typedef struct { int ttl_samples; /* num keys with ttl not yet expired */ } expireScanData; -void expireScanCallback(void *privdata, const dictEntry *const_de) { - dictEntry *de = (dictEntry *)const_de; +void expireScanCallback(void *privdata, void *entry) { + robj *val = entry; expireScanData *data = privdata; - long long ttl = dictGetSignedIntegerVal(de) - data->now; - if (activeExpireCycleTryExpire(data->db, de, data->now)) { + long long ttl = objectGetExpire(val) - data->now; + if (activeExpireCycleTryExpire(data->db, val, data->now)) { data->expired++; /* Propagate the DEL command */ postExecutionUnitOperations(); @@ -144,13 +144,13 @@ void expireScanCallback(void *privdata, const dictEntry *const_de) { data->sampled++; } -static inline int isExpiryDictValidForSamplingCb(dict *d) { - long long numkeys = dictSize(d); - unsigned long buckets = dictBuckets(d); +static inline int isExpiryTableValidForSamplingCb(hashtable *ht) { + long long numkeys = hashtableSize(ht); + unsigned long buckets = hashtableBuckets(ht); /* When there are less than 1% filled buckets, sampling the key * space is expensive, so stop here waiting for better times... * The dictionary will be resized asap. */ - if (buckets > DICT_HT_INITIAL_SIZE && (numkeys * 100 / buckets < 1)) { + if (buckets > 0 && (numkeys * 100 / buckets < 1)) { return C_ERR; } return C_OK; @@ -279,14 +279,14 @@ void activeExpireCycle(int type) { * is very fast: we are in the cache line scanning a sequential * array of NULL pointers, so we can scan a lot more buckets * than keys in the same time. */ - long max_buckets = num * 20; + long max_buckets = num * 10; long checked_buckets = 0; int origin_ttl_samples = data.ttl_samples; while (data.sampled < num && checked_buckets < max_buckets) { db->expires_cursor = kvstoreScan(db->expires, db->expires_cursor, -1, expireScanCallback, - isExpiryDictValidForSamplingCb, &data); + isExpiryTableValidForSamplingCb, &data); if (db->expires_cursor == 0) { db_done = 1; break; @@ -422,7 +422,7 @@ void expireReplicaKeys(void) { while (dbids && dbid < server.dbnum) { if ((dbids & 1) != 0) { serverDb *db = server.db + dbid; - dictEntry *expire = dbFindExpires(db, keyname); + robj *expire = dbFindExpires(db, keyname); int expired = 0; if (expire && activeExpireCycleTryExpire(server.db + dbid, expire, start)) { @@ -619,14 +619,16 @@ void expireGenericCommand(client *c, long long basetime, int unit) { } when += basetime; + robj *obj = lookupKeyWrite(c->db, key); + /* No key, return zero. 
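     * The object is fetched once up front so that the current expire can be
     * read below with objectGetExpire() instead of a second expires lookup.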
*/ - if (lookupKeyWrite(c->db, key) == NULL) { + if (obj == NULL) { addReply(c, shared.czero); return; } if (flag) { - current_expire = getExpire(c->db, key); + current_expire = objectGetExpire(obj); /* NX option is set, check current expiry */ if (flag & EXPIRE_NX) { @@ -674,7 +676,7 @@ void expireGenericCommand(client *c, long long basetime, int unit) { addReply(c, shared.cone); return; } else { - setExpire(c, c->db, key, when); + obj = setExpire(c, c->db, key, when); addReply(c, shared.cone); /* Propagate as PEXPIREAT millisecond-timestamp * Only rewrite the command arg if not already PEXPIREAT */ diff --git a/src/geo.c b/src/geo.c index 9e43a6e93b..75654f85a5 100644 --- a/src/geo.c +++ b/src/geo.c @@ -780,8 +780,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) { if (returned_items) { zsetConvertToListpackIfNeeded(zobj, maxelelen, totelelen); - setKey(c, c->db, storekey, zobj, 0); - decrRefCount(zobj); + setKey(c, c->db, storekey, &zobj, 0); notifyKeyspaceEvent(NOTIFY_ZSET, flags & GEOSEARCH ? "geosearchstore" : "georadiusstore", storekey, c->db->id); server.dirty += returned_items; diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 9a48c821ab..f0390b3e1e 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -1440,7 +1440,7 @@ void pfaddCommand(client *c) { * hold our HLL data structure. sdsnewlen() when NULL is passed * is guaranteed to return bytes initialized to zero. */ o = createHLLObject(); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); updated++; } else { if (isHLLObjectOrReply(c, o) != C_OK) return; @@ -1597,7 +1597,7 @@ void pfmergeCommand(client *c) { * hold our HLL data structure. sdsnewlen() when NULL is passed * is guaranteed to return bytes initialized to zero. */ o = createHLLObject(); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); } else { /* If key exists we are sure it's of the right type/size * since we checked when merging the different HLLs, so we diff --git a/src/io_threads.c b/src/io_threads.c index 1ebd748bc2..3865eb77c3 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -493,6 +493,8 @@ int tryOffloadFreeObjToIOThreads(robj *obj) { if (obj->refcount > 1) return C_ERR; + if (obj->encoding != OBJ_ENCODING_RAW || obj->type != OBJ_STRING) return C_ERR; + /* We select the thread ID in a round-robin fashion. */ size_t tid = (server.stat_io_freed_objects % (server.active_io_threads_num - 1)) + 1; @@ -501,7 +503,12 @@ int tryOffloadFreeObjToIOThreads(robj *obj) { return C_ERR; } - IOJobQueue_push(jq, decrRefCountVoid, obj); + /* We offload only the free of the ptr that may be allocated by the I/O thread. + * The object itself was allocated by the main thread and will be freed by the main thread. */ + IOJobQueue_push(jq, sdsfreeVoid, obj->ptr); + obj->ptr = NULL; + decrRefCount(obj); + server.stat_io_freed_objects++; return C_OK; } diff --git a/src/kvstore.c b/src/kvstore.c index 344a8af5cf..d6db4d3fe1 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -1,11 +1,11 @@ /* * Index-based KV store implementation - * This file implements a KV store comprised of an array of dicts (see dict.c) + * This file implements a KV store comprised of an array of hash tables (see hashtable.c) * The purpose of this KV store is to have easy access to all keys that belong - * in the same dict (i.e. are in the same dict-index) + * in the same hash table (i.e. 
are in the same hashtable-index) * * For example, when the server is running in cluster mode, we use kvstore to save - * all keys that map to the same hash-slot in a separate dict within the kvstore + * all keys that map to the same hash-slot in a separate hash table within the kvstore * struct. * This enables us to easily access all keys that map to a specific hash-slot. * @@ -40,6 +40,7 @@ #include #include +#include #include "zmalloc.h" #include "kvstore.h" @@ -48,236 +49,248 @@ #define UNUSED(V) ((void)V) -static dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it); +static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it); struct _kvstore { int flags; - dictType *dtype; - dict **dicts; - int num_dicts; - int num_dicts_bits; - list *rehashing; /* List of dictionaries in this kvstore that are currently rehashing. */ - int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used if num_dicts > 1). */ - int allocated_dicts; /* The number of allocated dicts. */ - int non_empty_dicts; /* The number of non-empty dicts. */ - unsigned long long key_count; /* Total number of keys in this kvstore. */ - unsigned long long bucket_count; /* Total number of buckets in this kvstore across dictionaries. */ - unsigned long long *dict_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until - given dict-index. */ - size_t overhead_hashtable_lut; /* The overhead of all dictionaries. */ - size_t overhead_hashtable_rehashing; /* The overhead of dictionaries rehashing. */ + hashtableType *dtype; + hashtable **hashtables; + int num_hashtables; + int num_hashtables_bits; + list *rehashing; /* List of hash tables in this kvstore that are currently rehashing. */ + int resize_cursor; /* Cron job uses this cursor to gradually resize hash tables (only used if num_hashtables > 1). */ + int allocated_hashtables; /* The number of allocated hashtables. */ + int non_empty_hashtables; /* The number of non-empty hashtables. */ + unsigned long long key_count; /* Total number of keys in this kvstore. */ + unsigned long long bucket_count; /* Total number of buckets in this kvstore across hash tables. */ + unsigned long long *hashtable_size_index; /* Binary indexed tree (BIT) that describes cumulative key frequencies up until + * given hashtable-index. */ + size_t overhead_hashtable_lut; /* Overhead of all hashtables in bytes. */ + size_t overhead_hashtable_rehashing; /* Overhead of hash tables rehashing in bytes. */ }; -/* Structure for kvstore iterator that allows iterating across multiple dicts. */ +/* Structure for kvstore iterator that allows iterating across multiple hashtables. */ struct _kvstoreIterator { kvstore *kvs; long long didx; long long next_didx; - dictIterator di; + hashtableIterator di; }; -/* Structure for kvstore dict iterator that allows iterating the corresponding dict. */ -struct _kvstoreDictIterator { +/* Structure for kvstore hashtable iterator that allows iterating the corresponding hashtable. */ +struct _kvstoreHashtableIterator { kvstore *kvs; long long didx; - dictIterator di; + hashtableIterator di; }; -/* Dict metadata for database, used for record the position in rehashing list. */ +/* Hashtable metadata for database, used for record the position in rehashing list. 
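+ * The kvs back-pointer also lets hashtable callbacks such as
+ * kvstoreHashtableTrackMemUsage() locate the owning kvstore.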
*/ typedef struct { listNode *rehashing_node; /* list node in rehashing list */ kvstore *kvs; -} kvstoreDictMetadata; +} kvstoreHashtableMetadata; /**********************************/ /*** Helpers **********************/ /**********************************/ -/* Get the dictionary pointer based on dict-index. */ -dict *kvstoreGetDict(kvstore *kvs, int didx) { - return kvs->dicts[didx]; +/* Get the hash table pointer based on hashtable-index. */ +hashtable *kvstoreGetHashtable(kvstore *kvs, int didx) { + return kvs->hashtables[didx]; } -static dict **kvstoreGetDictRef(kvstore *kvs, int didx) { - return &kvs->dicts[didx]; +static hashtable **kvstoreGetHashtableRef(kvstore *kvs, int didx) { + return &kvs->hashtables[didx]; } -static int kvstoreDictIsRehashingPaused(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - return d ? dictIsRehashingPaused(d) : 0; +static int kvstoreHashtableIsRehashingPaused(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + return ht ? hashtableIsRehashingPaused(ht) : 0; } -/* Returns total (cumulative) number of keys up until given dict-index (inclusive). - * Time complexity is O(log(kvs->num_dicts)). */ +/* Returns total (cumulative) number of keys up until given hashtable-index (inclusive). + * Time complexity is O(log(kvs->num_hashtables)). */ static unsigned long long cumulativeKeyCountRead(kvstore *kvs, int didx) { - if (kvs->num_dicts == 1) { + if (kvs->num_hashtables == 1) { assert(didx == 0); return kvstoreSize(kvs); } int idx = didx + 1; unsigned long long sum = 0; while (idx > 0) { - sum += kvs->dict_size_index[idx]; + sum += kvs->hashtable_size_index[idx]; idx -= (idx & -idx); } return sum; } -static void addDictIndexToCursor(kvstore *kvs, int didx, unsigned long long *cursor) { - if (kvs->num_dicts == 1) return; - /* didx can be -1 when iteration is over and there are no more dicts to visit. */ +static void addHashtableIndexToCursor(kvstore *kvs, int didx, unsigned long long *cursor) { + if (kvs->num_hashtables == 1) return; + /* didx can be -1 when iteration is over and there are no more hashtables to visit. */ if (didx < 0) return; - *cursor = (*cursor << kvs->num_dicts_bits) | didx; + *cursor = (*cursor << kvs->num_hashtables_bits) | didx; } -static int getAndClearDictIndexFromCursor(kvstore *kvs, unsigned long long *cursor) { - if (kvs->num_dicts == 1) return 0; - int didx = (int)(*cursor & (kvs->num_dicts - 1)); - *cursor = *cursor >> kvs->num_dicts_bits; +static int getAndClearHashtableIndexFromCursor(kvstore *kvs, unsigned long long *cursor) { + if (kvs->num_hashtables == 1) return 0; + int didx = (int)(*cursor & (kvs->num_hashtables - 1)); + *cursor = *cursor >> kvs->num_hashtables_bits; return didx; } -/* Updates binary index tree (also known as Fenwick tree), increasing key count for a given dict. +/* Updates binary index tree (also known as Fenwick tree), increasing key count for a given hashtable. * You can read more about this data structure here https://en.wikipedia.org/wiki/Fenwick_tree - * Time complexity is O(log(kvs->num_dicts)). */ + * Time complexity is O(log(kvs->num_hashtables)). */ static void cumulativeKeyCountAdd(kvstore *kvs, int didx, long delta) { kvs->key_count += delta; - dict *d = kvstoreGetDict(kvs, didx); - size_t dsize = dictSize(d); - int non_empty_dicts_delta = dsize == 1 ? 1 : dsize == 0 ? 
-1 - : 0; - kvs->non_empty_dicts += non_empty_dicts_delta; + hashtable *ht = kvstoreGetHashtable(kvs, didx); + size_t size = hashtableSize(ht); + if (delta < 0 && size == 0) { + kvs->non_empty_hashtables--; /* It became empty. */ + } else if (delta > 0 && size == (size_t)delta) { + kvs->non_empty_hashtables++; /* It was empty before. */ + } - /* BIT does not need to be calculated when there's only one dict. */ - if (kvs->num_dicts == 1) return; + /* BIT does not need to be calculated when there's only one hashtable. */ + if (kvs->num_hashtables == 1) return; /* Update the BIT */ - int idx = didx + 1; /* Unlike dict indices, BIT is 1-based, so we need to add 1. */ - while (idx <= kvs->num_dicts) { + int idx = didx + 1; /* Unlike hashtable indices, BIT is 1-based, so we need to add 1. */ + while (idx <= kvs->num_hashtables) { if (delta < 0) { - assert(kvs->dict_size_index[idx] >= (unsigned long long)labs(delta)); + assert(kvs->hashtable_size_index[idx] >= (unsigned long long)labs(delta)); } - kvs->dict_size_index[idx] += delta; + kvs->hashtable_size_index[idx] += delta; idx += (idx & -idx); } } -/* Create the dict if it does not exist and return it. */ -static dict *createDictIfNeeded(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (d) return d; +/* Create the hashtable if it does not exist and return it. */ +static hashtable *createHashtableIfNeeded(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht) return ht; - kvs->dicts[didx] = dictCreate(kvs->dtype); - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(kvs->dicts[didx]); + kvs->hashtables[didx] = hashtableCreate(kvs->dtype); + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(kvs->hashtables[didx]); metadata->kvs = kvs; - kvs->allocated_dicts++; - return kvs->dicts[didx]; + /* Memory is counted by kvstoreHashtableTrackMemUsage, but when it's invoked + * by hashtableCreate above, we don't know which hashtable it is for, because + * the metadata has yet been initialized. Account for the newly created + * hashtable here instead. */ + kvs->overhead_hashtable_lut += hashtableMemUsage(kvs->hashtables[didx]); + kvs->allocated_hashtables++; + return kvs->hashtables[didx]; } -/* Called when the dict will delete entries, the function will check - * KVSTORE_FREE_EMPTY_DICTS to determine whether the empty dict needs +/* Called when the hashtable will delete entries, the function will check + * KVSTORE_FREE_EMPTY_HASHTABLES to determine whether the empty hashtable needs * to be freed. * - * Note that for rehashing dicts, that is, in the case of safe iterators - * and Scan, we won't delete the dict. We will check whether it needs + * Note that for rehashing hashtables, that is, in the case of safe iterators + * and Scan, we won't delete the hashtable. We will check whether it needs * to be deleted when we're releasing the iterator. 
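 * (The iterator release path re-runs this check once rehashing is no longer
 * paused.)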
*/ -static void freeDictIfNeeded(kvstore *kvs, int didx) { - if (!(kvs->flags & KVSTORE_FREE_EMPTY_DICTS) || !kvstoreGetDict(kvs, didx) || kvstoreDictSize(kvs, didx) != 0 || - kvstoreDictIsRehashingPaused(kvs, didx)) +static void freeHashtableIfNeeded(kvstore *kvs, int didx) { + if (!(kvs->flags & KVSTORE_FREE_EMPTY_HASHTABLES) || !kvstoreGetHashtable(kvs, didx) || kvstoreHashtableSize(kvs, didx) != 0 || + kvstoreHashtableIsRehashingPaused(kvs, didx)) return; - dictRelease(kvs->dicts[didx]); - kvs->dicts[didx] = NULL; - kvs->allocated_dicts--; + hashtableRelease(kvs->hashtables[didx]); + kvs->hashtables[didx] = NULL; + kvs->allocated_hashtables--; } -/**********************************/ -/*** dict callbacks ***************/ -/**********************************/ +/*************************************/ +/*** hashtable callbacks ***************/ +/*************************************/ -/* Adds dictionary to the rehashing list, which allows us +/* Adds hash table to the rehashing list, which allows us * to quickly find rehash targets during incremental rehashing. * - * If there are multiple dicts, updates the bucket count for the given dictionary + * If there are multiple hashtables, updates the bucket count for the given hash table * in a DB, bucket count incremented with the new ht size during the rehashing phase. - * If there's one dict, bucket count can be retrieved directly from single dict bucket. */ -void kvstoreDictRehashingStarted(dict *d) { - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + * If there's one hashtable, bucket count can be retrieved directly from single hashtable bucket. */ +void kvstoreHashtableRehashingStarted(hashtable *ht) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); kvstore *kvs = metadata->kvs; - listAddNodeTail(kvs->rehashing, d); + listAddNodeTail(kvs->rehashing, ht); metadata->rehashing_node = listLast(kvs->rehashing); - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); + size_t from, to; + hashtableRehashingInfo(ht, &from, &to); kvs->bucket_count += to; /* Started rehashing (Add the new ht size) */ - kvs->overhead_hashtable_lut += to; - kvs->overhead_hashtable_rehashing += from; + kvs->overhead_hashtable_rehashing += from * HASHTABLE_BUCKET_SIZE; } -/* Remove dictionary from the rehashing list. +/* Remove hash table from the rehashing list. * - * Updates the bucket count for the given dictionary in a DB. It removes - * the old ht size of the dictionary from the total sum of buckets for a DB. */ -void kvstoreDictRehashingCompleted(dict *d) { - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + * Updates the bucket count for the given hash table in a DB. It removes + * the old ht size of the hash table from the total sum of buckets for a DB. */ +void kvstoreHashtableRehashingCompleted(hashtable *ht) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); kvstore *kvs = metadata->kvs; if (metadata->rehashing_node) { listDelNode(kvs->rehashing, metadata->rehashing_node); metadata->rehashing_node = NULL; } - unsigned long long from, to; - dictRehashingInfo(d, &from, &to); + size_t from, to; + hashtableRehashingInfo(ht, &from, &to); kvs->bucket_count -= from; /* Finished rehashing (Remove the old ht size) */ - kvs->overhead_hashtable_lut -= from; - kvs->overhead_hashtable_rehashing -= from; + kvs->overhead_hashtable_rehashing -= from * HASHTABLE_BUCKET_SIZE; +} + +/* Hashtable callback to keep track of memory usage. 
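+ * The hashtable calls it with a positive delta when it allocates and a
+ * negative delta when it frees; the running sum is accumulated in
+ * overhead_hashtable_lut.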
*/ +void kvstoreHashtableTrackMemUsage(hashtable *ht, ssize_t delta) { + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); + if (metadata->kvs == NULL) { + /* This is the initial allocation by hashtableCreate, when the metadata + * hasn't been initialized yet. */ + return; + } + metadata->kvs->overhead_hashtable_lut += delta; } -/* Returns the size of the DB dict metadata in bytes. */ -size_t kvstoreDictMetadataSize(dict *d) { - UNUSED(d); - return sizeof(kvstoreDictMetadata); +/* Returns the size of the DB hashtable metadata in bytes. */ +size_t kvstoreHashtableMetadataSize(void) { + return sizeof(kvstoreHashtableMetadata); } /**********************************/ /*** API **************************/ /**********************************/ -/* Create an array of dictionaries - * num_dicts_bits is the log2 of the amount of dictionaries needed (e.g. 0 for 1 dict, - * 3 for 8 dicts, etc.) - * - * The kvstore handles `key` based on `dictType` during initialization: - * - If `dictType.embedded-entry` is 1, it clones the `key`. - * - Otherwise, it assumes ownership of the `key`. +/* Create an array of hash tables + * num_hashtables_bits is the log2 of the amount of hash tables needed (e.g. 0 for 1 hashtable, + * 3 for 8 hashtables, etc.) */ -kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { - /* We can't support more than 2^16 dicts because we want to save 48 bits - * for the dict cursor, see kvstoreScan */ - assert(num_dicts_bits <= 16); +kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags) { + /* We can't support more than 2^16 hashtables because we want to save 48 bits + * for the hashtable cursor, see kvstoreScan */ + assert(num_hashtables_bits <= 16); - /* The dictType of kvstore needs to use the specific callbacks. + /* The hashtableType of kvstore needs to use the specific callbacks. * If there are any changes in the future, it will need to be modified. */ - assert(type->rehashingStarted == kvstoreDictRehashingStarted); - assert(type->rehashingCompleted == kvstoreDictRehashingCompleted); - assert(type->dictMetadataBytes == kvstoreDictMetadataSize); + assert(type->rehashingStarted == kvstoreHashtableRehashingStarted); + assert(type->rehashingCompleted == kvstoreHashtableRehashingCompleted); + assert(type->trackMemUsage == kvstoreHashtableTrackMemUsage); + assert(type->getMetadataSize == kvstoreHashtableMetadataSize); kvstore *kvs = zcalloc(sizeof(*kvs)); kvs->dtype = type; kvs->flags = flags; - kvs->num_dicts_bits = num_dicts_bits; - kvs->num_dicts = 1 << kvs->num_dicts_bits; - kvs->dicts = zcalloc(sizeof(dict *) * kvs->num_dicts); - if (!(kvs->flags & KVSTORE_ALLOCATE_DICTS_ON_DEMAND)) { - for (int i = 0; i < kvs->num_dicts; i++) createDictIfNeeded(kvs, i); + kvs->num_hashtables_bits = num_hashtables_bits; + kvs->num_hashtables = 1 << kvs->num_hashtables_bits; + kvs->hashtables = zcalloc(sizeof(hashtable *) * kvs->num_hashtables); + if (!(kvs->flags & KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND)) { + for (int i = 0; i < kvs->num_hashtables; i++) createHashtableIfNeeded(kvs, i); } kvs->rehashing = listCreate(); kvs->key_count = 0; - kvs->non_empty_dicts = 0; + kvs->non_empty_hashtables = 0; kvs->resize_cursor = 0; - kvs->dict_size_index = kvs->num_dicts > 1 ? zcalloc(sizeof(unsigned long long) * (kvs->num_dicts + 1)) : NULL; + kvs->hashtable_size_index = kvs->num_hashtables > 1 ? 
zcalloc(sizeof(unsigned long long) * (kvs->num_hashtables + 1)) : NULL; kvs->bucket_count = 0; kvs->overhead_hashtable_lut = 0; kvs->overhead_hashtable_rehashing = 0; @@ -285,105 +298,102 @@ kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags) { return kvs; } -void kvstoreEmpty(kvstore *kvs, void(callback)(dict *)) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); +void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)) { + for (int didx = 0; didx < kvs->num_hashtables; didx++) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); if (metadata->rehashing_node) metadata->rehashing_node = NULL; - dictEmpty(d, callback); - freeDictIfNeeded(kvs, didx); + hashtableEmpty(ht, callback); + freeHashtableIfNeeded(kvs, didx); } listEmpty(kvs->rehashing); kvs->key_count = 0; - kvs->non_empty_dicts = 0; + kvs->non_empty_hashtables = 0; kvs->resize_cursor = 0; kvs->bucket_count = 0; - if (kvs->dict_size_index) memset(kvs->dict_size_index, 0, sizeof(unsigned long long) * (kvs->num_dicts + 1)); - kvs->overhead_hashtable_lut = 0; + if (kvs->hashtable_size_index) memset(kvs->hashtable_size_index, 0, sizeof(unsigned long long) * (kvs->num_hashtables + 1)); kvs->overhead_hashtable_rehashing = 0; } void kvstoreRelease(kvstore *kvs) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) continue; - kvstoreDictMetadata *metadata = (kvstoreDictMetadata *)dictMetadata(d); + for (int didx = 0; didx < kvs->num_hashtables; didx++) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) continue; + kvstoreHashtableMetadata *metadata = (kvstoreHashtableMetadata *)hashtableMetadata(ht); if (metadata->rehashing_node) metadata->rehashing_node = NULL; - dictRelease(d); + hashtableRelease(ht); } - zfree(kvs->dicts); + assert(kvs->overhead_hashtable_lut == 0); + zfree(kvs->hashtables); listRelease(kvs->rehashing); - if (kvs->dict_size_index) zfree(kvs->dict_size_index); + if (kvs->hashtable_size_index) zfree(kvs->hashtable_size_index); zfree(kvs); } unsigned long long int kvstoreSize(kvstore *kvs) { - if (kvs->num_dicts != 1) { + if (kvs->num_hashtables != 1) { return kvs->key_count; } else { - return kvs->dicts[0] ? dictSize(kvs->dicts[0]) : 0; + return kvs->hashtables[0] ? hashtableSize(kvs->hashtables[0]) : 0; } } -/* This method provides the cumulative sum of all the dictionary buckets - * across dictionaries in a database. */ +/* This method provides the cumulative sum of all the hash table buckets + * across hash tables in a database. */ unsigned long kvstoreBuckets(kvstore *kvs) { - if (kvs->num_dicts != 1) { + if (kvs->num_hashtables != 1) { return kvs->bucket_count; } else { - return kvs->dicts[0] ? dictBuckets(kvs->dicts[0]) : 0; + return kvs->hashtables[0] ? 
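
kvstoreCreate above asserts that the caller's hashtableType is wired to the kvstore callbacks. A sketch of what a conforming caller might look like for a cluster-style store with 2^14 slot tables; the flag and callback names come from this diff, while the key/hash callbacks and the helper name are placeholders:

    hashtableType exampleKeysType = {
        /* ... hash/compare/destructor callbacks elided ... */
        .rehashingStarted = kvstoreHashtableRehashingStarted,
        .rehashingCompleted = kvstoreHashtableRehashingCompleted,
        .trackMemUsage = kvstoreHashtableTrackMemUsage,
        .getMetadataSize = kvstoreHashtableMetadataSize,
    };

    kvstore *createClusterKeysKvstore(void) {
        /* Tables are created lazily and freed again when they become empty. */
        return kvstoreCreate(&exampleKeysType, 14, /* 2^14 = 16384 slot tables */
                             KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES);
    }
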
hashtableBuckets(kvs->hashtables[0]) : 0; } } size_t kvstoreMemUsage(kvstore *kvs) { size_t mem = sizeof(*kvs); + mem += kvs->overhead_hashtable_lut; - unsigned long long keys_count = kvstoreSize(kvs); - mem += keys_count * dictEntryMemUsage(NULL) + kvstoreBuckets(kvs) * sizeof(dictEntry *) + - kvs->allocated_dicts * (sizeof(dict) + kvstoreDictMetadataSize(NULL)); - - /* Values are dict* shared with kvs->dicts */ + /* Values are hashtable* shared with kvs->hashtables */ mem += listLength(kvs->rehashing) * sizeof(listNode); - if (kvs->dict_size_index) mem += sizeof(unsigned long long) * (kvs->num_dicts + 1); + if (kvs->hashtable_size_index) mem += sizeof(unsigned long long) * (kvs->num_hashtables + 1); return mem; } /* - * This method is used to iterate over the elements of the entire kvstore specifically across dicts. + * This method is used to iterate over the elements of the entire kvstore specifically across hashtables. * It's a three pronged approach. * - * 1. It uses the provided cursor `cursor` to retrieve the dict index from it. - * 2. If the dictionary is in a valid state checked through the provided callback `dictScanValidFunction`, - * it performs a dictScan over the appropriate `keyType` dictionary of `db`. - * 3. If the dict is entirely scanned i.e. the cursor has reached 0, the next non empty dict is discovered. - * The dict information is embedded into the cursor and returned. + * 1. It uses the provided cursor `cursor` to retrieve the hashtable index from it. + * 2. If the hash table is in a valid state checked through the provided callback `hashtableScanValidFunction`, + * it performs a hashtableScan over the appropriate `keyType` hash table of `db`. + * 3. If the hashtable is entirely scanned i.e. the cursor has reached 0, the next non empty hashtable is discovered. + * The hashtable information is embedded into the cursor and returned. * - * To restrict the scan to a single dict, pass a valid dict index as + * To restrict the scan to a single hashtable, pass a valid hashtable index as * 'onlydidx', otherwise pass -1. */ unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, int onlydidx, - dictScanFunction *scan_cb, - kvstoreScanShouldSkipDict *skip_cb, + hashtableScanFunction scan_cb, + kvstoreScanShouldSkipHashtable *skip_cb, void *privdata) { - unsigned long long _cursor = 0; - /* During dictionary traversal, 48 upper bits in the cursor are used for positioning in the HT. - * Following lower bits are used for the dict index number, ranging from 0 to 2^num_dicts_bits-1. - * Dict index is always 0 at the start of iteration and can be incremented only if there are - * multiple dicts. */ - int didx = getAndClearDictIndexFromCursor(kvs, &cursor); + unsigned long long next_cursor = 0; + /* During hash table traversal, 48 upper bits in the cursor are used for positioning in the HT. + * Following lower bits are used for the hashtable index number, ranging from 0 to 2^num_hashtables_bits-1. + * Hashtable index is always 0 at the start of iteration and can be incremented only if there are + * multiple hashtables. */ + int didx = getAndClearHashtableIndexFromCursor(kvs, &cursor); if (onlydidx >= 0) { if (didx < onlydidx) { /* Fast-forward to onlydidx. 
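
The scan cursor described above multiplexes two values: the low num_hashtables_bits select the table, and the upper 48 bits hold the per-table scan position. The helpers named in this hunk (getAndClearHashtableIndexFromCursor, addHashtableIndexToCursor) are defined outside it; the following is a plausible sketch of the packing they imply, not the actual bodies:

    static int getAndClearIndex(unsigned long long *cursor, int num_bits) {
        int didx = (int)(*cursor & ((1ULL << num_bits) - 1)); /* low bits: table index */
        *cursor >>= num_bits;                                 /* keep per-table cursor */
        return didx;
    }

    static void addIndex(unsigned long long *cursor, int didx, int num_bits) {
        *cursor = (*cursor << num_bits) | (unsigned long long)didx;
    }
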
*/ - assert(onlydidx < kvs->num_dicts); + assert(onlydidx < kvs->num_hashtables); didx = onlydidx; cursor = 0; } else if (didx > onlydidx) { @@ -392,55 +402,60 @@ unsigned long long kvstoreScan(kvstore *kvs, } } - dict *d = kvstoreGetDict(kvs, didx); + hashtable *ht = kvstoreGetHashtable(kvs, didx); - int skip = !d || (skip_cb && skip_cb(d)); + int skip = !ht || (skip_cb && skip_cb(ht)); if (!skip) { - _cursor = dictScan(d, cursor, scan_cb, privdata); - /* In dictScan, scan_cb may delete entries (e.g., in active expire case). */ - freeDictIfNeeded(kvs, didx); + next_cursor = hashtableScan(ht, cursor, scan_cb, privdata); + /* In hashtableScan, scan_cb may delete entries (e.g., in active expire case). */ + freeHashtableIfNeeded(kvs, didx); } - /* scanning done for the current dictionary or if the scanning wasn't possible, move to the next dict index. */ - if (_cursor == 0 || skip) { + /* scanning done for the current hash table or if the scanning wasn't possible, move to the next hashtable index. */ + if (next_cursor == 0 || skip) { if (onlydidx >= 0) return 0; - didx = kvstoreGetNextNonEmptyDictIndex(kvs, didx); + didx = kvstoreGetNextNonEmptyHashtableIndex(kvs, didx); } if (didx == -1) { return 0; } - addDictIndexToCursor(kvs, didx, &_cursor); - return _cursor; + addHashtableIndexToCursor(kvs, didx, &next_cursor); + return next_cursor; } /* * This functions increases size of kvstore to match desired number. - * It resizes all individual dictionaries, unless skip_cb indicates otherwise. + * It resizes all individual hash tables, unless skip_cb indicates otherwise. * - * Based on the parameter `try_expand`, appropriate dict expand API is invoked. - * if try_expand is set to 1, `dictTryExpand` is used else `dictExpand`. - * The return code is either `DICT_OK`/`DICT_ERR` for both the API(s). - * `DICT_OK` response is for successful expansion. However, `DICT_ERR` response signifies failure in allocation in - * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. + * Based on the parameter `try_expand`, appropriate hashtable expand API is invoked. + * if try_expand is set to 1, `hashtableTryExpand` is used else `hashtableExpand`. + * The return code is either 1 or 0 for both the API(s). + * 1 response is for successful expansion. However, 0 response signifies failure in allocation in + * `hashtableTryExpand` call and in case of `hashtableExpand` call it signifies no expansion was performed. */ -int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb) { +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipHashtableIndex *skip_cb) { if (newsize == 0) return 1; - for (int i = 0; i < kvs->num_dicts; i++) { + for (int i = 0; i < kvs->num_hashtables; i++) { if (skip_cb && skip_cb(i)) continue; - /* If the dictionary doesn't exist, create it */ - dict *d = createDictIfNeeded(kvs, i); - int result = try_expand ? dictTryExpand(d, newsize) : dictExpand(d, newsize); - if (try_expand && result == DICT_ERR) return 0; + /* If the hash table doesn't exist, create it. */ + hashtable *ht = createHashtableIfNeeded(kvs, i); + if (try_expand) { + if (!hashtableTryExpand(ht, newsize)) return 0; + } else { + hashtableExpand(ht, newsize); + } } return 1; } -/* Returns fair random dict index, probability of each dict being returned is proportional to the number of elements - * that dictionary holds. 
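
In practical terms, kvstoreExpand's try_expand flag decides whether an allocation failure is reported or tolerated: with try_expand set, the first failed hashtableTryExpand aborts and returns 0, otherwise hashtableExpand is used and failures are ignored. A usage sketch as it might appear inside a bulk-loading routine; expected_keys is a hypothetical estimate:

    uint64_t expected_keys = 1000000; /* hypothetical sizing estimate */
    if (!kvstoreExpand(kvs, expected_keys, 1 /* try_expand */, NULL)) {
        /* Allocation failed somewhere; fall back to incremental growth. */
    }
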
This function guarantees that it returns a dict-index of a non-empty dict, unless the entire - * kvstore is empty. Time complexity of this function is O(log(kvs->num_dicts)). */ -int kvstoreGetFairRandomDictIndex(kvstore *kvs) { - unsigned long target = kvstoreSize(kvs) ? (randomULong() % kvstoreSize(kvs)) + 1 : 0; - return kvstoreFindDictIndexByKeyIndex(kvs, target); +/* Returns fair random hashtable index, probability of each hashtable being + * returned is proportional to the number of elements that hash table holds. + * This function guarantees that it returns a hashtable-index of a non-empty + * hashtable, unless the entire kvstore is empty. Time complexity of this + * function is O(log(kvs->num_hashtables)). */ +int kvstoreGetFairRandomHashtableIndex(kvstore *kvs) { + unsigned long target = kvstoreSize(kvs) ? (random() % kvstoreSize(kvs)) + 1 : 0; + return kvstoreFindHashtableIndexByKeyIndex(kvs, target); } void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { @@ -449,40 +464,40 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { size_t l; char *orig_buf = buf; size_t orig_bufsize = bufsize; - dictStats *mainHtStats = NULL; - dictStats *rehashHtStats = NULL; - dict *d; + hashtableStats *mainHtStats = NULL; + hashtableStats *rehashHtStats = NULL; + hashtable *ht; kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs); - while ((d = kvstoreIteratorNextDict(kvs_it))) { - dictStats *stats = dictGetStatsHt(d, 0, full); + while ((ht = kvstoreIteratorNextHashtable(kvs_it))) { + hashtableStats *stats = hashtableGetStatsHt(ht, 0, full); if (!mainHtStats) { mainHtStats = stats; } else { - dictCombineStats(stats, mainHtStats); - dictFreeStats(stats); + hashtableCombineStats(stats, mainHtStats); + hashtableFreeStats(stats); } - if (dictIsRehashing(d)) { - stats = dictGetStatsHt(d, 1, full); + if (hashtableIsRehashing(ht)) { + stats = hashtableGetStatsHt(ht, 1, full); if (!rehashHtStats) { rehashHtStats = stats; } else { - dictCombineStats(stats, rehashHtStats); - dictFreeStats(stats); + hashtableCombineStats(stats, rehashHtStats); + hashtableFreeStats(stats); } } } kvstoreIteratorRelease(kvs_it); if (mainHtStats && bufsize > 0) { - l = dictGetStatsMsg(buf, bufsize, mainHtStats, full); - dictFreeStats(mainHtStats); + l = hashtableGetStatsMsg(buf, bufsize, mainHtStats, full); + hashtableFreeStats(mainHtStats); buf += l; bufsize -= l; } if (rehashHtStats && bufsize > 0) { - l = dictGetStatsMsg(buf, bufsize, rehashHtStats, full); - dictFreeStats(rehashHtStats); + l = hashtableGetStatsMsg(buf, bufsize, rehashHtStats, full); + hashtableFreeStats(rehashHtStats); buf += l; bufsize -= l; } @@ -490,142 +505,143 @@ void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full) { if (orig_bufsize) orig_buf[orig_bufsize - 1] = '\0'; } -/* Finds a dict containing target element in a key space ordered by dict index. - * Consider this example. Dictionaries are represented by brackets and keys by dots: +/* Finds a hashtable containing target element in a key space ordered by hashtable index. + * Consider this example. Hash Tables are represented by brackets and keys by dots: * #0 #1 #2 #3 #4 * [..][....][...][.......][.] * ^ * target * - * In this case dict #3 contains key that we are trying to find. + * In this case hashtable #3 contains key that we are trying to find. * - * The return value is 0 based dict-index, and the range of the target is [1..kvstoreSize], kvstoreSize inclusive. 
+ * The return value is 0 based hashtable-index, and the range of the target is [1..kvstoreSize], kvstoreSize inclusive. * - * To find the dict, we start with the root node of the binary index tree and search through its children - * from the highest index (2^num_dicts_bits in our case) to the lowest index. At each node, we check if the target + * To find the hashtable, we start with the root node of the binary index tree and search through its children + * from the highest index (2^num_hashtables_bits in our case) to the lowest index. At each node, we check if the target * value is greater than the node's value. If it is, we remove the node's value from the target and recursively * search for the new target using the current node as the parent. - * Time complexity of this function is O(log(kvs->num_dicts)) + * Time complexity of this function is O(log(kvs->num_hashtables)) */ -int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target) { - if (kvs->num_dicts == 1 || kvstoreSize(kvs) == 0) return 0; +int kvstoreFindHashtableIndexByKeyIndex(kvstore *kvs, unsigned long target) { + if (kvs->num_hashtables == 1 || kvstoreSize(kvs) == 0) return 0; assert(target <= kvstoreSize(kvs)); - int result = 0, bit_mask = 1 << kvs->num_dicts_bits; + int result = 0, bit_mask = 1 << kvs->num_hashtables_bits; for (int i = bit_mask; i != 0; i >>= 1) { int current = result + i; /* When the target index is greater than 'current' node value the we will update * the target and search in the 'current' node tree. */ - if (target > kvs->dict_size_index[current]) { - target -= kvs->dict_size_index[current]; + if (target > kvs->hashtable_size_index[current]) { + target -= kvs->hashtable_size_index[current]; result = current; } } - /* Adjust the result to get the correct dict: + /* Adjust the result to get the correct hashtable: * 1. result += 1; - * After the calculations, the index of target in dict_size_index should be the next one, + * After the calculations, the index of target in hashtable_size_index should be the next one, * so we should add 1. * 2. result -= 1; - * Unlike BIT(dict_size_index is 1-based), dict indices are 0-based, so we need to subtract 1. + * Unlike BIT(hashtable_size_index is 1-based), hashtable indices are 0-based, so we need to subtract 1. * As the addition and subtraction cancel each other out, we can simply return the result. */ return result; } -/* Wrapper for kvstoreFindDictIndexByKeyIndex to get the first non-empty dict index in the kvstore. */ -int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs) { - return kvstoreFindDictIndexByKeyIndex(kvs, 1); +/* Wrapper for kvstoreFindHashtableIndexByKeyIndex to get the first non-empty hashtable index in the kvstore. */ +int kvstoreGetFirstNonEmptyHashtableIndex(kvstore *kvs) { + return kvstoreFindHashtableIndexByKeyIndex(kvs, 1); } -/* Returns next non-empty dict index strictly after given one, or -1 if provided didx is the last one. */ -int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx) { - if (kvs->num_dicts == 1) { +/* Returns next non-empty hashtable index strictly after given one, or -1 if provided didx is the last one. */ +int kvstoreGetNextNonEmptyHashtableIndex(kvstore *kvs, int didx) { + if (kvs->num_hashtables == 1) { assert(didx == 0); return -1; } unsigned long long next_key = cumulativeKeyCountRead(kvs, didx) + 1; - return next_key <= kvstoreSize(kvs) ? kvstoreFindDictIndexByKeyIndex(kvs, next_key) : -1; + return next_key <= kvstoreSize(kvs) ? 
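
The descent described above is the classic Fenwick (binary indexed) tree prefix search. A standalone toy run using the comment's five-table example, with sizes {2, 4, 3, 7, 1} taken from the dot counts; key number 10 should land in table #3:

    #include <stdio.h>

    int main(void) {
        /* 1-based Fenwick tree over 8 slots holding sizes {2,4,3,7,1,0,0,0}. */
        int sizes[8] = {2, 4, 3, 7, 1, 0, 0, 0};
        unsigned long long bit[9] = {0};
        for (int i = 1; i <= 8; i++)
            for (int j = i; j <= 8; j += j & -j) bit[j] += sizes[i - 1];

        unsigned long target = 10; /* 1-based key index to locate */
        int result = 0;
        for (int i = 8; i != 0; i >>= 1) { /* 8 == 1 << num_hashtables_bits */
            int current = result + i;
            if (target > bit[current]) {
                target -= bit[current];
                result = current;
            }
        }
        printf("table index = %d\n", result); /* prints 3 */
        return 0;
    }
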
kvstoreFindHashtableIndexByKeyIndex(kvs, next_key) : -1; } -int kvstoreNumNonEmptyDicts(kvstore *kvs) { - return kvs->non_empty_dicts; +int kvstoreNumNonEmptyHashtables(kvstore *kvs) { + return kvs->non_empty_hashtables; } -int kvstoreNumAllocatedDicts(kvstore *kvs) { - return kvs->allocated_dicts; +int kvstoreNumAllocatedHashtables(kvstore *kvs) { + return kvs->allocated_hashtables; } -int kvstoreNumDicts(kvstore *kvs) { - return kvs->num_dicts; +int kvstoreNumHashtables(kvstore *kvs) { + return kvs->num_hashtables; } -/* Returns kvstore iterator that can be used to iterate through sub-dictionaries. +/* Returns kvstore iterator that can be used to iterate through sub-hash tables. * * The caller should free the resulting kvs_it with kvstoreIteratorRelease. */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs) { kvstoreIterator *kvs_it = zmalloc(sizeof(*kvs_it)); kvs_it->kvs = kvs; kvs_it->didx = -1; - kvs_it->next_didx = kvstoreGetFirstNonEmptyDictIndex(kvs_it->kvs); /* Finds first non-empty dict index. */ - dictInitSafeIterator(&kvs_it->di, NULL); + kvs_it->next_didx = kvstoreGetFirstNonEmptyHashtableIndex(kvs_it->kvs); /* Finds first non-empty hashtable index. */ + hashtableInitSafeIterator(&kvs_it->di, NULL); return kvs_it; } /* Free the kvs_it returned by kvstoreIteratorInit. */ void kvstoreIteratorRelease(kvstoreIterator *kvs_it) { - dictIterator *iter = &kvs_it->di; - dictResetIterator(iter); + hashtableIterator *iter = &kvs_it->di; + hashtableResetIterator(iter); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); zfree(kvs_it); } -/* Returns next dictionary from the iterator, or NULL if iteration is complete. */ -static dict *kvstoreIteratorNextDict(kvstoreIterator *kvs_it) { +/* Returns next hash table from the iterator, or NULL if iteration is complete. */ +static hashtable *kvstoreIteratorNextHashtable(kvstoreIterator *kvs_it) { if (kvs_it->next_didx == -1) return NULL; - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - if (kvs_it->didx != -1 && kvstoreGetDict(kvs_it->kvs, kvs_it->didx)) { - /* Before we move to the next dict, reset the iter of the previous dict. */ - dictIterator *iter = &kvs_it->di; - dictResetIterator(iter); + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. */ + if (kvs_it->didx != -1 && kvstoreGetHashtable(kvs_it->kvs, kvs_it->didx)) { + /* Before we move to the next hashtable, reset the iter of the previous hashtable. */ + hashtableIterator *iter = &kvs_it->di; + hashtableResetIterator(iter); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_it->kvs, kvs_it->didx); + freeHashtableIfNeeded(kvs_it->kvs, kvs_it->didx); } kvs_it->didx = kvs_it->next_didx; - kvs_it->next_didx = kvstoreGetNextNonEmptyDictIndex(kvs_it->kvs, kvs_it->didx); - return kvs_it->kvs->dicts[kvs_it->didx]; + kvs_it->next_didx = kvstoreGetNextNonEmptyHashtableIndex(kvs_it->kvs, kvs_it->didx); + return kvs_it->kvs->hashtables[kvs_it->didx]; } -int kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it) { - assert(kvs_it->didx >= 0 && kvs_it->didx < kvs_it->kvs->num_dicts); +int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it) { + assert(kvs_it->didx >= 0 && kvs_it->didx < kvs_it->kvs->num_hashtables); return kvs_it->didx; } -/* Returns next entry. */ -dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it) { - dictEntry *de = kvs_it->di.d ? 
dictNext(&kvs_it->di) : NULL; - if (!de) { /* No current dict or reached the end of the dictionary. */ - dict *d = kvstoreIteratorNextDict(kvs_it); - if (!d) return NULL; - dictInitSafeIterator(&kvs_it->di, d); - de = dictNext(&kvs_it->di); +/* Fetches the next element and returns 1. Returns 0 if there are no more elements. */ +int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next) { + if (kvs_it->didx != -1 && hashtableNext(&kvs_it->di, next)) { + return 1; + } else { + /* No current hashtable or reached the end of the hash table. */ + hashtable *ht = kvstoreIteratorNextHashtable(kvs_it); + if (!ht) return 0; + hashtableInitSafeIterator(&kvs_it->di, ht); + return hashtableNext(&kvs_it->di, next); } - return de; } -/* This method traverses through kvstore dictionaries and triggers a resize. +/* This method traverses through kvstore hash tables and triggers a resize. * It first tries to shrink if needed, and if it isn't, it tries to expand. */ -void kvstoreTryResizeDicts(kvstore *kvs, int limit) { - if (limit > kvs->num_dicts) limit = kvs->num_dicts; +void kvstoreTryResizeHashtables(kvstore *kvs, int limit) { + if (limit > kvs->num_hashtables) limit = kvs->num_hashtables; for (int i = 0; i < limit; i++) { int didx = kvs->resize_cursor; - dict *d = kvstoreGetDict(kvs, didx); - if (d && dictShrinkIfNeeded(d) == DICT_ERR) { - dictExpandIfNeeded(d); + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (ht && !hashtableShrinkIfNeeded(ht)) { + hashtableExpandIfNeeded(ht); } - kvs->resize_cursor = (didx + 1) % kvs->num_dicts; + kvs->resize_cursor = (didx + 1) % kvs->num_hashtables; } } @@ -639,14 +655,14 @@ void kvstoreTryResizeDicts(kvstore *kvs, int limit) { uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us) { if (listLength(kvs->rehashing) == 0) return 0; - /* Our goal is to rehash as many dictionaries as we can before reaching threshold_us, - * after each dictionary completes rehashing, it removes itself from the list. */ + /* Our goal is to rehash as many hash tables as we can before reaching threshold_us, + * after each hash table completes rehashing, it removes itself from the list. */ listNode *node; monotime timer; uint64_t elapsed_us = 0; elapsedStart(&timer); while ((node = listFirst(kvs->rehashing))) { - dictRehashMicroseconds(listNodeValue(node), threshold_us - elapsed_us); + hashtableRehashMicroseconds(listNodeValue(node), threshold_us - elapsed_us); elapsed_us = elapsedUs(timer); if (elapsed_us >= threshold_us) { @@ -656,118 +672,118 @@ uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us) { return elapsed_us; } +/* Size in bytes of hash tables used by the hashtables. 
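
kvstoreIteratorNext now follows the new hashtable convention: it returns 1 and writes the entry through an out pointer instead of returning a dictEntry pointer. A sketch of the resulting iteration pattern; note that in the keys kvstore an entry is the value robj itself once keys are embedded, per the object.c part of this patch:

    void *entry;
    kvstoreIterator *kvs_it = kvstoreIteratorInit(kvs);
    while (kvstoreIteratorNext(kvs_it, &entry)) {
        robj *o = entry; /* value object; its key is reachable via objectGetKey(o) */
        /* ... process o ... */
    }
    kvstoreIteratorRelease(kvs_it);
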
*/ size_t kvstoreOverheadHashtableLut(kvstore *kvs) { - return kvs->overhead_hashtable_lut * sizeof(dictEntry *); + return kvs->overhead_hashtable_lut; } size_t kvstoreOverheadHashtableRehashing(kvstore *kvs) { - return kvs->overhead_hashtable_rehashing * sizeof(dictEntry *); + return kvs->overhead_hashtable_rehashing; } -unsigned long kvstoreDictRehashingCount(kvstore *kvs) { +unsigned long kvstoreHashtableRehashingCount(kvstore *kvs) { return listLength(kvs->rehashing); } -unsigned long kvstoreDictSize(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictSize(d); +unsigned long kvstoreHashtableSize(kvstore *kvs, int didx) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableSize(ht); } -kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx) { - kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx) { + kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - dictInitIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + hashtableInitIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); return kvs_di; } -kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx) { - kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); +kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx) { + kvstoreHashtableIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; kvs_di->didx = didx; - dictInitSafeIterator(&kvs_di->di, kvstoreGetDict(kvs, didx)); + hashtableInitSafeIterator(&kvs_di->di, kvstoreGetHashtable(kvs, didx)); return kvs_di; } -/* Free the kvs_di returned by kvstoreGetDictIterator and kvstoreGetDictSafeIterator. */ -void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_di) { - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - if (kvstoreGetDict(kvs_di->kvs, kvs_di->didx)) { - dictResetIterator(&kvs_di->di); +/* Free the kvs_di returned by kvstoreGetHashtableIterator and kvstoreGetHashtableSafeIterator. */ +void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_di) { + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. */ + if (kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx)) { + hashtableResetIterator(&kvs_di->di); /* In the safe iterator context, we may delete entries. */ - freeDictIfNeeded(kvs_di->kvs, kvs_di->didx); + freeHashtableIfNeeded(kvs_di->kvs, kvs_di->didx); } zfree(kvs_di); } -/* Get the next element of the dict through kvstoreDictIterator and dictNext. */ -dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di) { - /* The dict may be deleted during the iteration process, so here need to check for NULL. */ - dict *d = kvstoreGetDict(kvs_di->kvs, kvs_di->didx); - if (!d) return NULL; - - return dictNext(&kvs_di->di); +/* Get the next element of the hashtable through kvstoreHashtableIterator and hashtableNext. */ +int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next) { + /* The hashtable may be deleted during the iteration process, so here need to check for NULL. 
*/ + hashtable *ht = kvstoreGetHashtable(kvs_di->kvs, kvs_di->didx); + if (!ht) return 0; + return hashtableNext(&kvs_di->di, next); } -dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictGetRandomKey(d); +int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **entry) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableRandomEntry(ht, entry); } -dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictGetFairRandomKey(d); +int kvstoreHashtableFairRandomEntry(kvstore *kvs, int didx, void **entry) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableFairRandomEntry(ht, entry); } -unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictGetSomeKeys(d, des, count); +unsigned int kvstoreHashtableSampleEntries(kvstore *kvs, int didx, void **dst, unsigned int count) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableSampleEntries(ht, dst, count); } -int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return DICT_ERR; - return dictExpand(d, size); +int kvstoreHashtableExpand(kvstore *kvs, int didx, unsigned long size) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableExpand(ht, size); } -unsigned long kvstoreDictScanDefrag(kvstore *kvs, - int didx, - unsigned long v, - dictScanFunction *fn, - const dictDefragFunctions *defragfns, - void *privdata) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return 0; - return dictScanDefrag(d, v, fn, defragfns, privdata); +unsigned long kvstoreHashtableScanDefrag(kvstore *kvs, + int didx, + unsigned long v, + hashtableScanFunction fn, + void *privdata, + void *(*defragfn)(void *), + int flags) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableScanDefrag(ht, v, fn, privdata, defragfn, flags); } -/* Unlike kvstoreDictScanDefrag(), this method doesn't defrag the data(keys and values) - * within dict, it only reallocates the memory used by the dict structure itself using - * the provided allocation function. This feature was added for the active defrag feature. +/* This function doesn't defrag the data (keys and values) within hashtable. It + * only reallocates the memory used by the hashtable structure itself using the + * provided allocation function. This feature was added for the active defrag + * feature. * - * With 16k dictionaries for cluster mode with 1 shard, this operation may require substantial time - * to execute. A "cursor" is used to perform the operation iteratively. When first called, a + * A "cursor" is used to perform the operation iteratively. When first called, a * cursor value of 0 should be provided. The return value is an updated cursor which should be * provided on the next iteration. The operation is complete when 0 is returned. * - * The 'defragfn' callback is called with a reference to the dict that callback can reallocate. 
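
All the per-index accessors above share one conversion rule: dict calls that returned a dictEntry pointer (or DICT_OK/DICT_ERR) now return 1/0 and pass entries through a void** out parameter. A sketch of iterating a single table, for example one cluster slot, with the safe iterator that tolerates deletions; kvs and didx are assumed to be in scope:

    void *entry;
    kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableSafeIterator(kvs, didx);
    while (kvstoreHashtableIteratorNext(kvs_di, &entry)) {
        /* Safe iterator: the current entry may be deleted while iterating. */
    }
    kvstoreReleaseHashtableIterator(kvs_di);

    /* Random sampling follows the same out-parameter convention: */
    if (kvstoreHashtableFairRandomEntry(kvs, didx, &entry)) {
        /* entry holds a fairly sampled element of table didx */
    }
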
*/ -unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn) { - for (int didx = cursor; didx < kvs->num_dicts; didx++) { - dict **d = kvstoreGetDictRef(kvs, didx), *newd; - if (!*d) continue; - - listNode *rehashing_node = NULL; - if (listLength(kvs->rehashing) > 0) { - rehashing_node = ((kvstoreDictMetadata *)dictMetadata(*d))->rehashing_node; + * The provided defragfn callback should return either NULL (if reallocation + * isn't necessary) or return a pointer to reallocated memory like realloc(). */ +unsigned long kvstoreHashtableDefragTables(kvstore *kvs, unsigned long cursor, void *(*defragfn)(void *)) { + for (int didx = cursor; didx < kvs->num_hashtables; didx++) { + hashtable **ref = kvstoreGetHashtableRef(kvs, didx), *new; + if (!*ref) continue; + new = hashtableDefragTables(*ref, defragfn); + if (new) { + *ref = new; + kvstoreHashtableMetadata *metadata = hashtableMetadata(new); + if (metadata->rehashing_node) metadata->rehashing_node->value = new; } - - if ((newd = defragfn(*d))) *d = newd; - if (rehashing_node) listNodeValue(rehashing_node) = *d; return (didx + 1); } return 0; @@ -777,68 +793,76 @@ uint64_t kvstoreGetHash(kvstore *kvs, const void *key) { return kvs->dtype->hashFunction(key); } -void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictFetchValue(d, key); +int kvstoreHashtableFind(kvstore *kvs, int didx, void *key, void **found) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + return hashtableFind(ht, key, found); } -dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictFind(d, key); +void **kvstoreHashtableFindRef(kvstore *kvs, int didx, const void *key) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return NULL; + return hashtableFindRef(ht, key); } -/* - * The kvstore handles `key` based on `dictType` during initialization: - * - If `dictType.embedded-entry` is 1, it clones the `key`. - * - Otherwise, it assumes ownership of the `key`. - * The caller must ensure the `key` is properly freed. - * - * kvstore current usage: - * - * 1. keyspace (db.keys) kvstore - creates a copy of the key. - * 2. expiry (db.expires), pubsub_channels and pubsubshard_channels kvstore - takes ownership of the key. 
- */ -dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing) { - dict *d = createDictIfNeeded(kvs, didx); - dictEntry *ret = dictAddRaw(d, key, existing); +int kvstoreHashtableAddOrFind(kvstore *kvs, int didx, void *key, void **existing) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + int ret = hashtableAddOrFind(ht, key, existing); + if (ret) cumulativeKeyCountAdd(kvs, didx, 1); + return ret; +} + +int kvstoreHashtableAdd(kvstore *kvs, int didx, void *entry) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + int ret = hashtableAdd(ht, entry); if (ret) cumulativeKeyCountAdd(kvs, didx, 1); return ret; } -void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry *de, void *key) { - dict *d = kvstoreGetDict(kvs, didx); - dictSetKey(d, de, key); +int kvstoreHashtableFindPositionForInsert(kvstore *kvs, int didx, void *key, hashtablePosition *position, void **existing) { + hashtable *ht = createHashtableIfNeeded(kvs, didx); + return hashtableFindPositionForInsert(ht, key, position, existing); } -void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val) { - UNUSED(kvs); - UNUSED(didx); - dictSetVal(NULL, de, val); +/* Must be used together with kvstoreHashtableFindPositionForInsert, with returned + * position and with the same didx. */ +void kvstoreHashtableInsertAtPosition(kvstore *kvs, int didx, void *entry, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + hashtableInsertAtPosition(ht, entry, position); + cumulativeKeyCountAdd(kvs, didx, 1); } -dictEntry * -kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return NULL; - return dictTwoPhaseUnlinkFind(kvstoreGetDict(kvs, didx), key, plink, table_index); +void **kvstoreHashtableTwoPhasePopFindRef(kvstore *kvs, int didx, const void *key, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return NULL; + return hashtableTwoPhasePopFindRef(ht, key, position); } -void kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index) { - dict *d = kvstoreGetDict(kvs, didx); - dictTwoPhaseUnlinkFree(d, he, plink, table_index); +void kvstoreHashtableTwoPhasePopDelete(kvstore *kvs, int didx, void *position) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + hashtableTwoPhasePopDelete(ht, position); cumulativeKeyCountAdd(kvs, didx, -1); - freeDictIfNeeded(kvs, didx); + freeHashtableIfNeeded(kvs, didx); +} + +int kvstoreHashtablePop(kvstore *kvs, int didx, const void *key, void **popped) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + int ret = hashtablePop(ht, key, popped); + if (ret) { + cumulativeKeyCountAdd(kvs, didx, -1); + freeHashtableIfNeeded(kvs, didx); + } + return ret; } -int kvstoreDictDelete(kvstore *kvs, int didx, const void *key) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return DICT_ERR; - int ret = dictDelete(d, key); - if (ret == DICT_OK) { +int kvstoreHashtableDelete(kvstore *kvs, int didx, const void *key) { + hashtable *ht = kvstoreGetHashtable(kvs, didx); + if (!ht) return 0; + int ret = hashtableDelete(ht, key); + if (ret) { cumulativeKeyCountAdd(kvs, didx, -1); - freeDictIfNeeded(kvs, didx); + freeHashtableIfNeeded(kvs, didx); } return ret; } diff --git a/src/kvstore.h b/src/kvstore.h index 00ec472e73..1a8c74a6b9 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -1,20 +1,20 @@ -#ifndef DICTARRAY_H_ -#define DICTARRAY_H_ +#ifndef KVSTORE_H 
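
The position-based API defined above avoids hashing the key twice when an insert has to construct its entry first. A sketch of the intended call sequence; entry construction uses createObjectWithKeyAndExpire from the object.c part of this patch, and the return-value semantics of the find call (1 when the key is absent and a position was reserved) are assumed from context:

    hashtablePosition pos;
    void *existing;
    if (kvstoreHashtableFindPositionForInsert(kvs, didx, key, &pos, &existing)) {
        /* Key absent: build the entry (an robj with embedded key) and insert
         * it at the reserved position without a second lookup. */
        robj *o = createObjectWithKeyAndExpire(OBJ_STRING, sdsnew("value"), key, -1);
        kvstoreHashtableInsertAtPosition(kvs, didx, o, &pos);
    } else {
        /* Key present: `existing` points at the current entry. */
    }
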
+#define KVSTORE_H -#include "dict.h" +#include "hashtable.h" #include "adlist.h" typedef struct _kvstore kvstore; typedef struct _kvstoreIterator kvstoreIterator; -typedef struct _kvstoreDictIterator kvstoreDictIterator; +typedef struct _kvstoreHashtableIterator kvstoreHashtableIterator; -typedef int(kvstoreScanShouldSkipDict)(dict *d); -typedef int(kvstoreExpandShouldSkipDictIndex)(int didx); +typedef int(kvstoreScanShouldSkipHashtable)(hashtable *d); +typedef int(kvstoreExpandShouldSkipHashtableIndex)(int didx); -#define KVSTORE_ALLOCATE_DICTS_ON_DEMAND (1 << 0) -#define KVSTORE_FREE_EMPTY_DICTS (1 << 1) -kvstore *kvstoreCreate(dictType *type, int num_dicts_bits, int flags); -void kvstoreEmpty(kvstore *kvs, void(callback)(dict *)); +#define KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND (1 << 0) +#define KVSTORE_FREE_EMPTY_HASHTABLES (1 << 1) +kvstore *kvstoreCreate(hashtableType *type, int num_hashtables_bits, int flags); +void kvstoreEmpty(kvstore *kvs, void(callback)(hashtable *)); void kvstoreRelease(kvstore *kvs); unsigned long long kvstoreSize(kvstore *kvs); unsigned long kvstoreBuckets(kvstore *kvs); @@ -22,64 +22,69 @@ size_t kvstoreMemUsage(kvstore *kvs); unsigned long long kvstoreScan(kvstore *kvs, unsigned long long cursor, int onlydidx, - dictScanFunction *scan_cb, - kvstoreScanShouldSkipDict *skip_cb, + hashtableScanFunction scan_cb, + kvstoreScanShouldSkipHashtable *skip_cb, void *privdata); -int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb); -int kvstoreGetFairRandomDictIndex(kvstore *kvs); +int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipHashtableIndex *skip_cb); +int kvstoreGetFairRandomHashtableIndex(kvstore *kvs); void kvstoreGetStats(kvstore *kvs, char *buf, size_t bufsize, int full); -int kvstoreFindDictIndexByKeyIndex(kvstore *kvs, unsigned long target); -int kvstoreGetFirstNonEmptyDictIndex(kvstore *kvs); -int kvstoreGetNextNonEmptyDictIndex(kvstore *kvs, int didx); -int kvstoreNumNonEmptyDicts(kvstore *kvs); -int kvstoreNumAllocatedDicts(kvstore *kvs); -int kvstoreNumDicts(kvstore *kvs); +int kvstoreFindHashtableIndexByKeyIndex(kvstore *kvs, unsigned long target); +int kvstoreGetFirstNonEmptyHashtableIndex(kvstore *kvs); +int kvstoreGetNextNonEmptyHashtableIndex(kvstore *kvs, int didx); +int kvstoreNumNonEmptyHashtables(kvstore *kvs); +int kvstoreNumAllocatedHashtables(kvstore *kvs); +int kvstoreNumHashtables(kvstore *kvs); uint64_t kvstoreGetHash(kvstore *kvs, const void *key); -void kvstoreDictRehashingStarted(dict *d); -void kvstoreDictRehashingCompleted(dict *d); -size_t kvstoreDictMetadataSize(dict *d); +void kvstoreHashtableRehashingStarted(hashtable *d); +void kvstoreHashtableRehashingCompleted(hashtable *d); +void kvstoreHashtableTrackMemUsage(hashtable *s, ssize_t delta); +size_t kvstoreHashtableMetadataSize(void); /* kvstore iterator specific functions */ kvstoreIterator *kvstoreIteratorInit(kvstore *kvs); void kvstoreIteratorRelease(kvstoreIterator *kvs_it); -int kvstoreIteratorGetCurrentDictIndex(kvstoreIterator *kvs_it); -dictEntry *kvstoreIteratorNext(kvstoreIterator *kvs_it); +int kvstoreIteratorGetCurrentHashtableIndex(kvstoreIterator *kvs_it); +int kvstoreIteratorNext(kvstoreIterator *kvs_it, void **next); /* Rehashing */ -void kvstoreTryResizeDicts(kvstore *kvs, int limit); +void kvstoreTryResizeHashtables(kvstore *kvs, int limit); uint64_t kvstoreIncrementallyRehash(kvstore *kvs, uint64_t threshold_us); size_t kvstoreOverheadHashtableLut(kvstore *kvs); 
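
For completeness, a sketch of driving kvstoreScan from the declarations above: the callback receives each entry, and the returned cursor is fed back in until it reaches 0. The callback shape mirrors moduleScanCallback later in this patch; the function names here are hypothetical:

    static void countEntryCb(void *privdata, void *entry) {
        (void)entry;
        (*(unsigned long *)privdata)++;
    }

    static unsigned long countAllEntries(kvstore *kvs) {
        unsigned long count = 0;
        unsigned long long cursor = 0;
        do {
            cursor = kvstoreScan(kvs, cursor, -1 /* all tables */, countEntryCb, NULL, &count);
        } while (cursor != 0);
        return count;
    }
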
size_t kvstoreOverheadHashtableRehashing(kvstore *kvs); -unsigned long kvstoreDictRehashingCount(kvstore *kvs); +unsigned long kvstoreHashtableRehashingCount(kvstore *kvs); -/* Specific dict access by dict-index */ -unsigned long kvstoreDictSize(kvstore *kvs, int didx); -kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx); -kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx); -void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id); -dictEntry *kvstoreDictIteratorNext(kvstoreDictIterator *kvs_di); -dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx); -dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx); -unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, unsigned int count); -int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size); -unsigned long kvstoreDictScanDefrag(kvstore *kvs, - int didx, - unsigned long v, - dictScanFunction *fn, - const dictDefragFunctions *defragfns, - void *privdata); -typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); -unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn); -void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); -dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); -dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing); -void kvstoreDictSetKey(kvstore *kvs, int didx, dictEntry *de, void *key); -void kvstoreDictSetVal(kvstore *kvs, int didx, dictEntry *de, void *val); -dictEntry *kvstoreDictTwoPhaseUnlinkFind(kvstore *kvs, int didx, const void *key, dictEntry ***plink, int *table_index); -void kvstoreDictTwoPhaseUnlinkFree(kvstore *kvs, int didx, dictEntry *he, dictEntry **plink, int table_index); -int kvstoreDictDelete(kvstore *kvs, int didx, const void *key); -dict *kvstoreGetDict(kvstore *kvs, int didx); +/* Specific hashtable access by hashtable-index */ +unsigned long kvstoreHashtableSize(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableIterator(kvstore *kvs, int didx); +kvstoreHashtableIterator *kvstoreGetHashtableSafeIterator(kvstore *kvs, int didx); +void kvstoreReleaseHashtableIterator(kvstoreHashtableIterator *kvs_id); +int kvstoreHashtableIteratorNext(kvstoreHashtableIterator *kvs_di, void **next); +int kvstoreHashtableRandomEntry(kvstore *kvs, int didx, void **found); +int kvstoreHashtableFairRandomEntry(kvstore *kvs, int didx, void **found); +unsigned int kvstoreHashtableSampleEntries(kvstore *kvs, int didx, void **dst, unsigned int count); +int kvstoreHashtableExpand(kvstore *kvs, int didx, unsigned long size); +unsigned long kvstoreHashtableScanDefrag(kvstore *kvs, + int didx, + unsigned long v, + hashtableScanFunction fn, + void *privdata, + void *(*defragfn)(void *), + int flags); +unsigned long kvstoreHashtableDefragTables(kvstore *kvs, unsigned long cursor, void *(*defragfn)(void *)); +int kvstoreHashtableFind(kvstore *kvs, int didx, void *key, void **found); +void **kvstoreHashtableFindRef(kvstore *kvs, int didx, const void *key); +int kvstoreHashtableAddOrFind(kvstore *kvs, int didx, void *key, void **existing); +int kvstoreHashtableAdd(kvstore *kvs, int didx, void *entry); -#endif /* DICTARRAY_H_ */ +int kvstoreHashtableFindPositionForInsert(kvstore *kvs, int didx, void *key, hashtablePosition *position, void **existing); +void kvstoreHashtableInsertAtPosition(kvstore *kvs, int didx, void *entry, void *position); + +void **kvstoreHashtableTwoPhasePopFindRef(kvstore *kvs, int didx, const void 
*key, void *position); +void kvstoreHashtableTwoPhasePopDelete(kvstore *kvs, int didx, void *position); +int kvstoreHashtablePop(kvstore *kvs, int didx, const void *key, void **popped); +int kvstoreHashtableDelete(kvstore *kvs, int didx, const void *key); +hashtable *kvstoreGetHashtable(kvstore *kvs, int didx); + +#endif /* KVSTORE_H */ diff --git a/src/lazyfree.c b/src/lazyfree.c index 6176b43440..14a4454d7a 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -186,14 +186,14 @@ void freeObjAsync(robj *key, robj *obj, int dbid) { * lazy freeing. */ void emptyDbAsync(serverDb *db) { int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } kvstore *oldkeys = db->keys, *oldexpires = db->expires; - db->keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - db->expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + db->keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + db->expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); atomic_fetch_add_explicit(&lazyfree_objects, kvstoreSize(oldkeys), memory_order_relaxed); bioCreateLazyFreeJob(lazyfreeFreeDatabase, 2, oldkeys, oldexpires); } diff --git a/src/memory_prefetch.c b/src/memory_prefetch.c index d888170176..ef6a6c6d02 100644 --- a/src/memory_prefetch.c +++ b/src/memory_prefetch.c @@ -9,65 +9,16 @@ #include "memory_prefetch.h" #include "server.h" -#include "dict.h" - -/* Forward declarations of dict.c functions */ -dictEntry *dictGetNext(const dictEntry *de); - -/* Forward declarations of kvstore.c functions */ -dict *kvstoreGetDict(kvstore *kvs, int didx); - -typedef enum { - HT_IDX_FIRST = 0, - HT_IDX_SECOND = 1, - HT_IDX_INVALID = -1 -} HashTableIndex; typedef enum { - PREFETCH_BUCKET, /* Initial state, determines which hash table to use and prefetch the table's bucket */ - PREFETCH_ENTRY, /* prefetch entries associated with the given key's hash */ - PREFETCH_VALUE, /* prefetch the value object of the entry found in the previous step */ - PREFETCH_VALUE_DATA, /* prefetch the value object's data (if applicable) */ - PREFETCH_DONE /* Indicates that prefetching for this key is complete */ + PREFETCH_ENTRY, /* Initial state, prefetch entries associated with the given key's hash */ + PREFETCH_VALUE, /* prefetch the value object of the entry found in the previous step */ + PREFETCH_DONE /* Indicates that prefetching for this key is complete */ } PrefetchState; - -/************************************ State machine diagram for the prefetch operation. 
******************************** - │ - start - │ - ┌────────▼─────────┐ - ┌─────────►│ PREFETCH_BUCKET ├────►────────┐ - │ └────────┬─────────┘ no more tables -> done - | bucket|found | - │ | │ - entry not found - goto next table ┌────────▼────────┐ │ - └────◄─────┤ PREFETCH_ENTRY | ▼ - ┌────────────►└────────┬────────┘ │ - | Entry│found │ - │ | │ - value not found - goto next entry ┌───────▼────────┐ | - └───────◄──────┤ PREFETCH_VALUE | ▼ - └───────┬────────┘ │ - Value│found │ - | | - ┌───────────▼──────────────┐ │ - │ PREFETCH_VALUE_DATA │ ▼ - └───────────┬──────────────┘ │ - | │ - ┌───────-─▼─────────────┐ │ - │ PREFETCH_DONE │◄────────┘ - └───────────────────────┘ -**********************************************************************************************************************/ - -typedef void *(*GetValueDataFunc)(const void *val); - typedef struct KeyPrefetchInfo { - PrefetchState state; /* Current state of the prefetch operation */ - HashTableIndex ht_idx; /* Index of the current hash table (0 or 1 for rehashing) */ - uint64_t bucket_idx; /* Index of the bucket in the current hash table */ - uint64_t key_hash; /* Hash value of the key being prefetched */ - dictEntry *current_entry; /* Pointer to the current entry being processed */ + PrefetchState state; /* Current state of the prefetch operation */ + hashtableIncrementalFindState hashtab_state; } KeyPrefetchInfo; /* PrefetchCommandsBatch structure holds the state of the current batch of client commands being processed. */ @@ -81,9 +32,7 @@ typedef struct PrefetchCommandsBatch { int *slots; /* Array of slots for each key */ void **keys; /* Array of keys to prefetch in the current batch */ client **clients; /* Array of clients in the current batch */ - dict **keys_dicts; /* Main dict for each key */ - dict **expire_dicts; /* Expire dict for each key */ - dict **current_dicts; /* Points to either keys_dicts or expire_dicts */ + hashtable **keys_tables; /* Main table for each key */ KeyPrefetchInfo *prefetch_info; /* Prefetch info for each key */ } PrefetchCommandsBatch; @@ -96,8 +45,7 @@ void freePrefetchCommandsBatch(void) { zfree(batch->clients); zfree(batch->keys); - zfree(batch->keys_dicts); - zfree(batch->expire_dicts); + zfree(batch->keys_tables); zfree(batch->slots); zfree(batch->prefetch_info); zfree(batch); @@ -116,8 +64,7 @@ void prefetchCommandsBatchInit(void) { batch->max_prefetch_size = max_prefetch_size; batch->clients = zcalloc(max_prefetch_size * sizeof(client *)); batch->keys = zcalloc(max_prefetch_size * sizeof(void *)); - batch->keys_dicts = zcalloc(max_prefetch_size * sizeof(dict *)); - batch->expire_dicts = zcalloc(max_prefetch_size * sizeof(dict *)); + batch->keys_tables = zcalloc(max_prefetch_size * sizeof(hashtable *)); batch->slots = zcalloc(max_prefetch_size * sizeof(int)); batch->prefetch_info = zcalloc(max_prefetch_size * sizeof(KeyPrefetchInfo)); } @@ -132,10 +79,8 @@ void onMaxBatchSizeChange(void) { prefetchCommandsBatchInit(); } -/* Prefetch the given pointer and move to the next key in the batch. */ -static void prefetchAndMoveToNextKey(void *addr) { - valkey_prefetch(addr); - /* While the prefetch is in progress, we can continue to the next key */ +/* Move to the next key in the batch. 
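
The slimmed-down state machine above leans on the new hashtable's incremental find API: initialize a find state per key, call the step function until it reports completion, then fetch the result. A sketch of that flow in isolation, with signatures inferred from the calls in this diff; table and key are assumed to be in scope:

    hashtableIncrementalFindState st;
    hashtableIncrementalFindInit(&st, table, key);

    /* Each step issues a prefetch for the next memory it needs and returns 1
     * while more steps remain, so other keys can be advanced in the meantime. */
    while (hashtableIncrementalFindStep(&st) == 1) {
        /* interleave work on other keys here */
    }

    void *entry;
    if (hashtableIncrementalFindGetResult(&st, &entry)) {
        /* key found; entry is the stored element */
    }
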
*/ +static void moveToNextKey(void) { batch->cur_idx = (batch->cur_idx + 1) % batch->key_count; } @@ -156,144 +101,64 @@ static KeyPrefetchInfo *getNextPrefetchInfo(void) { return NULL; } -static void initBatchInfo(dict **dicts) { - batch->current_dicts = dicts; - +static void initBatchInfo(hashtable **tables) { /* Initialize the prefetch info */ for (size_t i = 0; i < batch->key_count; i++) { KeyPrefetchInfo *info = &batch->prefetch_info[i]; - if (!batch->current_dicts[i] || dictSize(batch->current_dicts[i]) == 0) { + if (!tables[i] || hashtableSize(tables[i]) == 0) { info->state = PREFETCH_DONE; batch->keys_done++; continue; } - info->ht_idx = HT_IDX_INVALID; - info->current_entry = NULL; - info->state = PREFETCH_BUCKET; - info->key_hash = dictHashKey(batch->current_dicts[i], batch->keys[i]); - } -} - -/* Prefetch the bucket of the next hash table index. - * If no tables are left, move to the PREFETCH_DONE state. */ -static void prefetchBucket(KeyPrefetchInfo *info) { - size_t i = batch->cur_idx; - - /* Determine which hash table to use */ - if (info->ht_idx == HT_IDX_INVALID) { - info->ht_idx = HT_IDX_FIRST; - } else if (info->ht_idx == HT_IDX_FIRST && dictIsRehashing(batch->current_dicts[i])) { - info->ht_idx = HT_IDX_SECOND; - } else { - /* No more tables left - mark as done. */ - markKeyAsdone(info); - return; + info->state = PREFETCH_ENTRY; + hashtableIncrementalFindInit(&info->hashtab_state, tables[i], batch->keys[i]); } - - /* Prefetch the bucket */ - info->bucket_idx = info->key_hash & DICTHT_SIZE_MASK(batch->current_dicts[i]->ht_size_exp[info->ht_idx]); - prefetchAndMoveToNextKey(&batch->current_dicts[i]->ht_table[info->ht_idx][info->bucket_idx]); - info->current_entry = NULL; - info->state = PREFETCH_ENTRY; } -/* Prefetch the next entry in the bucket and move to the PREFETCH_VALUE state. - * If no more entries in the bucket, move to the PREFETCH_BUCKET state to look at the next table. */ static void prefetchEntry(KeyPrefetchInfo *info) { - size_t i = batch->cur_idx; - - if (info->current_entry) { - /* We already found an entry in the bucket - move to the next entry */ - info->current_entry = dictGetNext(info->current_entry); + if (hashtableIncrementalFindStep(&info->hashtab_state) == 1) { + /* Not done yet */ + moveToNextKey(); } else { - /* Go to the first entry in the bucket */ - info->current_entry = batch->current_dicts[i]->ht_table[info->ht_idx][info->bucket_idx]; - } - - if (info->current_entry) { - prefetchAndMoveToNextKey(info->current_entry); info->state = PREFETCH_VALUE; - } else { - /* No entry found in the bucket - try the bucket in the next table */ - info->state = PREFETCH_BUCKET; } } -/* Prefetch the entry's value. If the value is found, move to the PREFETCH_VALUE_DATA state. - * If the value is not found, move to the PREFETCH_ENTRY state to look at the next entry in the bucket. */ +/* Prefetch the entry's value. 
If the incremental find completed with a match, the raw string value is prefetched; the key is marked as done in either case. */
 static void prefetchValue(KeyPrefetchInfo *info) {
-    size_t i = batch->cur_idx;
-    void *value = dictGetVal(info->current_entry);
-
-    if (dictGetNext(info->current_entry) == NULL && !dictIsRehashing(batch->current_dicts[i])) {
-        /* If this is the last element, we assume a hit and don't compare the keys */
-        prefetchAndMoveToNextKey(value);
-        info->state = PREFETCH_VALUE_DATA;
-        return;
-    }
-
-    void *current_entry_key = dictGetKey(info->current_entry);
-    if (batch->keys[i] == current_entry_key ||
-        dictCompareKeys(batch->current_dicts[i], batch->keys[i], current_entry_key)) {
-        /* If the key is found, prefetch the value */
-        prefetchAndMoveToNextKey(value);
-        info->state = PREFETCH_VALUE_DATA;
-    } else {
-        /* Move to the next entry */
-        info->state = PREFETCH_ENTRY;
+    void *entry;
+    if (hashtableIncrementalFindGetResult(&info->hashtab_state, &entry)) {
+        robj *val = entry;
+        if (val->encoding == OBJ_ENCODING_RAW && val->type == OBJ_STRING) {
+            valkey_prefetch(val->ptr);
+        }
     }
-}
-
-/* Prefetch the value data if available. */
-static void prefetchValueData(KeyPrefetchInfo *info, GetValueDataFunc get_val_data_func) {
-    if (get_val_data_func) {
-        void *value_data = get_val_data_func(dictGetVal(info->current_entry));
-        if (value_data) prefetchAndMoveToNextKey(value_data);
-    }
     markKeyAsdone(info);
 }
 
-/* Prefetch dictionary data for an array of keys.
+/* Prefetch hashtable data for an array of keys.
  *
- * This function takes an array of dictionaries and keys, attempting to bring
- * data closer to the L1 cache that might be needed for dictionary operations
+ * This function takes an array of tables and keys, attempting to bring
+ * data closer to the L1 cache that might be needed for hashtable operations
  * on those keys.
  *
- * The dictFind algorithm:
- * 1. Evaluate the hash of the key
- * 2. Access the index in the first table
- * 3. Walk the entries linked list until the key is found
- *    If the key hasn't been found and the dictionary is in the middle of rehashing,
- *    access the index on the second table and repeat step 3
- *
- * dictPrefetch executes the same algorithm as dictFind, but one step at a time
- * for each key. Instead of waiting for data to be read from memory, it prefetches
- * the data and then moves on to execute the next prefetch for another key.
- *
- * dicts - An array of dictionaries to prefetch data from.
- * get_val_data_func - A callback function that dictPrefetch can invoke
+ * tables - An array of hashtables to prefetch data from.
+ * When a key is found, its raw string value is prefetched too, which serves
  * to bring the key's value data closer to the L1 cache as well. */
-static void dictPrefetch(dict **dicts, GetValueDataFunc get_val_data_func) {
-    initBatchInfo(dicts);
+static void hashtablePrefetch(hashtable **tables) {
+    initBatchInfo(tables);
     KeyPrefetchInfo *info;
     while ((info = getNextPrefetchInfo())) {
         switch (info->state) {
-        case PREFETCH_BUCKET: prefetchBucket(info); break;
         case PREFETCH_ENTRY: prefetchEntry(info); break;
         case PREFETCH_VALUE: prefetchValue(info); break;
-        case PREFETCH_VALUE_DATA: prefetchValueData(info, get_val_data_func); break;
         default: serverPanic("Unknown prefetch state %d", info->state);
         }
     }
 }
 
-/* Helper function to get the value pointer of an object. */
-static void *getObjectValuePtr(const void *val) {
-    robj *o = (robj *)val;
-    return (o->type == OBJ_STRING && o->encoding == OBJ_ENCODING_RAW) ? 
o->ptr : NULL; -} - static void resetCommandsBatch(void) { batch->cur_idx = 0; batch->keys_done = 0; @@ -304,7 +169,7 @@ static void resetCommandsBatch(void) { /* Prefetch command-related data: * 1. Prefetch the command arguments allocated by the I/O thread to bring them closer to the L1 cache. - * 2. Prefetch the keys and values for all commands in the current batch from the main and expires dictionaries. */ + * 2. Prefetch the keys and values for all commands in the current batch from the main hashtable. */ static void prefetchCommands(void) { /* Prefetch argv's for all clients */ for (size_t i = 0; i < batch->client_count; i++) { @@ -332,13 +197,11 @@ static void prefetchCommands(void) { batch->keys[i] = ((robj *)batch->keys[i])->ptr; } - /* Prefetch dict keys for all commands. Prefetching is beneficial only if there are more than one key. */ + /* Prefetch hashtable keys for all commands. Prefetching is beneficial only if there are more than one key. */ if (batch->key_count > 1) { server.stat_total_prefetch_batches++; - /* Prefetch keys from the main dict */ - dictPrefetch(batch->keys_dicts, getObjectValuePtr); - /* Prefetch keys from the expires dict - no value data to prefetch */ - dictPrefetch(batch->expire_dicts, NULL); + /* Prefetch keys from the main hashtable */ + hashtablePrefetch(batch->keys_tables); } } @@ -388,8 +251,7 @@ int addCommandToBatchAndProcessIfFull(client *c) { for (int i = 0; i < num_keys && batch->key_count < batch->max_prefetch_size; i++) { batch->keys[batch->key_count] = c->argv[result.keys[i].pos]; batch->slots[batch->key_count] = c->slot > 0 ? c->slot : 0; - batch->keys_dicts[batch->key_count] = kvstoreGetDict(c->db->keys, batch->slots[batch->key_count]); - batch->expire_dicts[batch->key_count] = kvstoreGetDict(c->db->expires, batch->slots[batch->key_count]); + batch->keys_tables[batch->key_count] = kvstoreGetHashtable(c->db->keys, batch->slots[batch->key_count]); batch->key_count++; } getKeysFreeResult(&result); diff --git a/src/module.c b/src/module.c index 05ab032800..9bcf68646e 100644 --- a/src/module.c +++ b/src/module.c @@ -718,7 +718,7 @@ int moduleCreateEmptyKey(ValkeyModuleKey *key, int type) { case VALKEYMODULE_KEYTYPE_STREAM: obj = createStreamObject(); break; default: return VALKEYMODULE_ERR; } - dbAdd(key->db, key->key, obj); + dbAdd(key->db, key->key, &obj); key->value = obj; moduleInitKeyTypeSpecific(key); return VALKEYMODULE_OK; @@ -4196,7 +4196,7 @@ int VM_SetExpire(ValkeyModuleKey *key, mstime_t expire) { return VALKEYMODULE_ERR; if (expire != VALKEYMODULE_NO_EXPIRE) { expire += commandTimeSnapshot(); - setExpire(key->ctx->client, key->db, key->key, expire); + key->value = setExpire(key->ctx->client, key->db, key->key, expire); } else { removeExpire(key->db, key->key); } @@ -4225,7 +4225,7 @@ int VM_SetAbsExpire(ValkeyModuleKey *key, mstime_t expire) { if (!(key->mode & VALKEYMODULE_WRITE) || key->value == NULL || (expire < 0 && expire != VALKEYMODULE_NO_EXPIRE)) return VALKEYMODULE_ERR; if (expire != VALKEYMODULE_NO_EXPIRE) { - setExpire(key->ctx->client, key->db, key->key, expire); + key->value = setExpire(key->ctx->client, key->db, key->key, expire); } else { removeExpire(key->db, key->key); } @@ -4286,7 +4286,9 @@ int VM_GetToDbIdFromOptCtx(ValkeyModuleKeyOptCtx *ctx) { int VM_StringSet(ValkeyModuleKey *key, ValkeyModuleString *str) { if (!(key->mode & VALKEYMODULE_WRITE) || key->iter) return VALKEYMODULE_ERR; VM_DeleteKey(key); - setKey(key->ctx->client, key->db, key->key, str, SETKEY_NO_SIGNAL); + /* Retain str so setKey copies it to 
db rather than reallocating it. */ + incrRefCount(str); + setKey(key->ctx->client, key->db, key->key, &str, SETKEY_NO_SIGNAL); key->value = str; return VALKEYMODULE_OK; } @@ -4366,9 +4368,8 @@ int VM_StringTruncate(ValkeyModuleKey *key, size_t newlen) { if (key->value == NULL) { /* Empty key: create it with the new size. */ robj *o = createObject(OBJ_STRING, sdsnewlen(NULL, newlen)); - setKey(key->ctx->client, key->db, key->key, o, SETKEY_NO_SIGNAL); + setKey(key->ctx->client, key->db, key->key, &o, SETKEY_NO_SIGNAL); key->value = o; - decrRefCount(o); } else { /* Unshare and resize. */ key->value = dbUnshareStringValue(key->db, key->key, key->value); @@ -6933,8 +6934,7 @@ int VM_ModuleTypeSetValue(ValkeyModuleKey *key, moduleType *mt, void *value) { if (!(key->mode & VALKEYMODULE_WRITE) || key->iter) return VALKEYMODULE_ERR; VM_DeleteKey(key); robj *o = createModuleObject(mt, value); - setKey(key->ctx->client, key->db, key->key, o, SETKEY_NO_SIGNAL); - decrRefCount(o); + setKey(key->ctx->client, key->db, key->key, &o, SETKEY_NO_SIGNAL); key->value = o; return VALKEYMODULE_OK; } @@ -10900,10 +10900,10 @@ typedef struct ValkeyModuleScanCursor { int done; } ValkeyModuleScanCursor; -static void moduleScanCallback(void *privdata, const dictEntry *de) { +static void moduleScanCallback(void *privdata, void *element) { ScanCBData *data = privdata; - sds key = dictGetKey(de); - robj *val = dictGetVal(de); + robj *val = element; + sds key = objectGetKey(val); ValkeyModuleString *keyname = createObject(OBJ_STRING, sdsdup(key)); /* Setup the key handle. */ diff --git a/src/object.c b/src/object.c index 035198ad89..ac1c26adf9 100644 --- a/src/object.c +++ b/src/object.c @@ -41,18 +41,68 @@ #define strtold(a, b) ((long double)strtod((a), (b))) #endif +/* For objects with large embedded keys, we reserve space for an expire field, + * so if expire is set later, we don't need to reallocate the object. */ +#define KEY_SIZE_TO_INCLUDE_EXPIRE_THRESHOLD 128 + /* ===================== Creation and parsing of objects ==================== */ -robj *createObject(int type, void *ptr) { - robj *o = zmalloc(sizeof(*o)); +/* Creates an object, optionally with embedded key and expire fields. The key + * and expire fields can be omitted by passing NULL and -1, respectively. */ +robj *createObjectWithKeyAndExpire(int type, void *ptr, const sds key, long long expire) { + /* Calculate sizes */ + int has_expire = (expire != -1 || + (key != NULL && sdslen(key) >= KEY_SIZE_TO_INCLUDE_EXPIRE_THRESHOLD)); + size_t key_sds_size = 0; + size_t min_size = sizeof(robj); + if (has_expire) { + min_size += sizeof(long long); + } + if (key != NULL) { + /* Size of embedded key, incl. 1 byte for prefixed sds hdr size. */ + key_sds_size = sdscopytobuffer(NULL, 0, key, NULL); + min_size += 1 + key_sds_size; + } + /* Allocate and set the declared fields. */ + size_t bufsize = 0; + robj *o = zmalloc_usable(min_size, &bufsize); o->type = type; o->encoding = OBJ_ENCODING_RAW; o->ptr = ptr; o->refcount = 1; o->lru = 0; + o->hasembkey = (key != NULL); + + /* If the allocation has enough space for an expire field, add it even if we + * don't need it now. Then we don't need to realloc if it's needed later. */ + if (key != NULL && !has_expire && bufsize >= min_size + sizeof(long long)) { + has_expire = 1; + min_size += sizeof(long long); + } + o->hasexpire = has_expire; + + /* The memory after the struct where we embedded data. */ + unsigned char *data = (void *)(o + 1); + + /* Set the expire field. 
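+     * A sketch of the embedded layout being written here (each field is
+     * present only when the corresponding flag is set):
+     *
+     *   [robj][expire: long long][key hdr-size byte][sds hdr + key + '\0']
+     *
+     * The key uses the two-call sdscopytobuffer() pattern: the earlier call
+     * with a NULL buffer computed the required size, and the call below
+     * copies the sds header and string while recording the header size in
+     * the one prefix byte, so objectGetKey() can recover the sds handle as
+     * (sds)(data + 1 + hdr_size).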
*/ + if (o->hasexpire) { + *(long long *)data = expire; + data += sizeof(long long); + } + + /* Copy embedded key. */ + if (o->hasembkey) { + sdscopytobuffer(data + 1, key_sds_size, key, data); + data += 1 + key_sds_size; + } + return o; } +robj *createObject(int type, void *ptr) { + return createObjectWithKeyAndExpire(type, ptr, NULL, -1); +} + void initObjectLRUOrLFU(robj *o) { if (o->refcount == OBJ_SHARED_REFCOUNT) return; /* Set the LRU to the current lruclock (minutes resolution), or @@ -88,39 +138,85 @@ robj *createRawStringObject(const char *ptr, size_t len) { return createObject(OBJ_STRING, sdsnewlen(ptr, len)); } -/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is - * an object where the sds string is actually an unmodifiable string - * allocated in the same chunk as the object itself. */ -robj *createEmbeddedStringObject(const char *ptr, size_t len) { - size_t bufsize = 0; - size_t sds_hdrlen = sizeof(struct sdshdr8); - robj *o = zmalloc_usable(sizeof(robj) + sds_hdrlen + len + 1, &bufsize); - struct sdshdr8 *sh = (void *)(o + 1); +/* Creates a new embedded string object and copies the content of key, val and + * expire to the new object. LRU is set to 0. */ +static robj *createEmbeddedStringObjectWithKeyAndExpire(const char *val_ptr, + size_t val_len, + const sds key, + long long expire) { + /* Calculate sizes */ + size_t key_sds_size = 0; + size_t min_size = sizeof(robj); + if (expire != -1) { + min_size += sizeof(long long); + } + if (key != NULL) { + /* Size of embedded key, incl. 1 byte for prefixed sds hdr size. */ + key_sds_size = sdscopytobuffer(NULL, 0, key, NULL); + min_size += 1 + key_sds_size; + } + /* Size of embedded value (EMBSTR) including \0 term. */ + min_size += sizeof(struct sdshdr8) + val_len + 1; + /* Allocate and set the declared fields. */ + size_t bufsize = 0; + robj *o = zmalloc_usable(min_size, &bufsize); o->type = OBJ_STRING; o->encoding = OBJ_ENCODING_EMBSTR; - o->ptr = sh + 1; o->refcount = 1; o->lru = 0; + o->hasexpire = (expire != -1); + o->hasembkey = (key != NULL); + + /* If the allocation has enough space for an expire field, add it even if we + * don't need it now. Then we don't need to realloc if it's needed later. */ + if (!o->hasexpire && bufsize >= min_size + sizeof(long long)) { + o->hasexpire = 1; + min_size += sizeof(long long); + } + + /* The memory after the struct where we embedded data. */ + unsigned char *data = (void *)(o + 1); - sh->len = len; - size_t usable = bufsize - (sizeof(robj) + sds_hdrlen + 1); - sh->alloc = usable; - /* Overflow check. This must not happen as we use embedded strings only - * for sds strings that fit into SDS_TYPE_8. */ - serverAssert(usable == sh->alloc); + /* Set the expire field. */ + if (o->hasexpire) { + *(long long *)data = expire; + data += sizeof(long long); + } + + /* Copy embedded key. */ + if (o->hasembkey) { + sdscopytobuffer(data + 1, key_sds_size, key, data); + data += 1 + key_sds_size; + } + + /* Copy embedded value (EMBSTR). */ + struct sdshdr8 *sh = (void *)data; sh->flags = SDS_TYPE_8; - if (ptr == SDS_NOINIT) - sh->buf[len] = '\0'; - else if (ptr) { - memcpy(sh->buf, ptr, len); - sh->buf[len] = '\0'; + sh->len = val_len; + size_t capacity = bufsize - (min_size - val_len); + sh->alloc = capacity; + serverAssert(capacity == sh->alloc); /* Overflow check. 
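+     * sdshdr8 keeps alloc in a single byte, so the capacity computed above
+     * (bufsize minus all non-value overhead) must fit in a uint8_t. For
+     * illustration, with hypothetical sizes bufsize = 64, min_size = 61 and
+     * val_len = 10, capacity = 64 - (61 - 10) = 13, which fits easily; the
+     * assert guards the general case.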
*/ + if (val_ptr == SDS_NOINIT) { + sh->buf[val_len] = '\0'; + } else if (val_ptr != NULL) { + memcpy(sh->buf, val_ptr, val_len); + sh->buf[val_len] = '\0'; } else { - memset(sh->buf, 0, len + 1); + memset(sh->buf, 0, val_len + 1); } + o->ptr = sh->buf; + return o; } +/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is + * an object where the sds string is actually an unmodifiable string + * allocated in the same chunk as the object itself. */ +robj *createEmbeddedStringObject(const char *ptr, size_t len) { + return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, NULL, -1); +} + /* Create a string object with EMBSTR encoding if it is smaller than * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is * used. @@ -135,6 +231,96 @@ robj *createStringObject(const char *ptr, size_t len) { return createRawStringObject(ptr, len); } +robj *createStringObjectWithKeyAndExpire(const char *ptr, size_t len, const sds key, long long expire) { + /* When to embed? Embed when the sum is up to 64 bytes. There may be better + * heuristics, e.g. we can look at the jemalloc sizes (16-byte intervals up + * to 128 bytes). */ + size_t size = sizeof(robj); + size += (key != NULL) * (sdslen(key) + 3); /* hdr size (1) + hdr (1) + nullterm (1) */ + size += (expire != -1) * sizeof(long long); + size += 4 + len; /* embstr header (3) + nullterm (1) */ + if (size <= 64) { + return createEmbeddedStringObjectWithKeyAndExpire(ptr, len, key, expire); + } else { + return createObjectWithKeyAndExpire(OBJ_STRING, sdsnewlen(ptr, len), key, expire); + } +} + +sds objectGetKey(const robj *val) { + unsigned char *data = (void *)(val + 1); + if (val->hasexpire) { + /* Skip expire field */ + data += sizeof(long long); + } + if (val->hasembkey) { + uint8_t hdr_size = *(uint8_t *)data; + data += 1 + hdr_size; + return (sds)data; + } + return NULL; +} + +long long objectGetExpire(const robj *val) { + unsigned char *data = (void *)(val + 1); + if (val->hasexpire) { + return *(long long *)data; + } else { + return -1; + } +} + +/* This functions may reallocate the value. The new allocation is returned and + * the old object's reference counter is decremented and possibly freed. Use the + * returned object instead of 'val' after calling this function. */ +robj *objectSetExpire(robj *val, long long expire) { + if (val->hasexpire) { + /* Update existing expire field. */ + unsigned char *data = (void *)(val + 1); + *(long long *)data = expire; + return val; + } else if (expire == -1) { + return val; + } else { + return objectSetKeyAndExpire(val, objectGetKey(val), expire); + } +} + +/* This functions may reallocate the value. The new allocation is returned and + * the old object's reference counter is decremented and possibly freed. Use the + * returned object instead of 'val' after calling this function. */ +robj *objectSetKeyAndExpire(robj *val, sds key, long long expire) { + if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_EMBSTR) { + robj *new = createStringObjectWithKeyAndExpire(val->ptr, sdslen(val->ptr), key, expire); + new->lru = val->lru; + decrRefCount(val); + return new; + } + + /* Create a new object with embedded key. Reuse ptr if possible. */ + void *ptr; + if (val->refcount == 1) { + /* Reuse the ptr. There are no other references to val. */ + ptr = val->ptr; + val->ptr = NULL; + } else if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_INT) { + /* The pointer is not allocated memory. We can just copy the pointer. 
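+         * With OBJ_ENCODING_INT the integer value lives in the ptr field
+         * itself (see tryObjectEncodingEx: o->ptr = (void *)value), so
+         * copying the pointer shares no heap memory between the objects.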
*/ + ptr = val->ptr; + } else if (val->type == OBJ_STRING && val->encoding == OBJ_ENCODING_RAW) { + /* Dup the string. */ + ptr = sdsdup(val->ptr); + } else { + serverAssert(val->type != OBJ_STRING); + /* There are multiple references to this non-string object. Most types + * can be duplicated, but for a module type is not always possible. */ + serverPanic("Not implemented"); + } + robj *new = createObjectWithKeyAndExpire(val->type, ptr, key, expire); + new->encoding = val->encoding; + new->lru = val->lru; + decrRefCount(val); + return new; +} + /* Same as CreateRawStringObject, can return NULL if allocation fails */ robj *tryCreateRawStringObject(const char *ptr, size_t len) { sds str = sdstrynewlen(ptr, len); @@ -179,18 +365,10 @@ robj *createStringObjectFromLongLong(long long value) { return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); } -/* The function avoids returning a shared integer when LFU/LRU info - * are needed, that is, when the object is used as a value in the key - * space(for instance when the INCR command is used), and the server is - * configured to evict based on LFU/LRU, so we want LFU/LRU values - * specific for each key. */ +/* The function doesn't return a shared integer when the object is used as a + * value in the key space (for instance when the INCR command is used). */ robj *createStringObjectFromLongLongForValue(long long value) { - if (server.maxmemory == 0 || !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS)) { - /* If the maxmemory policy permits, we can still return shared integers */ - return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_AUTO); - } else { - return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_SHARED); - } + return createStringObjectFromLongLongWithOptions(value, LL2STROBJ_NO_SHARED); } /* Create a string object that contains an sds inside it. That means it can't be @@ -381,15 +559,17 @@ void incrRefCount(robj *o) { void decrRefCount(robj *o) { if (o->refcount == 1) { - switch (o->type) { - case OBJ_STRING: freeStringObject(o); break; - case OBJ_LIST: freeListObject(o); break; - case OBJ_SET: freeSetObject(o); break; - case OBJ_ZSET: freeZsetObject(o); break; - case OBJ_HASH: freeHashObject(o); break; - case OBJ_MODULE: freeModuleObject(o); break; - case OBJ_STREAM: freeStreamObject(o); break; - default: serverPanic("Unknown object type"); break; + if (o->ptr != NULL) { + switch (o->type) { + case OBJ_STRING: freeStringObject(o); break; + case OBJ_LIST: freeListObject(o); break; + case OBJ_SET: freeSetObject(o); break; + case OBJ_ZSET: freeZsetObject(o); break; + case OBJ_HASH: freeHashObject(o); break; + case OBJ_MODULE: freeModuleObject(o); break; + case OBJ_STREAM: freeStreamObject(o); break; + default: serverPanic("Unknown object type"); break; + } } zfree(o); } else { @@ -579,13 +759,6 @@ void dismissObject(robj *o, size_t size_hint) { #endif } -/* This variant of decrRefCount() gets its argument as void, and is useful - * as free method in data structures that expect a 'void free_object(void*)' - * prototype for the free method. */ -void decrRefCountVoid(void *o) { - decrRefCount(o); -} - int checkType(client *c, robj *o, int type) { /* A NULL is considered an empty key */ if (o && o->type != type) { @@ -653,23 +826,15 @@ robj *tryObjectEncodingEx(robj *o, int try_trim) { * representable as a 32 nor 64 bit integer. */ len = sdslen(s); if (len <= 20 && string2l(s, len, &value)) { - /* This object is encodable as a long. Try to use a shared object. 
- * Note that we avoid using shared integers when maxmemory is used - * because every object needs to have a private LRU field for the LRU - * algorithm to work well. */ - if (canUseSharedObject() && value >= 0 && value < OBJ_SHARED_INTEGERS) { + /* This object is encodable as a long. */ + if (o->encoding == OBJ_ENCODING_RAW) { + sdsfree(o->ptr); + o->encoding = OBJ_ENCODING_INT; + o->ptr = (void *)value; + return o; + } else if (o->encoding == OBJ_ENCODING_EMBSTR) { decrRefCount(o); - return shared.integers[value]; - } else { - if (o->encoding == OBJ_ENCODING_RAW) { - sdsfree(o->ptr); - o->encoding = OBJ_ENCODING_INT; - o->ptr = (void *)value; - return o; - } else if (o->encoding == OBJ_ENCODING_EMBSTR) { - decrRefCount(o); - return createStringObjectFromLongLongForValue(value); - } + return createStringObjectFromLongLongForValue(value); } } @@ -1199,7 +1364,7 @@ struct serverMemOverhead *getMemoryOverheadData(void) { for (j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; - if (!kvstoreNumAllocatedDicts(db->keys)) continue; + if (!kvstoreNumAllocatedHashtables(db->keys)) continue; unsigned long long keyscount = kvstoreSize(db->keys); @@ -1221,8 +1386,8 @@ struct serverMemOverhead *getMemoryOverheadData(void) { mh->overhead_db_hashtable_lut += kvstoreOverheadHashtableLut(db->expires); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->keys); mh->overhead_db_hashtable_rehashing += kvstoreOverheadHashtableRehashing(db->expires); - mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->keys); - mh->db_dict_rehashing_count += kvstoreDictRehashingCount(db->expires); + mh->db_dict_rehashing_count += kvstoreHashtableRehashingCount(db->keys); + mh->db_dict_rehashing_count += kvstoreHashtableRehashingCount(db->expires); } mh->overhead_total = mem_total; @@ -1520,7 +1685,6 @@ void memoryCommand(client *c) { }; addReplyHelp(c, help); } else if (!strcasecmp(c->argv[1]->ptr, "usage") && c->argc >= 3) { - dictEntry *de; long long samples = OBJ_COMPUTE_SIZE_DEF_SAMPLES; for (int j = 3; j < c->argc; j++) { if (!strcasecmp(c->argv[j]->ptr, "samples") && j + 1 < c->argc) { @@ -1536,12 +1700,12 @@ void memoryCommand(client *c) { return; } } - if ((de = dbFind(c->db, c->argv[2]->ptr)) == NULL) { + robj *obj = dbFind(c->db, c->argv[2]->ptr); + if (obj == NULL) { addReplyNull(c); return; } - size_t usage = objectComputeSize(c->argv[2], dictGetVal(de), samples, c->db->id); - usage += dictEntryMemUsage(de); + size_t usage = objectComputeSize(c->argv[2], obj, samples, c->db->id); addReplyLongLong(c, usage); } else if (!strcasecmp(c->argv[1]->ptr, "stats") && c->argc == 2) { struct serverMemOverhead *mh = getMemoryOverheadData(); diff --git a/src/pubsub.c b/src/pubsub.c index 5b037b5721..3781fa39aa 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -258,7 +258,6 @@ void unmarkClientAsPubSub(client *c) { /* Subscribe a client to a channel. Returns 1 if the operation succeeded, or * 0 if the client was already subscribed to that channel. 
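 * The channel registry is now a hashtable of per-channel clients dicts.
 * Insertion is two-phase: find the bucket position first, and only if no
 * entry exists allocate the clients dict (stashing the channel robj in
 * its metadata) and insert it at the found position, avoiding a second
 * lookup.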
*/ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { - dictEntry *de, *existing; dict *clients = NULL; int retval = 0; unsigned int slot = 0; @@ -272,15 +271,18 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { slot = getKeySlot(channel->ptr); } - de = kvstoreDictAddRaw(*type.serverPubSubChannels, slot, channel, &existing); - - if (existing) { - clients = dictGetVal(existing); - channel = dictGetKey(existing); + hashtablePosition pos; + void *existing; + if (!kvstoreHashtableFindPositionForInsert(*type.serverPubSubChannels, slot, channel, &pos, &existing)) { + clients = existing; + channel = *(robj **)dictMetadata(clients); } else { + /* Store pointer to channel name in the dict's metadata. */ clients = dictCreate(&clientDictType); - kvstoreDictSetVal(*type.serverPubSubChannels, slot, de, clients); + *(robj **)dictMetadata(clients) = channel; incrRefCount(channel); + /* Insert this dict in the kvstore at the position returned above. */ + kvstoreHashtableInsertAtPosition(*type.serverPubSubChannels, slot, clients, &pos); } serverAssert(dictAdd(clients, c, NULL) != DICT_ERR); @@ -295,7 +297,6 @@ int pubsubSubscribeChannel(client *c, robj *channel, pubsubtype type) { /* Unsubscribe a client from a channel. Returns 1 if the operation succeeded, or * 0 if the client was not subscribed to the specified channel. */ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype type) { - dictEntry *de; dict *clients; int retval = 0; int slot = 0; @@ -309,15 +310,16 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty if (server.cluster_enabled && type.shard) { slot = getKeySlot(channel->ptr); } - de = kvstoreDictFind(*type.serverPubSubChannels, slot, channel); - serverAssertWithInfo(c, NULL, de != NULL); - clients = dictGetVal(de); + void *found; + kvstoreHashtableFind(*type.serverPubSubChannels, slot, channel, &found); + serverAssertWithInfo(c, NULL, found); + clients = found; serverAssertWithInfo(c, NULL, dictDelete(clients, c) == DICT_OK); if (dictSize(clients) == 0) { /* Free the dict and associated hash entry at all if this was * the latest client, so that it will be possible to abuse * PUBSUB creating millions of channels. */ - kvstoreDictDelete(*type.serverPubSubChannels, slot, channel); + kvstoreHashtableDelete(*type.serverPubSubChannels, slot, channel); } } /* Notify the client */ @@ -330,13 +332,13 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty /* Unsubscribe all shard channels in a slot. */ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { - if (!kvstoreDictSize(server.pubsubshard_channels, slot)) return; + if (!kvstoreHashtableSize(server.pubsubshard_channels, slot)) return; - kvstoreDictIterator *kvs_di = kvstoreGetDictSafeIterator(server.pubsubshard_channels, slot); - dictEntry *de; - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - robj *channel = dictGetKey(de); - dict *clients = dictGetVal(de); + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableSafeIterator(server.pubsubshard_channels, slot); + void *element; + while (kvstoreHashtableIteratorNext(kvs_di, &element)) { + dict *clients = element; + robj *channel = *(robj **)dictMetadata(clients); /* For each client subscribed to the channel, unsubscribe it. 
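 * A safe iterator is used deliberately: it tolerates entries being
 * deleted from the table mid-iteration, which the
 * kvstoreHashtableDelete() call at the bottom of this loop relies on.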
*/ dictIterator *iter = dictGetIterator(clients); dictEntry *entry; @@ -352,9 +354,9 @@ void pubsubShardUnsubscribeAllChannelsInSlot(unsigned int slot) { } } dictReleaseIterator(iter); - kvstoreDictDelete(server.pubsubshard_channels, slot, channel); + kvstoreHashtableDelete(server.pubsubshard_channels, slot, channel); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } /* Subscribe a client to a pattern. Returns 1 if the operation succeeded, or 0 if the client was already subscribed to @@ -474,6 +476,7 @@ int pubsubUnsubscribeAllPatterns(client *c, int notify) { */ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) { int receivers = 0; + void *element; dictEntry *de; dictIterator *di; int slot = -1; @@ -482,9 +485,8 @@ int pubsubPublishMessageInternal(robj *channel, robj *message, pubsubtype type) if (server.cluster_enabled && type.shard) { slot = keyHashSlot(channel->ptr, sdslen(channel->ptr)); } - de = kvstoreDictFind(*type.serverPubSubChannels, (slot == -1) ? 0 : slot, channel); - if (de) { - dict *clients = dictGetVal(de); + if (kvstoreHashtableFind(*type.serverPubSubChannels, (slot == -1) ? 0 : slot, channel, &element)) { + dict *clients = element; dictEntry *entry; dictIterator *iter = dictGetIterator(clients); while ((entry = dictNext(iter)) != NULL) { @@ -650,8 +652,9 @@ void pubsubCommand(client *c) { addReplyArrayLen(c, (c->argc - 2) * 2); for (j = 2; j < c->argc; j++) { - dict *d = kvstoreDictFetchValue(server.pubsub_channels, 0, c->argv[j]); - + void *found = NULL; + kvstoreHashtableFind(server.pubsub_channels, 0, c->argv[j], &found); + dict *d = found; addReplyBulk(c, c->argv[j]); addReplyLongLong(c, d ? dictSize(d) : 0); } @@ -669,8 +672,9 @@ void pubsubCommand(client *c) { for (j = 2; j < c->argc; j++) { sds key = c->argv[j]->ptr; unsigned int slot = server.cluster_enabled ? keyHashSlot(key, (int)sdslen(key)) : 0; - dict *clients = kvstoreDictFetchValue(server.pubsubshard_channels, slot, c->argv[j]); - + void *found = NULL; + kvstoreHashtableFind(server.pubsubshard_channels, slot, c->argv[j], &found); + dict *clients = found; addReplyBulk(c, c->argv[j]); addReplyLongLong(c, clients ? 
dictSize(clients) : 0); } @@ -682,15 +686,16 @@ void pubsubCommand(client *c) { void channelList(client *c, sds pat, kvstore *pubsub_channels) { long mblen = 0; void *replylen; - unsigned int slot_cnt = kvstoreNumDicts(pubsub_channels); + unsigned int slot_cnt = kvstoreNumHashtables(pubsub_channels); replylen = addReplyDeferredLen(c); for (unsigned int i = 0; i < slot_cnt; i++) { - if (!kvstoreDictSize(pubsub_channels, i)) continue; - kvstoreDictIterator *kvs_di = kvstoreGetDictIterator(pubsub_channels, i); - dictEntry *de; - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - robj *cobj = dictGetKey(de); + if (!kvstoreHashtableSize(pubsub_channels, i)) continue; + kvstoreHashtableIterator *kvs_di = kvstoreGetHashtableIterator(pubsub_channels, i); + void *next; + while (kvstoreHashtableIteratorNext(kvs_di, &next)) { + dict *clients = next; + robj *cobj = *(robj **)dictMetadata(clients); sds channel = cobj->ptr; if (!pat || stringmatchlen(pat, sdslen(pat), channel, sdslen(channel), 0)) { @@ -698,7 +703,7 @@ void channelList(client *c, sds pat, kvstore *pubsub_channels) { mblen++; } } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); } setDeferredArrayLen(c, replylen, mblen); } diff --git a/src/rdb.c b/src/rdb.c index ca904f7f98..6e990736bc 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1321,7 +1321,6 @@ ssize_t rdbSaveFunctions(rio *rdb) { } ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { - dictEntry *de; ssize_t written = 0; ssize_t res; kvstoreIterator *kvs_it = NULL; @@ -1350,12 +1349,14 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { kvs_it = kvstoreIteratorInit(db->keys); int last_slot = -1; /* Iterate this DB writing every entry */ - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - int curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; + int curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); /* Save slot info. */ if (server.cluster_enabled && curr_slot != last_slot) { - sds slot_info = sdscatprintf(sdsempty(), "%i,%lu,%lu", curr_slot, kvstoreDictSize(db->keys, curr_slot), - kvstoreDictSize(db->expires, curr_slot)); + sds slot_info = sdscatprintf(sdsempty(), "%i,%lu,%lu", curr_slot, kvstoreHashtableSize(db->keys, curr_slot), + kvstoreHashtableSize(db->expires, curr_slot)); if ((res = rdbSaveAuxFieldStrStr(rdb, "slot-info", slot_info)) < 0) { sdsfree(slot_info); goto werr; @@ -1364,8 +1365,8 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter) { last_slot = curr_slot; sdsfree(slot_info); } - sds keystr = dictGetKey(de); - robj key, *o = dictGetVal(de); + sds keystr = objectGetKey(o); + robj key; long long expire; size_t rdb_bytes_before_key = rdb->processed_bytes; @@ -3146,8 +3147,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (server.cluster_enabled) { /* In cluster mode we resize individual slot specific dictionaries based on the number of keys that * slot holds. 
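 * Pre-sizing each slot's table to the key count recorded in the
 * slot-info aux field lets the load path avoid incremental rehashing:
 * roughly, a slot announcing N keys gets a table sized for N entries
 * up front.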
*/ - kvstoreDictExpand(db->keys, slot_id, slot_size); - kvstoreDictExpand(db->expires, slot_id, expires_slot_size); + kvstoreHashtableExpand(db->keys, slot_id, slot_size); + kvstoreHashtableExpand(db->expires, slot_id, expires_slot_size); should_expand_db = 0; } } else { @@ -3305,7 +3306,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin initStaticStringObject(keyobj, key); /* Add the new object in the hash table */ - int added = dbAddRDBLoad(db, key, val); + int added = dbAddRDBLoad(db, key, &val); server.rdb_last_load_keys_loaded++; if (!added) { if (rdbflags & RDBFLAGS_ALLOW_DUP) { @@ -3313,7 +3314,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin * When it's set we allow new keys to replace the current * keys with the same name. */ dbSyncDelete(db, &keyobj); - dbAddRDBLoad(db, key, val); + added = dbAddRDBLoad(db, key, &val); + serverAssert(added); } else { serverLog(LL_WARNING, "RDB has duplicated key '%s' in DB %d", key, db->id); serverPanic("Duplicated key found in RDB file"); @@ -3322,7 +3324,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin /* Set the expire time if needed */ if (expiretime != -1) { - setExpire(NULL, db, &keyobj, expiretime); + val = setExpire(NULL, db, &keyobj, expiretime); } /* Set usage information (for eviction). */ diff --git a/src/replication.c b/src/replication.c index d17199bfc3..b5ce77f5e0 100644 --- a/src/replication.c +++ b/src/replication.c @@ -1886,7 +1886,7 @@ void replicationSendNewlineToPrimary(void) { /* Callback used by emptyData() while flushing away old data to load * the new dataset received by the primary and by discardTempDb() * after loading succeeded or failed. */ -void replicationEmptyDbCallback(dict *d) { +void replicationEmptyDbCallback(hashtable *d) { UNUSED(d); if (server.repl_state == REPL_STATE_TRANSFER) replicationSendNewlineToPrimary(); } diff --git a/src/sds.c b/src/sds.c index ee7a2c0f97..97be74ea47 100644 --- a/src/sds.c +++ b/src/sds.c @@ -194,12 +194,12 @@ sds sdsdup(const sds s) { /* * This method returns the minimum amount of bytes required to store the sds (header + data + NULL terminator). */ -static inline size_t sdsminlen(sds s) { +static inline size_t sdsminlen(const sds s) { return sdslen(s) + sdsHdrSize(s[-1]) + 1; } /* This method copies the sds `s` into `buf` which is the target character buffer. */ -size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size) { +size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, const sds s, uint8_t *hdr_size) { size_t required_keylen = sdsminlen(s); if (buf == NULL) { return required_keylen; @@ -216,6 +216,13 @@ void sdsfree(sds s) { s_free_with_size(sdsAllocPtr(s), sdsAllocSize(s)); } +/* This variant of sdsfree() gets its argument as void, and is useful + * as free method in data structures that expect a 'void free_object(void*)' + * prototype for the free method. */ +void sdsfreeVoid(void *s) { + sdsfree(s); +} + /* Set the sds string length to the length as obtained with strlen(), so * considering as content only up to the first null term character. 
* diff --git a/src/sds.h b/src/sds.h index e9c4a95f9a..e1b8531955 100644 --- a/src/sds.h +++ b/src/sds.h @@ -183,6 +183,7 @@ sds sdsempty(void); sds sdsdup(const sds s); size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size); void sdsfree(sds s); +void sdsfreeVoid(void *s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); diff --git a/src/server.c b/src/server.c index 6a1309e68f..1e38b5ac69 100644 --- a/src/server.c +++ b/src/server.c @@ -494,21 +494,20 @@ uint64_t dictEncObjHash(const void *key) { } } -/* Return 1 if currently we allow dict to expand. Dict may allocate huge - * memory to contain hash buckets when dict expands, that may lead the server to - * reject user's requests or evict some keys, we can stop dict to expand - * provisionally if used memory will be over maxmemory after dict expands, - * but to guarantee the performance of the server, we still allow dict to expand - * if dict load factor exceeds HASHTABLE_MAX_LOAD_FACTOR. */ -int dictResizeAllowed(size_t moreMem, double usedRatio) { - /* for debug purposes: dict is not allowed to be resized. */ +/* Return 1 if we allow a hash table to expand. It may allocate a huge amount of + * memory to contain hash buckets when it expands, that may lead the server to + * reject user's requests or evict some keys. We can prevent expansion + * provisionally if used memory will be over maxmemory after it expands, + * but to guarantee the performance of the server, we still allow it to expand + * if the load factor exceeds the hard limit defined in hashtable.c. */ +int hashtableResizeAllowed(size_t moreMem, double usedRatio) { + UNUSED(usedRatio); + + /* For debug purposes, not allowed to be resized. */ if (!server.dict_resizing) return 0; - if (usedRatio <= HASHTABLE_MAX_LOAD_FACTOR) { - return !overMaxmemoryAfterAlloc(moreMem); - } else { - return 1; - } + /* Avoid resizing over max memory. */ + return !overMaxmemoryAfterAlloc(moreMem); } const void *hashtableCommandGetKey(const void *element) { @@ -565,32 +564,53 @@ dictType zsetDictType = { NULL, /* allow to expand */ }; +uint64_t hashtableSdsHash(const void *key) { + return hashtableGenHashFunction((const char *)key, sdslen((char *)key)); +} + +const void *hashtableObjectGetKey(const void *entry) { + return objectGetKey(entry); +} + +int hashtableSdsKeyCompare(const void *key1, const void *key2) { + const sds sds1 = (const sds)key1, sds2 = (const sds)key2; + return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); +} + +int hashtableObjKeyCompare(const void *key1, const void *key2) { + const robj *o1 = key1, *o2 = key2; + return hashtableSdsKeyCompare(o1->ptr, o2->ptr); +} + +void hashtableObjectDestructor(void *val) { + if (val == NULL) return; /* Lazy freeing will set value to NULL. */ + decrRefCount(val); +} + /* Kvstore->keys, keys are sds strings, vals are Objects. 
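 * With the hashtable the robj itself is the stored entry: there is no
 * separate key copy, entryGetKey derives the key from the object's
 * embedded key (objectGetKey), and a lookup hashes the sds key and
 * compares it against that derived key via keyCompare.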
*/ -dictType kvstoreKeysDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key is embedded in the dictEntry and freed internally */ - dictObjectDestructor, /* val destructor */ - dictResizeAllowed, /* allow to resize */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, - .embedKey = dictSdsEmbedKey, - .embedded_entry = 1, +hashtableType kvstoreKeysHashtableType = { + .entryGetKey = hashtableObjectGetKey, + .hashFunction = hashtableSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = hashtableObjectDestructor, + .resizeAllowed = hashtableResizeAllowed, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; /* Kvstore->expires */ -dictType kvstoreExpiresDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - dictResizeAllowed, /* allow to resize */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, +hashtableType kvstoreExpiresHashtableType = { + .entryGetKey = hashtableObjectGetKey, + .hashFunction = hashtableSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = NULL, /* shared with keyspace table */ + .resizeAllowed = hashtableResizeAllowed, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; /* Command set, hashed by sds string, stores serverCommand structs. */ @@ -648,18 +668,33 @@ dictType objToDictDictType = { NULL /* allow to expand */ }; -/* Same as objToDictDictType, added some kvstore callbacks, it's used - * for PUBSUB command to track clients subscribing the channels. */ -dictType kvstoreChannelDictType = { - dictObjHash, /* hash function */ - NULL, /* key dup */ - dictObjKeyCompare, /* key compare */ - dictObjectDestructor, /* key destructor */ - dictDictDestructor, /* val destructor */ - NULL, /* allow to expand */ - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize, +/* Callback used for hash tables where the entries are dicts and the key + * (channel name) is stored in each dict's metadata. */ +const void *hashtableChannelsDictGetKey(const void *entry) { + const dict *d = entry; + return *((const void **)dictMetadata(d)); +} + +void hashtableChannelsDictDestructor(void *entry) { + dict *d = entry; + robj *channel = *((void **)dictMetadata(d)); + decrRefCount(channel); + dictRelease(d); +} + +/* Similar to objToDictDictType, but changed to hashtable and added some kvstore + * callbacks, it's used for PUBSUB command to track clients subscribing the + * channels. The elements are dicts where the keys are clients. The metadata in + * each dict stores a pointer to the channel name. 
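+ * Ownership sketch: subscribing stores the channel robj (with its
+ * refcount bumped) in the clients dict's metadata; entryGetKey reads it
+ * back for hashing and comparison, and the entry destructor both
+ * decrefs the channel and releases the dict.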
*/ +hashtableType kvstoreChannelHashtableType = { + .entryGetKey = hashtableChannelsDictGetKey, + .hashFunction = dictObjHash, + .keyCompare = hashtableObjKeyCompare, + .entryDestructor = hashtableChannelsDictDestructor, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, }; /* Modules system dictionary type. Keys are module name, @@ -716,12 +751,18 @@ dictType sdsHashDictType = { NULL /* allow to expand */ }; +size_t clientSetDictTypeMetadataBytes(dict *d) { + UNUSED(d); + return sizeof(void *); +} + /* Client Set dictionary type. Keys are client, values are not used. */ dictType clientDictType = { dictClientHash, /* hash function */ NULL, /* key dup */ dictClientKeyCompare, /* key compare */ - .no_value = 1 /* no values in this dict */ + .dictMetadataBytes = clientSetDictTypeMetadataBytes, + .no_value = 1 /* no values in this dict */ }; /* This function is called once a background process of some kind terminates, @@ -731,12 +772,16 @@ dictType clientDictType = { * for dict.c to resize or rehash the tables accordingly to the fact we have an * active fork child running. */ void updateDictResizePolicy(void) { - if (server.in_fork_child != CHILD_TYPE_NONE) + if (server.in_fork_child != CHILD_TYPE_NONE) { dictSetResizeEnabled(DICT_RESIZE_FORBID); - else if (hasActiveChildProcess()) + hashtableSetResizePolicy(HASHTABLE_RESIZE_FORBID); + } else if (hasActiveChildProcess()) { dictSetResizeEnabled(DICT_RESIZE_AVOID); - else + hashtableSetResizePolicy(HASHTABLE_RESIZE_AVOID); + } else { dictSetResizeEnabled(DICT_RESIZE_ENABLE); + hashtableSetResizePolicy(HASHTABLE_RESIZE_ALLOW); + } } const char *strChildType(int type) { @@ -1176,8 +1221,8 @@ void databasesCron(void) { for (j = 0; j < dbs_per_call; j++) { serverDb *db = &server.db[resize_db % server.dbnum]; - kvstoreTryResizeDicts(db->keys, CRON_DICTS_PER_DB); - kvstoreTryResizeDicts(db->expires, CRON_DICTS_PER_DB); + kvstoreTryResizeHashtables(db->keys, CRON_DICTS_PER_DB); + kvstoreTryResizeHashtables(db->expires, CRON_DICTS_PER_DB); resize_db++; } @@ -2706,14 +2751,14 @@ void initServer(void) { /* Create the databases, and initialize other internal state. 
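 * As before, the kvstore flags keep per-slot tables cheap: tables for
 * the up-to-16384 cluster slots are allocated only on first use and, in
 * cluster mode, freed again once a slot becomes empty.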
*/ int slot_count_bits = 0; - int flags = KVSTORE_ALLOCATE_DICTS_ON_DEMAND; + int flags = KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND; if (server.cluster_enabled) { slot_count_bits = CLUSTER_SLOT_MASK_BITS; - flags |= KVSTORE_FREE_EMPTY_DICTS; + flags |= KVSTORE_FREE_EMPTY_HASHTABLES; } for (j = 0; j < server.dbnum; j++) { - server.db[j].keys = kvstoreCreate(&kvstoreKeysDictType, slot_count_bits, flags); - server.db[j].expires = kvstoreCreate(&kvstoreExpiresDictType, slot_count_bits, flags); + server.db[j].keys = kvstoreCreate(&kvstoreKeysHashtableType, slot_count_bits, flags); + server.db[j].expires = kvstoreCreate(&kvstoreExpiresHashtableType, slot_count_bits, flags); server.db[j].expires_cursor = 0; server.db[j].blocking_keys = dictCreate(&keylistDictType); server.db[j].blocking_keys_unblock_on_nokey = dictCreate(&objectKeyPointerValueDictType); @@ -2726,10 +2771,10 @@ void initServer(void) { /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which * seems odd) just to make the code cleaner by making it be the same type as server.pubsubshard_channels * (which has to be kvstore), see pubsubtype.serverPubSubChannels */ - server.pubsub_channels = kvstoreCreate(&kvstoreChannelDictType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + server.pubsub_channels = kvstoreCreate(&kvstoreChannelHashtableType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); server.pubsub_patterns = dictCreate(&objToDictDictType); - server.pubsubshard_channels = kvstoreCreate(&kvstoreChannelDictType, slot_count_bits, - KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + server.pubsubshard_channels = kvstoreCreate(&kvstoreChannelHashtableType, slot_count_bits, + KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); server.pubsub_clients = 0; server.watching_clients = 0; server.cronloops = 0; @@ -6815,6 +6860,7 @@ __attribute__((weak)) int main(int argc, char **argv) { uint8_t hashseed[16]; getRandomBytes(hashseed, sizeof(hashseed)); dictSetHashFunctionSeed(hashseed); + hashtableSetHashFunctionSeed(hashseed); char *exec_name = strrchr(argv[0], '/'); if (exec_name == NULL) exec_name = argv[0]; diff --git a/src/server.h b/src/server.h index 4a7d4777c8..14a16593b0 100644 --- a/src/server.h +++ b/src/server.h @@ -210,9 +210,6 @@ struct hdr_histogram; extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; -/* Hash table parameters */ -#define HASHTABLE_MAX_LOAD_FACTOR 1.618 /* Maximum hash table load factor. */ - /* Command flags. Please check the definition of struct serverCommand in this file * for more information about the meaning of every flag. */ #define CMD_WRITE (1ULL << 0) @@ -881,8 +878,9 @@ struct ValkeyModuleDigest { #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ #define LRU_CLOCK_RESOLUTION 1000 /* LRU clock resolution in ms */ -#define OBJ_SHARED_REFCOUNT INT_MAX /* Global object never destroyed. */ -#define OBJ_STATIC_REFCOUNT (INT_MAX - 1) /* Object allocated in the stack. */ +#define OBJ_REFCOUNT_BITS 30 +#define OBJ_SHARED_REFCOUNT ((1 << OBJ_REFCOUNT_BITS) - 1) /* Global object never destroyed. */ +#define OBJ_STATIC_REFCOUNT ((1 << OBJ_REFCOUNT_BITS) - 2) /* Object allocated in the stack. */ #define OBJ_FIRST_SPECIAL_REFCOUNT OBJ_STATIC_REFCOUNT struct serverObject { unsigned type : 4; @@ -890,7 +888,9 @@ struct serverObject { unsigned lru : LRU_BITS; /* LRU time (relative to global lru_clock) or * LFU data (least significant 8 bits frequency * and most significant 16 bits access time). 
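 * With LRU_BITS = 24 the bit fields below pack into exactly 64 bits:
 * 4 (type) + 4 (encoding) + 24 (lru) + 1 (hasexpire) + 1 (hasembkey) +
 * 30 (refcount) = 64, so the two new flags cost no extra header space;
 * they are paid for by shrinking refcount from 32 to 30 bits.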
*/ - int refcount; + unsigned hasexpire : 1; + unsigned hasembkey : 1; + unsigned refcount : OBJ_REFCOUNT_BITS; void *ptr; }; @@ -908,6 +908,8 @@ char *getObjectTypeName(robj *); _var.refcount = OBJ_STATIC_REFCOUNT; \ _var.type = OBJ_STRING; \ _var.encoding = OBJ_ENCODING_RAW; \ + _var.hasexpire = 0; \ + _var.hasembkey = 0; \ _var.ptr = _ptr; \ } while (0) @@ -2667,8 +2669,8 @@ extern dictType objectKeyHeapPointerValueDictType; extern dictType setDictType; extern dictType BenchmarkDictType; extern dictType zsetDictType; -extern dictType kvstoreKeysDictType; -extern dictType kvstoreExpiresDictType; +extern hashtableType kvstoreKeysHashtableType; +extern hashtableType kvstoreExpiresHashtableType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; extern dictType hashDictType; extern dictType stringSetDictType; @@ -2676,7 +2678,7 @@ extern dictType externalStringType; extern dictType sdsHashDictType; extern dictType clientDictType; extern dictType objToDictDictType; -extern dictType kvstoreChannelDictType; +extern hashtableType kvstoreChannelHashtableType; extern dictType modulesDictType; extern dictType sdsReplyDictType; extern dictType keylistDictType; @@ -3000,7 +3002,6 @@ void execCommandAbort(client *c, sds error); /* Object implementation */ void decrRefCount(robj *o); -void decrRefCountVoid(void *o); void incrRefCount(robj *o); robj *makeObjectShared(robj *o); void freeStringObject(robj *o); @@ -3013,7 +3014,6 @@ robj *createObject(int type, void *ptr); void initObjectLRUOrLFU(robj *o); robj *createStringObject(const char *ptr, size_t len); robj *createRawStringObject(const char *ptr, size_t len); -robj *createEmbeddedStringObject(const char *ptr, size_t len); robj *tryCreateRawStringObject(const char *ptr, size_t len); robj *tryCreateStringObject(const char *ptr, size_t len); robj *dupStringObject(const robj *o); @@ -3054,11 +3054,15 @@ int collateStringObjects(const robj *a, const robj *b); int equalStringObjects(robj *a, robj *b); unsigned long long estimateObjectIdleTime(robj *o); void trimStringObjectIfNeeded(robj *o, int trim_small_values); -static inline int canUseSharedObject(void) { - return server.maxmemory == 0 || !(server.maxmemory_policy & MAXMEMORY_FLAG_NO_SHARED_INTEGERS); -} #define sdsEncodedObject(objptr) (objptr->encoding == OBJ_ENCODING_RAW || objptr->encoding == OBJ_ENCODING_EMBSTR) +/* Objects with key attached, AKA valkey (val+key) objects */ +robj *createObjectWithKeyAndExpire(int type, void *ptr, const sds key, long long expire); +robj *objectSetKeyAndExpire(robj *val, sds key, long long expire); +robj *objectSetExpire(robj *val, long long expire); +sds objectGetKey(const robj *val); +long long objectGetExpire(const robj *val); + /* Synchronous I/O with timeout */ ssize_t syncWrite(int fd, char *ptr, ssize_t size, long long timeout); ssize_t syncRead(int fd, char *ptr, ssize_t size, long long timeout); @@ -3384,10 +3388,10 @@ int calculateKeySlot(sds key); /* kvstore wrappers */ int dbExpand(serverDb *db, uint64_t db_size, int try_expand); int dbExpandExpires(serverDb *db, uint64_t db_size, int try_expand); -dictEntry *dbFind(serverDb *db, void *key); -dictEntry *dbFindExpires(serverDb *db, void *key); +robj *dbFind(serverDb *db, sds key); +robj *dbFindExpires(serverDb *db, sds key); unsigned long long dbSize(serverDb *db); -unsigned long long dbScan(serverDb *db, unsigned long long cursor, dictScanFunction *scan_cb, void *privdata); +unsigned long long dbScan(serverDb *db, unsigned long long cursor, hashtableScanFunction scan_cb, void *privdata); /* Set data 
type */ robj *setTypeCreate(sds value, size_t size_hint); @@ -3546,7 +3550,7 @@ void deleteExpiredKeyFromOverwriteAndPropagate(client *c, robj *keyobj); void propagateDeletion(serverDb *db, robj *key, int lazy); int keyIsExpired(serverDb *db, robj *key); long long getExpire(serverDb *db, robj *key); -void setExpire(client *c, serverDb *db, robj *key, long long when); +robj *setExpire(client *c, serverDb *db, robj *key, long long when); int checkAlreadyExpired(long long when); robj *lookupKeyRead(serverDb *db, robj *key); robj *lookupKeyWrite(serverDb *db, robj *key); @@ -3566,16 +3570,16 @@ int objectSetLRUOrLFU(robj *val, long long lfu_freq, long long lru_idle, long lo #define LOOKUP_NOEFFECTS \ (LOOKUP_NONOTIFY | LOOKUP_NOSTATS | LOOKUP_NOTOUCH | LOOKUP_NOEXPIRE) /* Avoid any effects from fetching the key */ -void dbAdd(serverDb *db, robj *key, robj *val); -int dbAddRDBLoad(serverDb *db, sds key, robj *val); -void dbReplaceValue(serverDb *db, robj *key, robj *val); +void dbAdd(serverDb *db, robj *key, robj **valref); +int dbAddRDBLoad(serverDb *db, sds key, robj **valref); +void dbReplaceValue(serverDb *db, robj *key, robj **valref); #define SETKEY_KEEPTTL 1 #define SETKEY_NO_SIGNAL 2 #define SETKEY_ALREADY_EXIST 4 #define SETKEY_DOESNT_EXIST 8 #define SETKEY_ADD_OR_UPDATE 16 /* Key most likely doesn't exists */ -void setKey(client *c, serverDb *db, robj *key, robj *val, int flags); +void setKey(client *c, serverDb *db, robj *key, robj **valref, int flags); robj *dbRandomKey(serverDb *db); int dbGenericDelete(serverDb *db, robj *key, int async, int flags); int dbSyncDelete(serverDb *db, robj *key); @@ -3585,14 +3589,12 @@ robj *dbUnshareStringValue(serverDb *db, robj *key, robj *o); #define EMPTYDB_NO_FLAGS 0 /* No flags. */ #define EMPTYDB_ASYNC (1 << 0) /* Reclaim memory in another thread. */ #define EMPTYDB_NOFUNCTIONS (1 << 1) /* Indicate not to flush the functions. */ -long long emptyData(int dbnum, int flags, void(callback)(dict *)); -long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(dict *)); +long long emptyData(int dbnum, int flags, void(callback)(hashtable *)); +long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callback)(hashtable *)); void flushAllDataAndResetRDB(int flags); long long dbTotalServerKeyCount(void); serverDb *initTempDb(void); void discardTempDb(serverDb *tempDb); - - int selectDb(client *c, int id); void signalModifiedKey(client *c, serverDb *db, robj *key); void signalFlushedDb(int dbid, int async); diff --git a/src/sort.c b/src/sort.c index ad0496da79..b1723daff0 100644 --- a/src/sort.c +++ b/src/sort.c @@ -579,7 +579,10 @@ void sortCommandGeneric(client *c, int readonly) { } if (outputlen) { listTypeTryConversion(sobj, LIST_CONV_AUTO, NULL, NULL); - setKey(c, c->db, storekey, sobj, 0); + setKey(c, c->db, storekey, &sobj, 0); + /* Ownership of sobj transferred to the db. Set to NULL to prevent + * freeing it below. 
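+         * This is the general contract of the new robj ** interfaces: the
+         * db may swap in a reallocated object (e.g. to embed the key),
+         * *valref is updated to point at it, and ownership moves to the
+         * db. A caller that keeps using the value afterwards takes its
+         * own reference, roughly:
+         *
+         *   setKey(c, c->db, key, &obj, 0);
+         *   incrRefCount(obj);   // only if obj is still used afterwards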
*/ + sobj = NULL; notifyKeyspaceEvent(NOTIFY_LIST, "sortstore", storekey, c->db->id); server.dirty += outputlen; } else if (dbDelete(c->db, storekey)) { @@ -587,7 +590,7 @@ void sortCommandGeneric(client *c, int readonly) { notifyKeyspaceEvent(NOTIFY_GENERIC, "del", storekey, c->db->id); server.dirty++; } - decrRefCount(sobj); + if (sobj != NULL) decrRefCount(sobj); addReplyLongLong(c, outputlen); } diff --git a/src/t_hash.c b/src/t_hash.c index dabe279808..1aa37968b7 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -432,7 +432,7 @@ robj *hashTypeLookupWriteOrCreate(client *c, robj *key) { if (o == NULL) { o = createHashObject(); - dbAdd(c->db, key, o); + dbAdd(c->db, key, &o); } return o; } diff --git a/src/t_list.c b/src/t_list.c index ffe3e9b08a..57a47ee681 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -471,7 +471,7 @@ void pushGenericCommand(client *c, int where, int xx) { } lobj = createListListpackObject(); - dbAdd(c->db, c->argv[1], lobj); + dbAdd(c->db, c->argv[1], &lobj); } listTypeTryConversionAppend(lobj, c->argv, 2, c->argc - 1, NULL, NULL); @@ -1068,7 +1068,7 @@ void lmoveHandlePush(client *c, robj *dstkey, robj *dstobj, robj *value, int whe /* Create the list if the key does not exist */ if (!dstobj) { dstobj = createListListpackObject(); - dbAdd(c->db, dstkey, dstobj); + dbAdd(c->db, dstkey, &dstobj); } listTypeTryConversionAppend(dstobj, &value, 0, 0, NULL, NULL); listTypePush(dstobj, value, where); diff --git a/src/t_set.c b/src/t_set.c index a540c3c49b..997fa2f5c9 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -595,7 +595,7 @@ void saddCommand(client *c) { if (set == NULL) { set = setTypeCreate(c->argv[2]->ptr, c->argc - 2); - dbAdd(c->db, c->argv[1], set); + dbAdd(c->db, c->argv[1], &set); } else { setTypeMaybeConvert(set, c->argc - 2); } @@ -674,7 +674,7 @@ void smoveCommand(client *c) { /* Create the destination set when it doesn't exist */ if (!dstset) { dstset = setTypeCreate(ele->ptr, 1); - dbAdd(c->db, c->argv[2], dstset); + dbAdd(c->db, c->argv[2], &dstset); } signalModifiedKey(c, c->db, c->argv[1]); @@ -919,7 +919,7 @@ void spopWithCountCommand(client *c) { setTypeReleaseIterator(si); /* Assign the new set as the key value. */ - dbReplaceValue(c->db, c->argv[1], newset); + dbReplaceValue(c->db, c->argv[1], &newset); } /* Replicate/AOF the remaining elements as an SREM operation */ @@ -1383,7 +1383,7 @@ void sinterGenericCommand(client *c, * frequent reallocs. Therefore, we shrink it now. */ dstset->ptr = lpShrinkToFit(dstset->ptr); } - setKey(c, c->db, dstkey, dstset, 0); + setKey(c, c->db, dstkey, &dstset, 0); addReplyLongLong(c, setTypeSize(dstset)); notifyKeyspaceEvent(NOTIFY_SET, "sinterstore", dstkey, c->db->id); server.dirty++; @@ -1394,8 +1394,8 @@ void sinterGenericCommand(client *c, signalModifiedKey(c, c->db, dstkey); notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); } + decrRefCount(dstset); } - decrRefCount(dstset); } else { setDeferredSetLen(c, replylen, cardinality); } @@ -1607,7 +1607,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* If we have a target key where to store the resulting set * create this key with the result set inside */ if (setTypeSize(dstset) > 0) { - setKey(c, c->db, dstkey, dstset, 0); + setKey(c, c->db, dstkey, &dstset, 0); addReplyLongLong(c, setTypeSize(dstset)); notifyKeyspaceEvent(NOTIFY_SET, op == SET_OP_UNION ? 
"sunionstore" : "sdiffstore", dstkey, c->db->id); server.dirty++; @@ -1618,8 +1618,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke signalModifiedKey(c, c->db, dstkey); notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); } + decrRefCount(dstset); } - decrRefCount(dstset); } zfree(sets); } diff --git a/src/t_stream.c b/src/t_stream.c index a42822dabc..79aa080703 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -1839,7 +1839,7 @@ robj *streamTypeLookupWriteOrCreate(client *c, robj *key, int no_create) { return NULL; } o = createStreamObject(); - dbAdd(c->db, key, o); + dbAdd(c->db, key, &o); } return o; } @@ -2645,7 +2645,7 @@ void xgroupCommand(client *c) { if (s == NULL) { serverAssert(mkstream); o = createStreamObject(); - dbAdd(c->db, c->argv[2], o); + dbAdd(c->db, c->argv[2], &o); s = o->ptr; signalModifiedKey(c, c->db, c->argv[2]); } diff --git a/src/t_string.c b/src/t_string.c index 0dfebee038..da8953ee08 100644 --- a/src/t_string.c +++ b/src/t_string.c @@ -77,6 +77,8 @@ static int checkStringLength(client *c, long long size, long long append) { #define OBJ_PXAT (1 << 7) /* Set if timestamp in ms is given */ #define OBJ_PERSIST (1 << 8) /* Set if we need to remove the ttl */ #define OBJ_SET_IFEQ (1 << 9) /* Set if we need compare and set */ +#define OBJ_ARGV3 (1 << 10) /* Set if the value is at argv[3]; otherwise it's \ + * at argv[2]. */ /* Forward declaration */ static int getExpireMillisecondsOrReply(client *c, robj *expire, int flags, int unit, long long *milliseconds); @@ -145,12 +147,18 @@ void setGenericCommand(client *c, setkey_flags |= ((flags & OBJ_KEEPTTL) || expire) ? SETKEY_KEEPTTL : 0; setkey_flags |= found ? SETKEY_ALREADY_EXIST : SETKEY_DOESNT_EXIST; - setKey(c, c->db, key, val, setkey_flags); + setKey(c, c->db, key, &val, setkey_flags); + if (expire) val = setExpire(c, c->db, key, milliseconds); + + /* By setting the reallocated value back into argv, we can avoid duplicating + * a large string value when adding it to the db. */ + c->argv[(flags & OBJ_ARGV3) ? 3 : 2] = val; + incrRefCount(val); + server.dirty++; notifyKeyspaceEvent(NOTIFY_STRING, "set", key, c->db->id); if (expire) { - setExpire(c, c->db, key, milliseconds); /* Propagate as SET Key Value PXAT millisecond-timestamp if there is * EX/PX/EXAT flag. */ if (!(flags & OBJ_PXAT)) { @@ -359,12 +367,12 @@ void setnxCommand(client *c) { void setexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_EX, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL, NULL); + setGenericCommand(c, OBJ_EX | OBJ_ARGV3, c->argv[1], c->argv[3], c->argv[2], UNIT_SECONDS, NULL, NULL, NULL); } void psetexCommand(client *c) { c->argv[3] = tryObjectEncoding(c->argv[3]); - setGenericCommand(c, OBJ_PX, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL, NULL); + setGenericCommand(c, OBJ_PX | OBJ_ARGV3, c->argv[1], c->argv[3], c->argv[2], UNIT_MILLISECONDS, NULL, NULL, NULL); } int getGenericCommand(client *c) { @@ -439,7 +447,7 @@ void getexCommand(client *c) { * has already elapsed so delete the key in that case. */ deleteExpiredKeyFromOverwriteAndPropagate(c, c->argv[1]); } else if (expire) { - setExpire(c, c->db, c->argv[1], milliseconds); + o = setExpire(c, c->db, c->argv[1], milliseconds); /* Propagate as PXEXPIREAT millisecond-timestamp if there is * EX/PX/EXAT/PXAT flag and the key has not expired. 
*/ robj *milliseconds_obj = createStringObjectFromLongLong(milliseconds); @@ -472,7 +480,8 @@ void getdelCommand(client *c) { void getsetCommand(client *c) { if (getGenericCommand(c) == C_ERR) return; c->argv[2] = tryObjectEncoding(c->argv[2]); - setKey(c, c->db, c->argv[1], c->argv[2], 0); + setKey(c, c->db, c->argv[1], &c->argv[2], 0); + incrRefCount(c->argv[2]); notifyKeyspaceEvent(NOTIFY_STRING, "set", c->argv[1], c->db->id); server.dirty++; @@ -506,7 +515,7 @@ void setrangeCommand(client *c) { return; o = createObject(OBJ_STRING, sdsnewlen(NULL, offset + sdslen(value))); - dbAdd(c->db, c->argv[1], o); + dbAdd(c->db, c->argv[1], &o); } else { size_t olen; @@ -620,8 +629,10 @@ void msetGenericCommand(client *c, int nx) { int setkey_flags = nx ? SETKEY_DOESNT_EXIST : 0; for (j = 1; j < c->argc; j += 2) { - c->argv[j + 1] = tryObjectEncoding(c->argv[j + 1]); - setKey(c, c->db, c->argv[j], c->argv[j + 1], setkey_flags); + robj *val = tryObjectEncoding(c->argv[j + 1]); + setKey(c, c->db, c->argv[j], &val, setkey_flags); + incrRefCount(val); + c->argv[j + 1] = val; notifyKeyspaceEvent(NOTIFY_STRING, "set", c->argv[j], c->db->id); /* In MSETNX, It could be that we're overriding the same key, we can't be sure it doesn't exist. */ if (nx) @@ -656,16 +667,15 @@ void incrDecrCommand(client *c, long long incr) { value += incr; if (o && o->refcount == 1 && o->encoding == OBJ_ENCODING_INT && - (value < 0 || value >= OBJ_SHARED_INTEGERS) && value >= LONG_MIN && value <= LONG_MAX) { new = o; o->ptr = (void *)((long)value); } else { new = createStringObjectFromLongLongForValue(value); if (o) { - dbReplaceValue(c->db, c->argv[1], new); + dbReplaceValue(c->db, c->argv[1], &new); } else { - dbAdd(c->db, c->argv[1], new); + dbAdd(c->db, c->argv[1], &new); } } signalModifiedKey(c, c->db, c->argv[1]); @@ -718,9 +728,9 @@ void incrbyfloatCommand(client *c) { } new = createStringObjectFromLongDouble(value, 1); if (o) - dbReplaceValue(c->db, c->argv[1], new); + dbReplaceValue(c->db, c->argv[1], &new); else - dbAdd(c->db, c->argv[1], new); + dbAdd(c->db, c->argv[1], &new); signalModifiedKey(c, c->db, c->argv[1]); notifyKeyspaceEvent(NOTIFY_STRING, "incrbyfloat", c->argv[1], c->db->id); server.dirty++; @@ -742,7 +752,7 @@ void appendCommand(client *c) { if (o == NULL) { /* Create the key */ c->argv[2] = tryObjectEncoding(c->argv[2]); - dbAdd(c->db, c->argv[1], c->argv[2]); + dbAdd(c->db, c->argv[1], &c->argv[2]); incrRefCount(c->argv[2]); totlen = stringObjectLen(c->argv[2]); } else { diff --git a/src/t_zset.c b/src/t_zset.c index 36a9bfffb1..105d57b7c3 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -1865,7 +1865,7 @@ void zaddGenericCommand(client *c, int flags) { if (zobj == NULL) { if (xx) goto reply_to_client; /* No key + XX option: nothing to do. */ zobj = zsetTypeCreate(elements, maxelelen); - dbAdd(c->db, key, zobj); + dbAdd(c->db, key, &zobj); } else { zsetTypeMaybeConvert(zobj, elements, maxelelen); } @@ -2844,7 +2844,7 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in if (dstkey) { if (dstzset->zsl->length) { zsetConvertToListpackIfNeeded(dstobj, maxelelen, totelelen); - setKey(c, c->db, dstkey, dstobj, 0); + setKey(c, c->db, dstkey, &dstobj, 0); addReplyLongLong(c, zsetLength(dstobj)); notifyKeyspaceEvent( NOTIFY_ZSET, (op == SET_OP_UNION) ? "zunionstore" : (op == SET_OP_INTER ? 
"zinterstore" : "zdiffstore"), @@ -2857,8 +2857,8 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in notifyKeyspaceEvent(NOTIFY_GENERIC, "del", dstkey, c->db->id); server.dirty++; } + decrRefCount(dstobj); } - decrRefCount(dstobj); } else if (cardinality_only) { addReplyLongLong(c, cardinality); } else { @@ -3047,7 +3047,7 @@ static void zrangeResultEmitLongLongForStore(zrange_result_handler *handler, lon static void zrangeResultFinalizeStore(zrange_result_handler *handler, size_t result_count) { if (result_count) { - setKey(handler->client, handler->client->db, handler->dstkey, handler->dstobj, 0); + setKey(handler->client, handler->client->db, handler->dstkey, &handler->dstobj, 0); addReplyLongLong(handler->client, result_count); notifyKeyspaceEvent(NOTIFY_ZSET, "zrangestore", handler->dstkey, handler->client->db->id); server.dirty++; @@ -3058,8 +3058,8 @@ static void zrangeResultFinalizeStore(zrange_result_handler *handler, size_t res notifyKeyspaceEvent(NOTIFY_GENERIC, "del", handler->dstkey, handler->client->db->id); server.dirty++; } + decrRefCount(handler->dstobj); } - decrRefCount(handler->dstobj); } /* Initialize the consumer interface type with the requested type. */ diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 1de84b344f..f25e320452 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -44,10 +44,10 @@ int test_intsetUpgradeFromint32Toint64(int argc, char **argv, int flags); int test_intsetStressLookups(int argc, char **argv, int flags); int test_intsetStressAddDelete(int argc, char **argv, int flags); int test_kvstoreAdd16Keys(int argc, char **argv, int flags); -int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags); -int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags); +int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags); int test_listpackCreateIntList(int argc, char **argv, int flags); int test_listpackCreateList(int argc, char **argv, int flags); int test_listpackLpPrepend(int argc, char **argv, int flags); @@ -102,6 +102,7 @@ int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_object_with_key(int argc, char **argv, int flags); int test_quicklistCreateList(int argc, char **argv, int flags); int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); @@ -233,9 +234,10 @@ unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; unitTest __test_hashtable_c[] = {{"test_cursor", test_cursor}, {"test_set_hash_function_seed", test_set_hash_function_seed}, {"test_add_find_delete", 
test_add_find_delete}, {"test_add_find_delete_avoid_resize", test_add_find_delete_avoid_resize}, {"test_instant_rehashing", test_instant_rehashing}, {"test_bucket_chain_length", test_bucket_chain_length}, {"test_two_phase_insert_and_pop", test_two_phase_insert_and_pop}, {"test_replace_reallocated_entry", test_replace_reallocated_entry}, {"test_incremental_find", test_incremental_find}, {"test_scan", test_scan}, {"test_iterator", test_iterator}, {"test_safe_iterator", test_safe_iterator}, {"test_compact_bucket_chain", test_compact_bucket_chain}, {"test_random_entry", test_random_entry}, {"test_random_entry_with_long_chain", test_random_entry_with_long_chain}, {"test_all_memory_freed", test_all_memory_freed}, {NULL, NULL}}; unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; -unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; +unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable}, {"test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable", test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, 
{"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; +unitTest __test_object_c[] = {{"test_object_with_key", test_object_with_key}, {NULL, NULL}}; unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, 
{"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, {"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", 
test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, {"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; @@ -259,6 +261,7 @@ struct unitTestSuite { {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, {"test_networking.c", __test_networking_c}, + {"test_object.c", __test_object_c}, {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index 062b9f32fc..d4cc91d6d8 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -2,22 +2,26 @@ #include "test_help.h" uint64_t hashTestCallback(const void *key) { - return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); + return hashtableGenHashFunction((char *)key, strlen((char *)key)); +} + +int cmpTestCallback(const void *k1, const void *k2) { + return strcmp(k1, k2); } void freeTestCallback(void *val) { zfree(val); } -dictType KvstoreDictTestType = {hashTestCallback, - NULL, - NULL, - freeTestCallback, - NULL, - NULL, - kvstoreDictRehashingStarted, - kvstoreDictRehashingCompleted, - kvstoreDictMetadataSize}; +hashtableType KvstoreHashtableTestType = { + .hashFunction = hashTestCallback, + .keyCompare = cmpTestCallback, + .entryDestructor = freeTestCallback, + .rehashingStarted = kvstoreHashtableRehashingStarted, + .rehashingCompleted = kvstoreHashtableRehashingCompleted, + .trackMemUsage = kvstoreHashtableTrackMemUsage, + .getMetadataSize = kvstoreHashtableMetadataSize, +}; char *stringFromInt(int value) { char buf[32]; @@ -37,21 +41,18 @@ int test_kvstoreAdd16Keys(int argc, char **argv, int flags) { UNUSED(flags); int i; - dictEntry *de; int didx = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); - kvstore *kvs2 = 
kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 16); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 16); TEST_ASSERT(kvstoreSize(kvs1) == 16); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 16); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 16); TEST_ASSERT(kvstoreSize(kvs2) == 16); kvstoreRelease(kvs1); @@ -59,144 +60,132 @@ int test_kvstoreAdd16Keys(int argc, char **argv, int flags) { return 0; } -int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; kvstoreIterator *kvs_it; int didx = 0; int curr_slot = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); } kvs_it = kvstoreIteratorInit(kvs1); - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs1, curr_slot, key) == DICT_OK); + while (kvstoreIteratorNext(kvs_it, &key)) { + curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); + TEST_ASSERT(kvstoreHashtableDelete(kvs1, curr_slot, key)); } kvstoreIteratorRelease(kvs_it); - dict *d = kvstoreGetDict(kvs1, didx); - TEST_ASSERT(d != NULL); - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs1, didx); + TEST_ASSERT(ht != NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 0); TEST_ASSERT(kvstoreSize(kvs1) == 0); kvstoreRelease(kvs1); return 0; } -int test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; kvstoreIterator *kvs_it; int didx = 0; int curr_slot = 0; - kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } kvs_it = kvstoreIteratorInit(kvs2); - while ((de = kvstoreIteratorNext(kvs_it)) != NULL) { - curr_slot = kvstoreIteratorGetCurrentDictIndex(kvs_it); - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs2, curr_slot, key) == DICT_OK); + while 
(kvstoreIteratorNext(kvs_it, &key)) { + curr_slot = kvstoreIteratorGetCurrentHashtableIndex(kvs_it); + TEST_ASSERT(kvstoreHashtableDelete(kvs2, curr_slot, key)); } kvstoreIteratorRelease(kvs_it); - /* Make sure the dict was removed from the rehashing list. */ + /* Make sure the hashtable was removed from the rehashing list. */ while (kvstoreIncrementallyRehash(kvs2, 1000)) { } - dict *d = kvstoreGetDict(kvs2, didx); - TEST_ASSERT(d == NULL); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs2, didx); + TEST_ASSERT(ht == NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 0); TEST_ASSERT(kvstoreSize(kvs2) == 0); kvstoreRelease(kvs2); return 0; } -int test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreHashtableIteratorRemoveAllKeysNoDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; - kvstoreDictIterator *kvs_di; + kvstoreHashtableIterator *kvs_di; int didx = 0; - kvstore *kvs1 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND); + kvstore *kvs1 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs1, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs1, didx, stringFromInt(i))); } - kvs_di = kvstoreGetDictSafeIterator(kvs1, didx); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs1, didx, key) == DICT_OK); + kvs_di = kvstoreGetHashtableSafeIterator(kvs1, didx); + while (kvstoreHashtableIteratorNext(kvs_di, &key)) { + TEST_ASSERT(kvstoreHashtableDelete(kvs1, didx, key)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); - dict *d = kvstoreGetDict(kvs1, didx); - TEST_ASSERT(d != NULL); - TEST_ASSERT(kvstoreDictSize(kvs1, didx) == 0); + hashtable *ht = kvstoreGetHashtable(kvs1, didx); + TEST_ASSERT(ht != NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs1, didx) == 0); TEST_ASSERT(kvstoreSize(kvs1) == 0); kvstoreRelease(kvs1); return 0; } -int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags) { +int test_kvstoreHashtableIteratorRemoveAllKeysDeleteEmptyHashtable(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); UNUSED(flags); int i; void *key; - dictEntry *de; - kvstoreDictIterator *kvs_di; + kvstoreHashtableIterator *kvs_di; int didx = 0; - kvstore *kvs2 = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + kvstore *kvs2 = kvstoreCreate(&KvstoreHashtableTestType, 0, KVSTORE_ALLOCATE_HASHTABLES_ON_DEMAND | KVSTORE_FREE_EMPTY_HASHTABLES); for (i = 0; i < 16; i++) { - de = kvstoreDictAddRaw(kvs2, didx, stringFromInt(i), NULL); - TEST_ASSERT(de != NULL); + TEST_ASSERT(kvstoreHashtableAdd(kvs2, didx, stringFromInt(i))); } - kvs_di = kvstoreGetDictSafeIterator(kvs2, didx); - while ((de = kvstoreDictIteratorNext(kvs_di)) != NULL) { - key = dictGetKey(de); - TEST_ASSERT(kvstoreDictDelete(kvs2, didx, key) == DICT_OK); + kvs_di = kvstoreGetHashtableSafeIterator(kvs2, didx); + while (kvstoreHashtableIteratorNext(kvs_di, &key)) { + TEST_ASSERT(kvstoreHashtableDelete(kvs2, didx, key)); } - kvstoreReleaseDictIterator(kvs_di); + kvstoreReleaseHashtableIterator(kvs_di); - dict *d = kvstoreGetDict(kvs2, didx); - TEST_ASSERT(d == NULL); - TEST_ASSERT(kvstoreDictSize(kvs2, didx) == 
0); + hashtable *ht = kvstoreGetHashtable(kvs2, didx); + TEST_ASSERT(ht == NULL); + TEST_ASSERT(kvstoreHashtableSize(kvs2, didx) == 0); TEST_ASSERT(kvstoreSize(kvs2) == 0); kvstoreRelease(kvs2); diff --git a/src/unit/test_object.c b/src/unit/test_object.c new file mode 100644 index 0000000000..995400c3d9 --- /dev/null +++ b/src/unit/test_object.c @@ -0,0 +1,50 @@ +#include "../object.c" +#include "test_help.h" + +#include +#include +#include +#include +#include + + +int test_object_with_key(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + sds key = sdsnew("foo"); + robj *val = createStringObject("bar", strlen("bar")); + TEST_ASSERT(val->encoding == OBJ_ENCODING_EMBSTR); + + /* Prevent objectSetKeyAndExpire from freeing the old val when reallocating it. */ + incrRefCount(val); + + /* Create valkey: val with key. */ + robj *valkey = objectSetKeyAndExpire(val, key, -1); + TEST_ASSERT(valkey->encoding == OBJ_ENCODING_EMBSTR); + TEST_ASSERT(objectGetKey(valkey) != NULL); + + /* Check embedded key "foo" */ + TEST_ASSERT(sdslen(objectGetKey(valkey)) == 3); + TEST_ASSERT(sdslen(key) == 3); + TEST_ASSERT(sdscmp(objectGetKey(valkey), key) == 0); + TEST_ASSERT(strcmp(objectGetKey(valkey), "foo") == 0); + + /* Check embedded value "bar" (EMBSTR content) */ + TEST_ASSERT(sdscmp(valkey->ptr, val->ptr) == 0); + TEST_ASSERT(strcmp(valkey->ptr, "bar") == 0); + + /* Either they're two separate objects, or one object with refcount == 2. */ + if (valkey == val) { + TEST_ASSERT(valkey->refcount == 2); + } else { + TEST_ASSERT(valkey->refcount == 1); + TEST_ASSERT(val->refcount == 1); + } + + /* Free them. */ + sdsfree(key); + decrRefCount(val); + decrRefCount(valkey); + return 0; +} diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index 6344215a25..0c15af74f9 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -499,10 +499,10 @@ if {!$::tls} { ;# fake_redis_node doesn't support TLS assert_equal 1000 [llength [split [run_cli --scan]]] # pattern - assert_equal {key:2} [run_cli --scan --pattern "*:2"] + assert_equal {key:2} [split [run_cli --scan --pattern "*:2"]] # pattern matching with a quoted string - assert_equal {key:2} [run_cli --scan --quoted-pattern {"*:\x32"}] + assert_equal {key:2} [split [run_cli --scan --quoted-pattern {"*:\x32"}]] } proc test_valkey_cli_repl {} { diff --git a/tests/support/util.tcl b/tests/support/util.tcl index e53cda3071..8f3cda3a4c 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -931,6 +931,30 @@ proc debug_digest {{level 0}} { r $level debug digest } +proc main_hash_table_size {{level 0}} { + set dbnum [expr {$::singledb ? 0 : 9}] + append re \ + {^\[Dictionary HT\]\n} \ + {Hash table 0 stats \(main hash table\):\n} \ + { table size: (\d+)} + regexp $re [r $level DEBUG HTSTATS $dbnum] -> table_size + return $table_size +} + +# Returns the number of keys that can be added before rehashing starts. Insert +# this number of keys and no rehashing happens. Insert one more key and +# rehashing can be triggered by the cron function. Insert two more keys and +# rehashing is triggered immediately. +proc main_hash_table_keys_before_rehashing_starts {{level 0}} { + # This fill factor is defined internally in hashtable.c and duplicated here. + # If we change the fill factor, this needs to be updated accordingly. 
+ set MAX_FILL_PERCENT_SOFT 100 + set table_size [main_hash_table_size $level] + set dbsize [r $level dbsize] + set free_space [expr {$table_size * $MAX_FILL_PERCENT_SOFT / 100 - $dbsize - 1}] + return $free_space +} + proc wait_for_blocked_client {{idx 0}} { wait_for_condition 50 100 { [s $idx blocked_clients] ne 0 diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index c5c11191c0..e1dcc9203b 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -939,7 +939,7 @@ start_cluster 1 0 {tags {"expire external:skip cluster"}} { # hashslot(foo) is 12182 # fill data across different slots with expiration - for {set j 1} {$j <= 100} {incr j} { + for {set j 1} {$j <= 1000} {incr j} { r psetex "{foo}$j" 500 a } # hashslot(key) is 12539 @@ -950,7 +950,7 @@ start_cluster 1 0 {tags {"expire external:skip cluster"}} { r debug dict-resizing 0 # delete data to have lot's (99%) of empty buckets (slot 12182 should be skipped) - for {set j 1} {$j <= 99} {incr j} { + for {set j 1} {$j <= 999} {incr j} { r del "{foo}$j" } @@ -976,7 +976,9 @@ start_cluster 1 0 {tags {"expire external:skip cluster"}} { r debug dict-resizing 1 # put some data into slot 12182 and trigger the resize + # by deleting it to trigger shrink r psetex "{foo}0" 500 a + r del "{foo}0" # Verify all keys have expired wait_for_condition 400 100 { diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 278a1d8e33..cf7f633a8c 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -515,23 +515,43 @@ start_server {tags {"info" "external:skip"}} { set info_mem [r info memory] set mem_stats [r memory stats] assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0} - assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 64 + # overhead.db.hashtable.lut = memory overhead of hashset including hashset struct and tables + set hashset_overhead [dict get $mem_stats overhead.db.hashtable.lut] + if {$hashset_overhead < 140} { + # 32-bit version (hashset struct + 1 bucket of 64 bytes) + set bits 32 + } else { + set bits 64 + } + assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 256 assert_equal [dict get $mem_stats overhead.db.hashtable.rehashing] {0} assert_equal [dict get $mem_stats db.dict.rehashing.count] {0} - # set 4 more keys to trigger rehashing + # set 7 more keys to trigger rehashing # get the info within a transaction to make sure the rehashing is not completed - r multi + r multi r set b c r set c d r set d e r set e f + r set f g + r set g h + r set h i + if {$bits == 32} { + # In 32-bit mode, we have 12 elements per bucket. Insert five more + # to trigger rehashing. 
+ r set aa aa + r set bb bb + r set cc cc + r set dd dd + r set ee ee + } r info memory r memory stats set res [r exec] - set info_mem [lindex $res 4] - set mem_stats [lindex $res 5] + set info_mem [lindex $res end-1] + set mem_stats [lindex $res end] assert_range [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] 1 64 - assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 192 + assert_range [dict get $mem_stats overhead.db.hashtable.lut] 1 300 assert_range [dict get $mem_stats overhead.db.hashtable.rehashing] 1 64 assert_equal [dict get $mem_stats db.dict.rehashing.count] {1} } diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index 3b0a44a156..5b76f44645 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -145,45 +145,6 @@ start_server {tags {"maxmemory" "external:skip"}} { } start_server {tags {"maxmemory external:skip"}} { - test "Without maxmemory small integers are shared" { - r config set maxmemory 0 - r set a 1 - assert_refcount_morethan a 1 - } - - test "With maxmemory and non-LRU policy integers are still shared" { - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-random - r set a 1 - assert_refcount_morethan a 1 - } - - test "With maxmemory and LRU policy integers are not shared" { - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-lru - r set a 1 - r config set maxmemory-policy volatile-lru - r set b 1 - assert_refcount 1 a - assert_refcount 1 b - r config set maxmemory 0 - } - - test "Shared integers are unshared with maxmemory and LRU policy" { - r set a 1 - r set b 1 - assert_refcount_morethan a 1 - assert_refcount_morethan b 1 - r config set maxmemory 1073741824 - r config set maxmemory-policy allkeys-lru - r get a - assert_refcount 1 a - r config set maxmemory-policy volatile-lru - r get b - assert_refcount 1 b - r config set maxmemory 0 - } - foreach policy { allkeys-random allkeys-lru allkeys-lfu volatile-lru volatile-lfu volatile-random volatile-ttl } { @@ -265,10 +226,10 @@ start_server {tags {"maxmemory external:skip"}} { # make sure to start with a blank instance r flushall # Get the current memory limit and calculate a new limit. - # We just add 100k to the current memory size so that it is + # We just add 400KiB to the current memory size so that it is # fast for us to reach that limit. set used [s used_memory] - set limit [expr {$used+100*1024}] + set limit [expr {$used+400*1024}] r config set maxmemory $limit r config set maxmemory-policy $policy # Now add keys until the limit is almost reached. @@ -435,25 +396,37 @@ start_server {tags {"maxmemory external:skip"}} { r config set latency-tracking no r config set maxmemory 0 r config set maxmemory-policy allkeys-random + set dbnum [expr {$::singledb ? 0 : 9}] - # Next rehash size is 8192, that will eat 64k memory - populate 4095 "" 1 + # Populate some, then check table size and populate more up to one less + # than the soft maximum fill factor. Adding some more elements after + # this does not trigger rehashing, because rehashing would eat some + # kilobytes of memory. + populate 2000 a 1 + set table_size [main_hash_table_size] + populate [main_hash_table_keys_before_rehashing_starts] b 1 + # Now we are close to resizing. Check that rehashing didn't start. 
+ assert_equal $table_size [main_hash_table_size] + assert_no_match "*Hash table 1 stats*" [r debug htstats $dbnum] + + set dbsize_before [r dbsize] set used [s used_memory] set limit [expr {$used + 10*1024}] r config set maxmemory $limit # Adding a key to meet the 1:1 radio. r set k0 v0 - # The dict has reached 4096, it can be resized in tryResizeHashTables in cron, + # The table has reached the soft max fill factor. + # It can be resized in tryResizeHashTables in cron, # or we add a key to let it check whether it can be resized. r set k1 v1 # Next writing command will trigger evicting some keys if last # command trigger DB dict rehash r set k2 v2 - # There must be 4098 keys because the server doesn't evict keys. - r dbsize - } {4098} + # There must be three more keys because the server doesn't evict keys. + assert_equal [r dbsize] [expr {$dbsize_before + 3}] + } } # Skip the following test when running with IO threads diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 6e6230fc19..bb08c67471 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -396,7 +396,16 @@ start_server {tags {"other external:skip"}} { r config set save "" r config set rdb-key-save-delay 1000000 - populate 4095 "" 1 + # Populate some, then check table size and populate more up to one less + # than the soft maximum fill factor. + populate 2000 a 1 + set table_size [main_hash_table_size] + populate [main_hash_table_keys_before_rehashing_starts] b 1 + + # Now we are close to resizing. Check that rehashing didn't start. + assert_equal $table_size [main_hash_table_size] + assert_no_match "*Hash table 1 stats*" [r debug htstats 9] + r bgsave wait_for_condition 10 100 { [s rdb_bgsave_in_progress] eq 1 @@ -406,14 +415,15 @@ start_server {tags {"other external:skip"}} { r mset k1 v1 k2 v2 # Hash table should not rehash - assert_no_match "*table size: 8192*" [r debug HTSTATS 9] + assert_equal $table_size [main_hash_table_size] + assert_no_match "*Hash table 1 stats*" [r debug htstats 9] exec kill -9 [get_child_pid 0] waitForBgsave r # Hash table should rehash since there is no child process, - # size is power of two and over 4096, so it is 8192 + # so the resize limit is restored. wait_for_condition 50 100 { - [string match "*table size: 8192*" [r debug HTSTATS 9]] + [main_hash_table_size] > $table_size } else { fail "hash table did not rehash after child process killed" } @@ -472,7 +482,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 128} {incr j} { r set "{foo}$j" a } - assert_match "*table size: 128*" [r debug HTSTATS 0] + set table_size [main_hash_table_size] # disable resizing, the reason for not using slow bgsave is because # it will hit the dict_force_resize_ratio. @@ -482,14 +492,14 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 123} {incr j} { r del "{foo}$j" } - assert_match "*table size: 128*" [r debug HTSTATS 0] + assert_equal $table_size [main_hash_table_size] # enable resizing r debug dict-resizing 1 # waiting for serverCron to resize the tables wait_for_condition 1000 10 { - [string match {*table size: 8*} [r debug HTSTATS 0]] + [main_hash_table_size] < $table_size } else { puts [r debug HTSTATS 0] fail "hash tables weren't resize." 
@@ -503,6 +513,7 @@ start_cluster 1 0 {tags {"other external:skip cluster slow"}} { for {set j 1} {$j <= 128} {incr j} { r set "{alice}$j" a } + set table_size [main_hash_table_size] # disable resizing, the reason for not using slow bgsave is because # it will hit the dict_force_resize_ratio. @@ -517,7 +528,7 @@ # waiting for serverCron to resize the tables wait_for_condition 1000 10 { - [string match {*table size: 16*} [r debug HTSTATS 0]] + [main_hash_table_size] < $table_size } else { puts [r debug HTSTATS 0] fail "hash tables weren't resize." @@ -537,8 +548,9 @@ start_server {tags {"other external:skip"}} { } # The dict containing 128 keys must have expanded, # its hash table itself takes a lot more than 400 bytes + set dbnum [expr {$::singledb ? 0 : 9}] wait_for_condition 100 50 { - [dict get [r memory stats] db.9 overhead.hashtable.main] < 400 + [dict get [r memory stats] db.$dbnum overhead.hashtable.main] < 400 } else { fail "dict did not resize in time" } diff --git a/tests/unit/type/incr.tcl b/tests/unit/type/incr.tcl index 4bc130bcb1..fd0a8d02d8 100644 --- a/tests/unit/type/incr.tcl +++ b/tests/unit/type/incr.tcl @@ -75,17 +75,6 @@ start_server {tags {"incr"}} { assert_equal {-1} [r decrby key_not_exist 1] } - test {INCR uses shared objects in the 0-9999 range} { - r set foo -1 - r incr foo - assert_refcount_morethan foo 1 - r set foo 9998 - r incr foo - assert_refcount_morethan foo 1 - r incr foo - assert_refcount 1 foo - } - test {INCR can modify objects in-place} { r set foo 20000 r incr foo From 1acf7f71c0324747de6b0ed9118f065dde0a5a92 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 11 Dec 2024 13:40:18 +0800 Subject: [PATCH 36/73] Fix memory leak in the new hashtable unittest (#1421) There is a leak here: hashtableTwoPhasePopDelete won't call the entry destructor, so, like hashtablePop, we need to call it ourselves. Signed-off-by: Binbin --- src/unit/test_hashtable.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/unit/test_hashtable.c b/src/unit/test_hashtable.c index 782fa0ee6a..689440e43d 100644 --- a/src/unit/test_hashtable.c +++ b/src/unit/test_hashtable.c @@ -303,6 +303,7 @@ int test_two_phase_insert_and_pop(int argc, char **argv, int flags) { TEST_ASSERT(hashtableSize(ht) == size_before_find); hashtableTwoPhasePopDelete(ht, &position); TEST_ASSERT(hashtableSize(ht) == size_before_find - 1); + free(e); } TEST_ASSERT(hashtableSize(ht) == 0); From 0c8ad5cd34129bacf5a3dda30b37b156e7cdfb98 Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Wed, 11 Dec 2024 09:47:06 -0800 Subject: [PATCH 37/73] defrag: allow defrag to start during AOF loading (#1420) Addresses https://github.com/valkey-io/valkey/issues/1393 Changes: * During AOF loading or a long-running script, this allows defrag to be initiated. * The AOF defrag test was corrected to eliminate the wait period and rely on non-timer invocations. * Logic for "overage" time in defrag was changed. It previously accumulated underage, leading to large latencies in extreme tests with a very high CPU percentage. After several simple stages were completed during infrequent blocked processing, a large cycle time would be experienced.
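For illustration, the corrected accounting boils down to something like the following sketch (illustrative helper names, not the actual defrag.c symbols; the real code keeps the balance in a DefragContext field):

    /* Run for target%/(100-target%) of the time waited since the last
     * cycle, after paying back any overshoot carried from prior cycles. */
    static long computeCycleUs(long waitedUs, int targetPct, long *overageUs) {
        long dutyUs = (long)targetPct * waitedUs / (100 - targetPct);
        dutyUs -= *overageUs; /* pay back previous overshoot */
        *overageUs = 0;
        return dutyUs > 0 ? dutyUs : 0;
    }

    /* At cycle end, bank only positive overage; underage from short
     * stages is dropped instead of accumulating as a credit. */
    static void recordCycleEnd(long actualUs, long intendedUs, long *overageUs) {
        *overageUs += actualUs - intendedUs;
        if (*overageUs < 0) *overageUs = 0;
    }

With underage no longer banked, a later long stage can at most be throttled for its own overshoot, which avoids the large one-off cycle times seen in the high-CPU-percentage tests.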
Signed-off-by: Jim Brunner --- src/defrag.c | 14 ++++++++++---- tests/unit/memefficiency.tcl | 6 +++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 057fdd50de..2fa067f0dc 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -84,7 +84,7 @@ struct DefragContext { long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) monotime timeproc_end_time; // Ending time of previous timerproc execution - long timeproc_overage_us; // A correction value if over/under target CPU percent + long timeproc_overage_us; // A correction value if over target CPU percent }; static struct DefragContext defrag; @@ -1157,7 +1157,7 @@ static int computeDefragCycleUs(void) { * the starvation of the timer. */ dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); - // Also adjust for any accumulated overage(underage). + // Also adjust for any accumulated overage. dutyCycleUs -= defrag.timeproc_overage_us; defrag.timeproc_overage_us = 0; @@ -1176,8 +1176,11 @@ static int computeDefragCycleUs(void) { * computeDefragCycleUs computation. */ static int computeDelayMs(monotime intendedEndtime) { defrag.timeproc_end_time = getMonotonicUs(); - int overage = defrag.timeproc_end_time - intendedEndtime; + long overage = defrag.timeproc_end_time - intendedEndtime; defrag.timeproc_overage_us += overage; // track over/under desired CPU + /* Allow negative overage (underage) to count against existing overage, but don't allow + * underage (from short stages) to be accumulated. */ + if (defrag.timeproc_overage_us < 0) defrag.timeproc_overage_us = 0; int targetCpuPercent = server.active_defrag_cpu_percent; serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); @@ -1189,7 +1192,7 @@ static int computeDelayMs(monotime intendedEndtime) { long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent; long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us; // Only increase delay by the fraction of the overage that would be non-duty-cycle - delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative + delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; if (delayUs < 0) delayUs = 0; long delayMs = delayUs / 1000; // round down return delayMs; @@ -1254,6 +1257,9 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i * actions. This interface allows defrag to continue running, avoiding a single long defrag step * after the long operation completes. */ void defragWhileBlocked(void) { + // This is called infrequently, while timers are not active. We might need to start defrag. + if (!defragIsRunning()) monitorActiveDefrag(); + if (!defragIsRunning()) return; // Save off the timeproc_id. If we have a normal termination, it will be cleared. diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index abd23b1d83..ce74b7c618 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -138,8 +138,12 @@ run_solo {defrag} { # reset stats and load the AOF file r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec + # Note: This test is checking if defrag is working DURING AOF loading (while + # timers are not active). So we don't give any extra time, and we deactivate + # defrag immediately after the AOF loading is complete. During loading, + # defrag will get invoked less often, causing starvation prevention. We + # should expect longer latency measurements. 
r debug loadaof - after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no # measure hits and misses right after aof loading From 5f7fe9ef21f1feae42257ac93ab33d8f8c06e97f Mon Sep 17 00:00:00 2001 From: Pierre <105686771+pieturin@users.noreply.github.com> Date: Wed, 11 Dec 2024 17:26:06 -0800 Subject: [PATCH 38/73] Send MEET packet to node if there is no inbound link to fix inconsistency when handshake timedout (#1307) In some cases, when meeting a new node, if the handshake times out, we can end up with an inconsistent view of the cluster where the new node knows about all the nodes in the cluster, but the cluster does not know about this new node (or vice versa). To detect this inconsistency, we now check if a node has an outbound link but no inbound link, in this case it probably means this node does not know us. In this case we (re-)send a MEET packet to this node to do a new handshake with it. If we receive a MEET packet from a known node, we disconnect the outbound link to force a reconnect and sending of a PING packet so that the other node recognizes the link as belonging to us. This prevents cases where a node could send MEET packets in a loop because it thinks the other node does not have an inbound link. This fixes the bug described in #1251. --------- Signed-off-by: Pierre Turin --- src/cluster_legacy.c | 99 ++++++--- src/cluster_legacy.h | 4 + tests/support/cluster_util.tcl | 9 + tests/unit/cluster/cluster-multiple-meets.tcl | 4 +- tests/unit/cluster/cluster-reliable-meet.tcl | 208 +++++++++++++++++- 5 files changed, 291 insertions(+), 33 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a273fe0d86..d1c6dd0094 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1336,6 +1336,10 @@ clusterLink *createClusterLink(clusterNode *node) { * with this link will have the 'link' field set to NULL. */ void freeClusterLink(clusterLink *link) { serverAssert(link != NULL); + serverLog(LL_DEBUG, "Freeing cluster link for node: %.40s:%s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound"); + if (link->conn) { connClose(link->conn); link->conn = NULL; @@ -1351,6 +1355,7 @@ void freeClusterLink(clusterLink *link) { } else if (link->node->inbound_link == link) { serverAssert(link->inbound); link->node->inbound_link = NULL; + link->node->inbound_link_freed_time = mstime(); } } zfree(link); @@ -1490,6 +1495,7 @@ clusterNode *createClusterNode(char *nodename, int flags) { node->fail_time = 0; node->link = NULL; node->inbound_link = NULL; + node->inbound_link_freed_time = node->ctime; memset(node->ip, 0, sizeof(node->ip)); node->announce_client_ipv4 = sdsempty(); node->announce_client_ipv6 = sdsempty(); @@ -1696,6 +1702,9 @@ void clusterAddNode(clusterNode *node) { * it is a replica node. */ void clusterDelNode(clusterNode *delnode) { + serverAssert(delnode != NULL); + serverLog(LL_DEBUG, "Deleting node %.40s from cluster view", delnode->name); + int j; dictIterator *di; dictEntry *de; @@ -2078,7 +2087,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) { /* Return 1 if we already have a node in HANDSHAKE state matching the * specified ip address and port number. This function is used in order to * avoid adding a new handshake node for the same address multiple times. 
*/ -int clusterHandshakeInProgress(char *ip, int port, int cport) { +static int clusterHandshakeInProgress(char *ip, int port, int cport) { dictIterator *di; dictEntry *de; @@ -2100,7 +2109,7 @@ int clusterHandshakeInProgress(char *ip, int port, int cport) { * * EAGAIN - There is already a handshake in progress for this address. * EINVAL - IP or port are not valid. */ -int clusterStartHandshake(char *ip, int port, int cport) { +static int clusterStartHandshake(char *ip, int port, int cport) { clusterNode *n; char norm_ip[NET_IP_STR_LEN]; struct sockaddr_storage sa; @@ -3207,33 +3216,48 @@ int clusterProcessPacket(clusterLink *link) { } } - /* Add this node if it is new for us and the msg type is MEET. - * In this stage we don't try to add the node with the right - * flags, replicaof pointer, and so forth, as this details will be - * resolved when we'll receive PONGs from the node. The exception - * to this is the flag that indicates extensions are supported, as - * we want to send extensions right away in the return PONG in order - * to reduce the amount of time needed to stabilize the shard ID. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) { - clusterNode *node; - - node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); - getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); - node->cport = ntohs(hdr->cport); - if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { - node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + if (type == CLUSTERMSG_TYPE_MEET) { + if (!sender) { + /* Add this node if it is new for us and the msg type is MEET. + * In this stage we don't try to add the node with the right + * flags, replicaof pointer, and so forth, as this details will be + * resolved when we'll receive PONGs from the node. The exception + * to this is the flag that indicates extensions are supported, as + * we want to send extensions right away in the return PONG in order + * to reduce the amount of time needed to stabilize the shard ID. */ + clusterNode *node; + + node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); + serverAssert(nodeIp2String(node->ip, link, hdr->myip) == C_OK); + getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); + node->cport = ntohs(hdr->cport); + if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { + node->flags |= CLUSTER_NODE_EXTENSIONS_SUPPORTED; + } + setClusterNodeToInboundClusterLink(node, link); + clusterAddNode(node); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + + /* If this is a MEET packet from an unknown node, we still process + * the gossip section here since we have to trust the sender because + * of the message type. */ + clusterProcessGossipSection(hdr, link); + } else if (sender->link && now - sender->ctime > server.cluster_node_timeout) { + /* The MEET packet is from a known node, after the handshake timeout, so the sender thinks that I do not + * know it. + * Freeing my outbound link to that node, to force a reconnect and sending a PING. + * Once that node receives our PING, it should recognize the new connection as an inbound link from me. + * We should only free the outbound link if the node is known for more time than the handshake timeout, + * since during this time, the other side might still be trying to complete the handshake. */ + + /* We should always receive a MEET packet on an inbound link. 
*/ + serverAssert(link != sender->link); + serverLog(LL_NOTICE, "Freeing outbound link to node %.40s after receiving a MEET packet from this known node", + sender->name); + freeClusterLink(sender->link); } - setClusterNodeToInboundClusterLink(node, link); - clusterAddNode(node); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } - /* If this is a MEET packet from an unknown node, we still process - * the gossip section here since we have to trust the sender because - * of the message type. */ - if (!sender && type == CLUSTERMSG_TYPE_MEET) clusterProcessGossipSection(hdr, link); - /* Anyway reply with a PONG */ clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } @@ -3243,7 +3267,7 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); - if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ sender->flags &= ~CLUSTER_NODE_MEET; serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, @@ -3668,7 +3692,7 @@ void clusterLinkConnectHandler(connection *conn) { * of a PING one, to force the receiver to add us in its node * table. */ mstime_t old_ping_sent = node->ping_sent; - clusterSendPing(link, node->flags & CLUSTER_NODE_MEET ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); + clusterSendPing(link, nodeInMeetState(node) ? CLUSTERMSG_TYPE_MEET : CLUSTERMSG_TYPE_PING); if (old_ping_sent) { /* If there was an active ping before the link was * disconnected, we want to restore the ping time, otherwise @@ -3747,7 +3771,9 @@ void clusterReadHandler(connection *conn) { if (nread <= 0) { /* I/O error... */ - serverLog(LL_DEBUG, "I/O error reading from node link: %s", + serverLog(LL_DEBUG, "I/O error reading from node link (%.40s:%s): %s", + link->node ? link->node->name : "", + link->inbound ? "inbound" : "outbound", (nread == 0) ? "connection closed" : connGetLastError(conn)); handleLinkIOError(link); return; @@ -3928,6 +3954,12 @@ void clusterSetGossipEntry(clusterMsg *hdr, int i, clusterNode *n) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip information. */ void clusterSendPing(clusterLink *link, int type) { + serverLog(LL_DEBUG, "Sending %s packet to node %.40s (%s) on %s link", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "", + link->node ? link->node->human_nodename : "", + link->inbound ? "inbound" : "outbound"); + static unsigned long long cluster_pings_sent = 0; cluster_pings_sent++; int gossipcount = 0; /* Number of gossip sections added so far. */ @@ -4943,6 +4975,15 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ clusterDelNode(node); return 1; } + if (node->link != NULL && node->inbound_link == NULL && nodeInNormalState(node) && + now - node->inbound_link_freed_time > handshake_timeout) { + /* Node has an outbound link, but no inbound link for more than the handshake timeout. + * This probably means this node does not know us yet, whereas we know it. + * So we send it a MEET packet to do a handshake with it and correct the inconsistent cluster view. 
*/ + node->flags |= CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Sending MEET packet to node %.40s because there is no inbound link for it", node->name); + clusterSendPing(node->link, CLUSTERMSG_TYPE_MEET); + } if (node->link == NULL) { clusterLink *link = createClusterLink(node); diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5595402a4d..fb317038d6 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -61,12 +61,14 @@ typedef struct clusterLink { #define nodeIsPrimary(n) ((n)->flags & CLUSTER_NODE_PRIMARY) #define nodeIsReplica(n) ((n)->flags & CLUSTER_NODE_REPLICA) #define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE) +#define nodeInMeetState(n) ((n)->flags & CLUSTER_NODE_MEET) #define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR)) #define nodeTimedOut(n) ((n)->flags & CLUSTER_NODE_PFAIL) #define nodeFailed(n) ((n)->flags & CLUSTER_NODE_FAIL) #define nodeCantFailover(n) ((n)->flags & CLUSTER_NODE_NOFAILOVER) #define nodeSupportsExtensions(n) ((n)->flags & CLUSTER_NODE_EXTENSIONS_SUPPORTED) #define nodeSupportsLightMsgHdr(n) ((n)->flags & CLUSTER_NODE_LIGHT_HDR_SUPPORTED) +#define nodeInNormalState(n) (!((n)->flags & (CLUSTER_NODE_HANDSHAKE | CLUSTER_NODE_MEET | CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) /* This structure represent elements of node->fail_reports. */ typedef struct clusterNodeFailReport { @@ -343,6 +345,8 @@ struct _clusterNode { * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ + mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. + If it was never freed, it is the same as ctime */ long long repl_offset; /* Last known repl offset for this node. */ char ip[NET_IP_STR_LEN]; /* Latest known IP address of this node */ sds announce_client_ipv4; /* IPv4 for clients only. */ diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 686f00071b..4f641c5e96 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -323,6 +323,15 @@ proc get_cluster_nodes {id {status "*"}} { return $nodes } +# Returns the parsed myself node entry as a dictionary. +proc get_myself id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n myself]} {return $n} + } + return {} +} + # Returns 1 if no node knows node_id, 0 if any node knows it. 
proc node_is_forgotten {node_id} { for {set j 0} {$j < [llength $::servers]} {incr j} { diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl index 059f03fbe4..0b5f769930 100644 --- a/tests/unit/cluster/cluster-multiple-meets.tcl +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -58,7 +58,7 @@ tags {tls:skip external:skip cluster} { } else { fail "Node 1 recognizes node 0 even though it drops PONGs from node 0" } - assert {[llength [get_cluster_nodes 0 connected]] == 2} + assert {[llength [get_cluster_nodes 0]] == 2} # Drop incoming and outgoing links from/to 1 R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID] @@ -77,6 +77,8 @@ tags {tls:skip external:skip cluster} { # Both a and b will turn to cluster state ok wait_for_condition 1000 50 { [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [llength [get_cluster_nodes 0 connected]] == 2 && + [llength [get_cluster_nodes 1 connected]] == 2 && [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] } else { fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl index 45f5a6dc89..f189e96d5b 100644 --- a/tests/unit/cluster/cluster-reliable-meet.tcl +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -3,6 +3,12 @@ set old_singledb $::singledb set ::singledb 1 tags {tls:skip external:skip cluster} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + set base_conf [list cluster-enabled yes] start_multiple_servers 2 [list overrides $base_conf] { test "Cluster nodes are reachable" { @@ -22,9 +28,6 @@ tags {tls:skip external:skip cluster} { wait_for_cluster_state fail } - set CLUSTER_PACKET_TYPE_MEET 2 - set CLUSTER_PACKET_TYPE_NONE -1 - test "Cluster nodes haven't met each other" { assert {[llength [get_cluster_nodes 1]] == 1} assert {[llength [get_cluster_nodes 0]] == 1} @@ -75,3 +78,202 @@ tags {tls:skip external:skip cluster} { set ::singledb $old_singledb +proc cluster_get_first_node_in_handshake id { + set nodes [get_cluster_nodes $id] + foreach n $nodes { + if {[cluster_has_flag $n handshake]} { + return [dict get $n id] + } + } + return {} +} + +proc cluster_nodes_all_know_each_other {num_nodes} { + # Collect node IDs dynamically + set node_ids {} + for {set i 0} {$i < $num_nodes} {incr i} { + lappend node_ids [dict get [get_myself $i] id] + } + + # Check if all nodes know each other + foreach node_id $node_ids { + foreach check_node_id $node_ids { + for {set node_index 0} {$node_index < $num_nodes} {incr node_index} { + if {[cluster_get_node_by_id $node_index $check_node_id] == {}} { + return 0 + } + } + } + } + + # Verify cluster link counts for each node + set expected_links [expr {2 * ($num_nodes - 1)}] + for {set i 0} {$i < $num_nodes} {incr i} { + if {[llength [R $i CLUSTER LINKS]] != $expected_links} { + return 0 + } + } + + return 1 +} + +start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} { + set CLUSTER_PACKET_TYPE_PING 0 + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + test "Handshake eventually succeeds after node handshake timeout on both sides with inconsistent view of the cluster" { + set cluster_port [find_available_port 
$::baseport $::portcount]
+        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
+            # In this test we will trigger a handshake timeout on both sides of the handshake.
+            # Node 1 and 2 already know each other, then we make node 1 meet node 0:
+            #
+            # Node 1 -- MEET -> Node 0 [Node 0 might learn about Node 2 from the gossip section of the msg]
+            # Node 1 <- PONG -- Node 0 [we drop this message, so Node 1 will eventually mark the handshake as timed out]
+            # Node 1 <- PING -- Node 0 [we drop this message, so Node 1 will never send a PONG and Node 0 will eventually mark the handshake as timed out]
+            #
+            # After the handshake is timed out, we allow all cluster bus messages to go through.
+            # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake.
+
+            set node0_id [dict get [get_myself 0] id]
+            set node1_id [dict get [get_myself 1] id]
+            set node2_id [dict get [get_myself 2] id]
+
+            # Drop all cluster bus messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL
+            # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2.
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET
+
+            R 1 CLUSTER MEET [srv 0 host] [srv 0 port] $cluster_port
+
+            # Wait for Node 0 to be in handshake
+            wait_for_condition 10 400 {
+                [cluster_get_first_node_in_handshake 0] != {}
+            } else {
+                fail "Node 0 never entered handshake state"
+            }
+
+            # We want Node 0 to learn about Node 2 through the gossip section of the MEET message
+            set meet_retry 0
+            while {[cluster_get_node_by_id 0 $node2_id] eq {}} {
+                if {$meet_retry == 10} {
+                    error "assertion: Retried to meet Node 0 too many times"
+                }
+                # If Node 0 doesn't know about Node 2 yet, it means Node 1 did not gossip about Node 2 in its MEET message.
+                # So we kill the outbound link from Node 1 to Node 0, to force a reconnect and a re-send of the MEET message.
+                after 100
+                # Since we are in handshake, we use a randomly generated ID we have to find
+                R 1 DEBUG CLUSTERLINK KILL ALL [cluster_get_first_node_in_handshake 1]
+                incr meet_retry 1
+            }
+
+            # Wait for Node 1's handshake to timeout
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 1] eq {}
+            } else {
+                fail "Node 1 never exited handshake state"
+            }
+
+            # Wait for Node 0's handshake to timeout
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 0] eq {}
+            } else {
+                fail "Node 0 never exited handshake state"
+            }
+
+            # At this point Node 0 knows Node 1 & 2 through the gossip, but they don't know Node 0.
+            wait_for_condition 50 100 {
+                [cluster_get_node_by_id 0 $node1_id] != {} &&
+                [cluster_get_node_by_id 0 $node2_id] != {} &&
+                [cluster_get_node_by_id 1 $node0_id] eq {} &&
+                [cluster_get_node_by_id 2 $node0_id] eq {}
+            } else {
+                fail "Unexpected CLUSTER NODES output, nodes 1 & 2 should not know node 0."
+            }
+
+            # Allow all messages to go through again
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+
+            # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link.
+            # Handshake should now complete successfully.
+            wait_for_condition 50 200 {
+                [cluster_nodes_all_know_each_other 3]
+            } else {
+                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
+            }
+        } ;# stop Node 0
+    } ;# test
+} ;# stop cluster
+
+start_cluster 2 0 {tags {external:skip cluster} overrides {cluster-node-timeout 4000 cluster-replica-no-failover yes}} {
+    set CLUSTER_PACKET_TYPE_PING 0
+    set CLUSTER_PACKET_TYPE_PONG 1
+    set CLUSTER_PACKET_TYPE_MEET 2
+    set CLUSTER_PACKET_TYPE_NONE -1
+    set CLUSTER_PACKET_TYPE_ALL -2
+
+    test "Handshake eventually succeeds after node handshake timeout on one side with inconsistent view of the cluster" {
+        set cluster_port [find_available_port $::baseport $::portcount]
+        start_server [list overrides [list cluster-enabled yes cluster-node-timeout 4000 cluster-port $cluster_port]] {
+            # In this test we will trigger a handshake timeout on one side of the handshake.
+            # Node 1 and 2 already know each other, then we make node 0 meet node 1:
+            #
+            # Node 0 -- MEET -> Node 1
+            # Node 0 <- PONG -- Node 1
+            # Node 0 <- PING -- Node 1 [Node 0 will mark the handshake as successful]
+            # Node 0 -- PONG -> Node 1 [we drop this message, so node 1 will eventually mark the handshake as timed out]
+            #
+            # After the handshake is timed out, we allow all cluster bus messages to go through.
+            # Eventually Node 0 should send a MEET packet to the other nodes to complete the handshake.
+
+            set node0_id [dict get [get_myself 0] id]
+            set node1_id [dict get [get_myself 1] id]
+            set node2_id [dict get [get_myself 2] id]
+
+            # Drop PONG messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG
+            # Drop MEET cluster bus messages, so that Node 0 cannot start a handshake with Node 2.
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET
+
+            # Node 0 meets node 1
+            R 0 CLUSTER MEET [srv -1 host] [srv -1 port]
+
+            # Wait for node 0 to know about the other nodes in the cluster
+            wait_for_condition 50 100 {
+                [cluster_get_node_by_id 0 $node1_id] != {}
+            } else {
+                fail "Node 0 never learned about node 1"
+            }
+            # At this point, node 0 knows about node 1 and might know node 2 if node 1 gossiped about it.
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 0] eq {}
+            } else {
+                fail "Node 0 never exited handshake state"
+            }
+            # At this point, from node 0's point of view, the handshake with node 1 succeeded.
+
+            wait_for_condition 50 100 {
+                [cluster_get_first_node_in_handshake 1] eq {}
+            } else {
+                fail "Node 1 never exited handshake state"
+            }
+            assert {[cluster_get_node_by_id 1 $node0_id] eq {}}
+            # At this point, from node 1's point of view, the handshake with node 0 timed out.
+
+            # Allow all messages
+            R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+            R 2 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+
+            # Now Node 0 will send a MEET packet to Node 1 & 2 since it has an outbound link to these nodes but no inbound link.
+            # Handshake should now complete successfully.
+            wait_for_condition 50 200 {
+                [cluster_nodes_all_know_each_other 3]
+            } else {
+                fail "Unexpected CLUSTER NODES output, all nodes should know each other."
+            }
+        } ;# stop Node 0
+    } ;# test
+} ;# stop cluster

From 2d924045223a9f2396b6a08a939b66e2fe5c5d65 Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Thu, 12 Dec 2024 23:52:58 +0200
Subject: [PATCH 39/73] Avoid defragging scripts during EVAL command execution
 (#1414)

This can happen when scripts are running for long periods of time and the
server attempts to defrag them in the whileBlockedCron.
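For reference, the stage callback reads as follows once the hunk below is
applied (assembled from src/defrag.c as patched; nothing beyond the diff
is implied):

    static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdata) {
        UNUSED(target);
        UNUSED(privdata);
        if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization
        /* In case we are in the process of running some script we do not want
         * to replace the script being run, so we just bail out without really
         * defragging here. */
        if (scriptIsRunning()) return DEFRAG_DONE;
        activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT);
        return DEFRAG_DONE;
    }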
Signed-off-by: Ran Shidlansik
---
 src/defrag.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/defrag.c b/src/defrag.c
index 2fa067f0dc..be7ff07510 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -34,6 +34,7 @@
  */

 #include "server.h"
+#include "script.h"
 #include <stddef.h>

 #ifdef HAVE_DEFRAG
@@ -1050,6 +1051,9 @@ static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdat
     UNUSED(target);
     UNUSED(privdata);
     if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization
+    /* In case we are in the process of running some script we do not want
+     * to replace the script being run, so we just bail out without really
+     * defragging here. */
+    if (scriptIsRunning()) return DEFRAG_DONE;
     activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT);
     return DEFRAG_DONE;
 }

From ab69a8a55dee8738dc0bf005c4cb37a259b12053 Mon Sep 17 00:00:00 2001
From: Vu Diep <54611122+vudiep411@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:42:52 -0800
Subject: [PATCH 40/73] Use `configure-aws-credentials` workflow instead of
 passing `secret_access_key` (#1363)

## Summary
This PR fixes #1346 where we can get rid of the long term credentials by
using OpenID Connect. OpenID Connect (OIDC) allows your GitHub Actions
workflows to access resources in Amazon Web Services (AWS), without
needing to store the AWS credentials as long-lived GitHub secrets.

---------

Signed-off-by: vudiep411
---
 .github/workflows/build-release-packages.yml | 43 +++++++++++++------
 .../call-build-linux-arm-packages.yml        | 39 ++++++++---------
 .../call-build-linux-x86-packages.yml        | 39 ++++++++---------
 3 files changed, 65 insertions(+), 56 deletions(-)

diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml
index 094d82de08..44e012d658 100644
--- a/.github/workflows/build-release-packages.yml
+++ b/.github/workflows/build-release-packages.yml
@@ -3,7 +3,12 @@ name: Build Release Packages
 on:
   release:
     types: [published]
-
+  push:
+    paths:
+      - '.github/workflows/build-release-packages.yml'
+      - '.github/workflows/call-build-linux-arm-packages.yml'
+      - '.github/workflows/call-build-linux-x86_64-packages.yml'
+      - 'utils/releasetools/build-config.json'
   workflow_dispatch:
     inputs:
       version:
@@ -11,6 +16,7 @@ on:
         required: true

 permissions:
+  id-token: write
   contents: read

 jobs:
@@ -20,8 +26,8 @@ jobs:
     runs-on: ubuntu-latest
     outputs:
       version: ${{ steps.get_version.outputs.VERSION }}
+      is_test: ${{ steps.check-if-testing.outputs.IS_TEST }}
     steps:
-
       - run: |
           echo "Version: ${{ inputs.version || github.ref_name }}"
         shell: bash
@@ -32,8 +38,13 @@ jobs:
       - name: Get the version
         id: get_version
         run: |
-          VERSION="${INPUT_VERSION}"
+          if [[ "${{ github.event_name }}" == "push" ]]; then
+            VERSION=${{ github.ref_name }}
+          else
+            VERSION="${INPUT_VERSION}"
+          fi
           if [ -z "${VERSION}" ]; then
+            echo "Error: No version specified"
             exit 1
           fi
           echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
@@ -43,6 +54,16 @@ jobs:
         # only ever be a tag
         INPUT_VERSION: ${{ inputs.version || github.ref_name }}

+      - name: Check if we are testing
+        id: check-if-testing
+        run: |
+          if [[ "${{ github.event_name }}" == "push" ]]; then
+            echo "IS_TEST=true" >> $GITHUB_OUTPUT
+          else
+            echo "IS_TEST=false" >> $GITHUB_OUTPUT
+          fi
+        shell: bash
+
   generate-build-matrix:
     name: Generating build matrix
     runs-on: ubuntu-latest
@@ -56,7 +77,7 @@ jobs:
       - uses: ./.github/actions/generate-package-build-matrix
         id: set-matrix
         with:
-          ref: ${{ inputs.version || github.ref_name }}
+          ref: ${{ needs.release-build-get-meta.outputs.version }}
release-build-linux-x86-packages: needs: @@ -67,11 +88,10 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} release-build-linux-arm-packages: needs: @@ -82,8 +102,7 @@ jobs: version: ${{ needs.release-build-get-meta.outputs.version }} ref: ${{ inputs.version || github.ref_name }} build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }} + region: us-west-2 secrets: - token: ${{ secrets.GITHUB_TOKEN }} - bucket: ${{ secrets.AWS_S3_BUCKET }} - access_key_id: ${{ secrets.AWS_S3_ACCESS_KEY_ID }} - secret_access_key: ${{ secrets.AWS_S3_ACCESS_KEY }} + bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }} + role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }} diff --git a/.github/workflows/call-build-linux-arm-packages.yml b/.github/workflows/call-build-linux-arm-packages.yml index 2a7bcc533f..65445a83c8 100644 --- a/.github/workflows/call-build-linux-arm-packages.yml +++ b/.github/workflows/call-build-linux-arm-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. type: string required: true + region: + description: The AWS region to push packages into. + type: string + required: true secrets: - token: - description: The Github token or similar to authenticate with. + bucket_name: + description: The S3 bucket to push packages into. + required: true + role_to_assume: + description: The role to assume for the S3 bucket. required: true - bucket: - description: The name of the S3 bucket to push packages into. - required: false - access_key_id: - description: The S3 access key id for the bucket. - required: false - secret_access_key: - description: The S3 secret access key for the bucket. - required: false permissions: + id-token: write contents: read jobs: @@ -46,6 +45,12 @@ jobs: with: ref: ${{ inputs.version }} + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: ${{ inputs.region }} + role-to-assume: ${{ secrets.role_to_assume }} + - name: Make Valkey uses: uraimo/run-on-arch-action@v2 with: @@ -65,15 +70,5 @@ jobs: mkdir -p packages-files cp -rfv $TAR_FILE_NAME.tar* packages-files/ - - name: Install AWS cli. - run: | - sudo apt-get install -y awscli - - - name: Configure AWS credentials - run: | - aws configure set region us-west-2 - aws configure set aws_access_key_id ${{ secrets.access_key_id }} - aws configure set aws_secret_access_key ${{ secrets.secret_access_key }} - - name: Sync to S3 - run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/ + run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/ diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index 9e438fa61a..a603c53c13 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -15,21 +15,20 @@ on: description: The build targets to produce as a JSON matrix. 
        type: string
        required: true
+      region:
+        description: The AWS region to upload the packages to.
+        type: string
+        required: true
   secrets:
-    token:
-      description: The Github token or similar to authenticate with.
+    bucket_name:
+      description: The name of the S3 bucket to upload the packages to.
+      required: true
+    role_to_assume:
+      description: The role to assume for the S3 bucket.
       required: true
-    bucket:
-      description: The name of the S3 bucket to push packages into.
-      required: false
-    access_key_id:
-      description: The S3 access key id for the bucket.
-      required: false
-    secret_access_key:
-      description: The S3 secret access key for the bucket.
-      required: false

 permissions:
+  id-token: write
   contents: read

 jobs:
@@ -46,6 +45,12 @@ jobs:
       with:
         ref: ${{ inputs.version }}

+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-region: ${{ inputs.region }}
+          role-to-assume: ${{ secrets.role_to_assume }}
+
       - name: Install dependencies
         run: sudo apt-get update && sudo apt-get install -y build-essential libssl-dev libsystemd-dev

@@ -63,15 +68,5 @@ jobs:
           mkdir -p packages-files
           cp -rfv $TAR_FILE_NAME.tar* packages-files/

-      - name: Install AWS cli.
-        run: |
-          sudo apt-get install -y awscli
-
-      - name: Configure AWS credentials
-        run: |
-          aws configure set region us-west-2
-          aws configure set aws_access_key_id ${{ secrets.access_key_id }}
-          aws configure set aws_secret_access_key ${{ secrets.secret_access_key }}
-
       - name: Sync to S3
-        run: aws s3 sync packages-files s3://${{secrets.bucket}}/releases/
+        run: aws s3 sync packages-files s3://${{ secrets.bucket_name }}/releases/

From 3a1043a4f0fa97daa31f4bd2b3714a07736ac1b6 Mon Sep 17 00:00:00 2001
From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com>
Date: Thu, 12 Dec 2024 14:46:35 -0800
Subject: [PATCH 41/73] Fix Valkey binary build workflow, version support
 changes. (#1429)

This change makes the binary build on the target ubuntu version.
This PR also deprecated ubuntu18, and valkey will now support:
- X86:
  - Ubuntu 20
  - Ubuntu 22
  - Ubuntu 24
- ARM:
  - Ubuntu 20
  - Ubuntu 22

Removed ARM ubuntu 24 as the action we are using for ARM builds does not
support Ubuntu 24.
--------- Signed-off-by: Roshan Khatri --- .../call-build-linux-x86-packages.yml | 2 +- utils/releasetools/build-config.json | 21 +++++++------------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.github/workflows/call-build-linux-x86-packages.yml b/.github/workflows/call-build-linux-x86-packages.yml index a603c53c13..4e68bf85f0 100644 --- a/.github/workflows/call-build-linux-x86-packages.yml +++ b/.github/workflows/call-build-linux-x86-packages.yml @@ -35,7 +35,7 @@ jobs: build-valkey: # Capture source tarball and generate checksum for it name: Build package ${{ matrix.distro.target }} ${{ matrix.distro.arch }} - runs-on: "ubuntu-latest" + runs-on: ${{matrix.distro.target}} strategy: fail-fast: false matrix: ${{ fromJSON(inputs.build_matrix) }} diff --git a/utils/releasetools/build-config.json b/utils/releasetools/build-config.json index 5e39fae70f..f64bf601ca 100644 --- a/utils/releasetools/build-config.json +++ b/utils/releasetools/build-config.json @@ -1,29 +1,24 @@ { "linux_targets": [ + { "arch": "x86_64", - "target": "ubuntu18.04", + "target": "ubuntu-20.04", "type": "deb", - "platform": "bionic" + "platform": "focal" }, { "arch": "x86_64", - "target": "ubuntu20.04", + "target": "ubuntu-22.04", "type": "deb", - "platform": "focal" + "platform": "jammy" }, { "arch": "x86_64", - "target": "ubuntu24.04", + "target": "ubuntu-24.04", "type": "deb", "platform": "noble" }, - { - "arch": "arm64", - "target": "ubuntu18.04", - "type": "deb", - "platform": "bionic" - }, { "arch": "arm64", "target": "ubuntu20.04", @@ -32,9 +27,9 @@ }, { "arch": "arm64", - "target": "ubuntu24.04", + "target": "ubuntu22.04", "type": "deb", - "platform": "noble" + "platform": "jammy" } ] } \ No newline at end of file From 32f2c73cb5c93516ac5abc5259023caf75326b6a Mon Sep 17 00:00:00 2001 From: Jim Brunner Date: Thu, 12 Dec 2024 14:55:57 -0800 Subject: [PATCH 42/73] defrag: eliminate persistent kvstore pointer and edge case fixes (#1430) This update addresses several issues in defrag: 1. In the defrag redesign (https://github.com/valkey-io/valkey/pull/1242), a bug was introduced where `server.cronloops` was no longer being incremented in the `whileBlockedCron()`. This resulted in some memory statistics not being updated while blocked. 2. In the test case for AOF loading, we were seeing errors due to defrag latencies. However, running the math, the latencies are justified given the extremely high CPU target of the testcase. Adjusted the expected latency check to allow longer latencies for this case where defrag is undergoing starvation while AOF loading is in progress. 3. A "stage" is passed a "target". For the main dictionary and expires, we were passing in a `kvstore*`. However, on flushall or swapdb, the pointer may change. It's safer and more stable to use an index for the DB (a DBID). Then if the pointer changes, we can detect the change, and simply abort the stage. (If there's still fragmentation to deal with, we'll pick it up again on the next cycle.) 4. We always start a new stage on a new defrag cycle. This gives the new stage time to run, and prevents latency issues for certain stages which don't operate incrementally. However, often several stages will require almost no work, and this will leave a chunk of our CPU allotment unused. This is mainly an issue in starvation situations (like AOF loading or LUA script) - where defrag is running infrequently, with a large duty-cycle. This change allows a new stage to be initiated if we still have a standard duty-cycle remaining. 
(This can happen during starvation situations where the planned duty cycle is larger than the standard cycle. Most likely this isn't a concern for real scenarios, but it was observed in testing.) 5. Minor comment correction in `server.h` Signed-off-by: Jim Brunner --- src/defrag.c | 67 ++++++++++++++++++++++-------------- src/server.c | 6 ++++ src/server.h | 3 +- tests/unit/memefficiency.tcl | 8 +++-- 4 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index be7ff07510..8c1ad29de2 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -121,7 +121,7 @@ typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdat // Private data for main dictionary keys typedef struct { kvstoreIterState kvstate; - serverDb *db; + int dbid; } defragKeysCtx; static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); @@ -736,7 +736,7 @@ static void defragModule(serverDb *db, robj *obj) { /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ static void defragKey(defragKeysCtx *ctx, robj **elemref) { - serverDb *db = ctx->db; + serverDb *db = &server.db[ctx->dbid]; int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; @@ -920,7 +920,7 @@ static doneStatus defragLaterStep(monotime endtime, void *privdata) { robj *ob = found; long long key_defragged = server.stat_active_defrag_hits; - bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->db->id) == 1); + bool timeout = (defragLaterItem(ob, &defrag_later_cursor, endtime, ctx->dbid) == 1); if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; } else { @@ -963,7 +963,10 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, state.cursor = 0; return DEFRAG_NOT_DONE; } - serverAssert(kvs == state.kvs); // Shouldn't change during the stage + if (kvs != state.kvs) { + // There has been a change of the kvs (flushdb, swapdb, etc.). Just complete the stage. + return DEFRAG_DONE; + } unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; @@ -1013,26 +1016,30 @@ static doneStatus defragStageKvstoreHelper(monotime endtime, } -// Note: target is a DB, (not a KVS like most stages) +// Target is a DBID static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - serverDb *db = (serverDb *)target; + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; static defragKeysCtx ctx; // STATIC - this persists if (endtime == 0) { - ctx.db = db; + ctx.dbid = dbid; // Don't return yet. Call the helper with endtime==0 below. 
} - serverAssert(ctx.db == db); + serverAssert(ctx.dbid == dbid); return defragStageKvstoreHelper(endtime, db->keys, dbKeysScanCallback, defragLaterStep, &ctx); } +// Target is a DBID static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { UNUSED(privdata); - return defragStageKvstoreHelper(endtime, (kvstore *)target, + int dbid = (uintptr_t)target; + serverDb *db = &server.db[dbid]; + return defragStageKvstoreHelper(endtime, db->expires, scanHashtableCallbackCountScanned, NULL, NULL); } @@ -1226,29 +1233,38 @@ static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long i } monotime starttime = getMonotonicUs(); - monotime endtime = starttime + computeDefragCycleUs(); + int dutyCycleUs = computeDefragCycleUs(); + monotime endtime = starttime + dutyCycleUs; + bool haveMoreWork = true; mstime_t latency; latencyStartMonitor(latency); - if (!defrag.current_stage) { - defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); - listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); - // Initialize the stage with endtime==0 - doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); - serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE - } + do { + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } - doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); - if (status == DEFRAG_DONE) { - zfree(defrag.current_stage); - defrag.current_stage = NULL; - } + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + + haveMoreWork = (defrag.current_stage || listLength(defrag.remaining_stages) > 0); + /* If we've completed a stage early, and still have a standard time allotment remaining, + * we'll start another stage. This can happen when defrag is running infrequently, and + * starvation protection has increased the duty-cycle. 
*/ + } while (haveMoreWork && getMonotonicUs() <= endtime - server.active_defrag_cycle_us); latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); - if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + if (haveMoreWork) { return computeDelayMs(endtime); } else { endDefragCycle(true); @@ -1287,9 +1303,8 @@ static void beginDefragCycle(void) { defrag.remaining_stages = listCreate(); for (int dbid = 0; dbid < server.dbnum; dbid++) { - serverDb *db = &server.db[dbid]; - addDefragStage(defragStageDbKeys, db, NULL); - addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + addDefragStage(defragStageDbKeys, (void *)(uintptr_t)dbid, NULL); + addDefragStage(defragStageExpiresKvstore, (void *)(uintptr_t)dbid, NULL); } static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; diff --git a/src/server.c b/src/server.c index 1e38b5ac69..8e65b1f5cd 100644 --- a/src/server.c +++ b/src/server.c @@ -1669,6 +1669,12 @@ void whileBlockedCron(void) { * latency monitor if this function is called too often. */ if (server.blocked_last_cron >= server.mstime) return; + /* Increment server.cronloops so that run_with_period works. */ + long hz_ms = 1000 / server.hz; + int cronloops = (server.mstime - server.blocked_last_cron + (hz_ms - 1)) / hz_ms; // rounding up + server.blocked_last_cron += cronloops * hz_ms; + server.cronloops += cronloops; + mstime_t latency; latencyStartMonitor(latency); diff --git a/src/server.h b/src/server.h index 14a16593b0..e9332233aa 100644 --- a/src/server.h +++ b/src/server.h @@ -1900,8 +1900,7 @@ struct valkeyServer { int sanitize_dump_payload; /* Enables deep sanitization for ziplist and listpack in RDB and RESTORE. */ int skip_checksum_validation; /* Disable checksum validation for RDB and RESTORE payload. */ int jemalloc_bg_thread; /* Enable jemalloc background thread */ - int active_defrag_configuration_changed; /* defrag configuration has been changed and need to reconsider - * active_defrag_running in computeDefragCycles. */ + int active_defrag_configuration_changed; /* Config changed; need to recompute active_defrag_cpu_percent. */ size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index ce74b7c618..78a68a682d 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -172,10 +172,12 @@ run_solo {defrag} { # make sure the defragger did enough work to keep the fragmentation low during loading. # we cannot check that it went all the way down, since we don't wait for full defrag cycle to complete. assert {$frag < 1.4} - # since the AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands), - # it'll still not block the loading for long periods of time. + # The AOF contains simple (fast) SET commands (and the cron during loading runs every 1024 commands). + # Even so, defrag can get starved for periods exceeding 100ms. Using 200ms for test stability, and + # a 75% CPU requirement (as set above), we should allow up to 600ms latency + # (as total time = 200 non duty + 600 duty = 800ms, and 75% of 800ms is 600ms). 
if {!$::no_latency} { - assert {$max_latency <= 40} + assert {$max_latency <= 600} } } } ;# Active defrag - AOF loading From b60097ba073139ca62fdb59d15cc09737d114add Mon Sep 17 00:00:00 2001 From: Thalia Archibald Date: Fri, 13 Dec 2024 02:05:19 -0800 Subject: [PATCH 43/73] Check length before reading in `stringmatchlen` (#1431) Fixes four cases where `stringmatchlen` could overrun the pattern if it is not terminated with NUL. These commits are cherry-picked from my [fork](https://github.com/thaliaarchi/antirez-stringmatch) which extracts `stringmatch` as a library and compares it to other projects by antirez which use the same matcher. Signed-off-by: Thalia Archibald --- src/util.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/util.c b/src/util.c index 0b7af2d3fa..6d99d47e5a 100644 --- a/src/util.c +++ b/src/util.c @@ -104,23 +104,23 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; - not_op = pattern[0] == '^'; + not_op = patternLen && pattern[0] == '^'; if (not_op) { pattern++; patternLen--; } match = 0; while (1) { - if (pattern[0] == '\\' && patternLen >= 2) { + if (patternLen >= 2 && pattern[0] == '\\') { pattern++; patternLen--; if (pattern[0] == string[0]) match = 1; - } else if (pattern[0] == ']') { - break; } else if (patternLen == 0) { pattern--; patternLen++; break; + } else if (pattern[0] == ']') { + break; } else if (patternLen >= 3 && pattern[1] == '-') { int start = pattern[0]; int end = pattern[2]; @@ -173,7 +173,7 @@ static int stringmatchlen_impl(const char *pattern, pattern++; patternLen--; if (stringLen == 0) { - while (*pattern == '*') { + while (patternLen && *pattern == '*') { pattern++; patternLen--; } From d588bb440668056a5bc64251739b9349dbe719b3 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 05:32:54 +0800 Subject: [PATCH 44/73] Skip build-release-packages CI job in forks (#1438) The CI job was introduced in #1363, we should skip it in forks. Signed-off-by: Binbin --- .github/workflows/build-release-packages.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 44e012d658..c7d5c8fe54 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -23,6 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. release-build-get-meta: name: Get metadata to build + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} @@ -66,6 +67,7 @@ jobs: generate-build-matrix: name: Generating build matrix + if: github.repository == 'valkey-io/valkey' runs-on: ubuntu-latest outputs: x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }} From 7d72fada2c349eb5b91c7e33099adc167abb0e99 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 14 Dec 2024 06:26:20 +0800 Subject: [PATCH 45/73] Fix wrong file name in build-release-packages.yml (#1437) Introduced in #1363, the file name does not match. 
Signed-off-by: Binbin
---
 .github/workflows/build-release-packages.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml
index c7d5c8fe54..6c54971bcd 100644
--- a/.github/workflows/build-release-packages.yml
+++ b/.github/workflows/build-release-packages.yml
@@ -7,7 +7,7 @@ on:
     paths:
       - '.github/workflows/build-release-packages.yml'
       - '.github/workflows/call-build-linux-arm-packages.yml'
-      - '.github/workflows/call-build-linux-x86_64-packages.yml'
+      - '.github/workflows/call-build-linux-x86-packages.yml'
       - 'utils/releasetools/build-config.json'
   workflow_dispatch:
     inputs:

From 3cd176dc3908e89b3178d9d031f58339c242325e Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Sat, 14 Dec 2024 10:13:04 -0800
Subject: [PATCH 46/73] Avoid importing memory aligned malloc (#1442)

We deprecate the usage of classic malloc and free, but under certain
circumstances they might get imported from intrinsics. The original
thought was that we should just override malloc and free to use zmalloc
and zfree, but I think we should continue to deprecate them to avoid
accidental imports of allocations.

Closes https://github.com/valkey-io/valkey/issues/1434.

---------

Signed-off-by: Madelyn Olson
---
 src/hyperloglog.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index f0390b3e1e..6056bc0098 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -36,6 +36,9 @@
 #include <math.h>

 #ifdef HAVE_AVX2
+/* Define __MM_MALLOC_H to prevent importing the memory aligned
+ * allocation functions, which we don't use. */
+#define __MM_MALLOC_H
 #include <immintrin.h>
 #endif

From 0e96bb311e35b07b1f85932752cad9b777037441 Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Sat, 14 Dec 2024 10:14:01 -0800
Subject: [PATCH 47/73] Synchronously delete data during defrag tests (#1443)

The creation of fragmentation is delayed when we use lazy-free. You can
induce some of the active-defrag tests to fail by artificially adding a
delay in the lazyfree process, similar to the issues seen in #1433 and
issues like
https://github.com/valkey-io/valkey/actions/runs/12267010712/job/34226304803#step:7:6538.
The solution is to always do sync free during tests.

Might close https://github.com/valkey-io/valkey/issues/1433.

Signed-off-by: Madelyn Olson
---
 tests/unit/memefficiency.tcl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl
index 78a68a682d..8f6e5e8dd3 100644
--- a/tests/unit/memefficiency.tcl
+++ b/tests/unit/memefficiency.tcl
@@ -47,6 +47,8 @@ run_solo {defrag} {
             r config set active-defrag-ignore-bytes 2mb
             r config set maxmemory 100mb
             r config set maxmemory-policy allkeys-lru
+            r config set lazyfree-lazy-user-del no
+            r config set lazyfree-lazy-user-flush no

             populate 700000 asdf1 150
             populate 100 asdf1 150 0 false 1000

From 88942c8e61b8180c6093010f9260c379e9762790 Mon Sep 17 00:00:00 2001
From: Rain Valentine
Date: Sat, 14 Dec 2024 11:53:48 -0800
Subject: [PATCH 48/73] Replace dict with new hashtable for sets datatype
 (#1176)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new `hashtable` provides faster lookups and uses less memory than
`dict`.

A TCL test case "SRANDMEMBER with a dict containing long chain" is
deleted because it's covered by a hashtable unit test
"test_random_entry_with_long_chain", which is already present.
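To give a feel for the converted call sites, iterating a hashtable-encoded
set now follows this pattern (a sketch distilled from the object.c and
rdb.c hunks below, not a new API surface):

    hashtable *ht = o->ptr; /* o->encoding == OBJ_ENCODING_HASHTABLE */
    hashtableIterator iter;
    hashtableInitIterator(&iter, ht);
    void *next;
    while (hashtableNext(&iter, &next)) {
        sds ele = next; /* set members are stored directly as sds strings */
        /* ... use ele ... */
    }
    hashtableResetIterator(&iter);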
This change also moves some logic from dismissMemory (object.c) to zmadvise_dontneed (zmalloc.c), so the hashtable implementation which needs the dismiss functionality doesn't need to depend on object.c and server.h. This PR follows #1186. --------- Signed-off-by: Rain Valentine Signed-off-by: Viktor Söderqvist Co-authored-by: Viktor Söderqvist --- src/db.c | 72 +++++++++++------ src/debug.c | 29 ++++--- src/defrag.c | 42 ++++++---- src/hashtable.c | 10 ++- src/hashtable.h | 3 +- src/lazyfree.c | 6 +- src/module.c | 39 ++++++--- src/object.c | 56 +++++++------ src/rdb.c | 42 +++++----- src/server.c | 57 +++++-------- src/server.h | 9 ++- src/t_set.c | 172 ++++++++++++++++++++-------------------- src/t_zset.c | 24 +++--- src/zmalloc.c | 19 ++++- src/zmalloc.h | 2 +- tests/unit/info.tcl | 8 +- tests/unit/type/set.tcl | 107 +------------------------ 17 files changed, 326 insertions(+), 371 deletions(-) diff --git a/src/db.c b/src/db.c index 2bd40ba74b..1223d00c8d 100644 --- a/src/db.c +++ b/src/db.c @@ -978,7 +978,7 @@ void keysScanCallback(void *privdata, void *entry) { /* This callback is used by scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ -void scanCallback(void *privdata, const dictEntry *de) { +void dictScanCallback(void *privdata, const dictEntry *de) { scanData *data = (scanData *)privdata; list *keys = data->keys; robj *o = data->o; @@ -998,9 +998,7 @@ void scanCallback(void *privdata, const dictEntry *de) { } } - if (o->type == OBJ_SET) { - key = keysds; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { key = keysds; if (!data->only_keys) { val = dictGetVal(de); @@ -1013,13 +1011,33 @@ void scanCallback(void *privdata, const dictEntry *de) { val = sdsnewlen(buf, len); } } else { - serverPanic("Type not handled in SCAN callback."); + serverPanic("Type not handled in dict SCAN callback."); } listAddNodeTail(keys, key); if (val) listAddNodeTail(keys, val); } +void hashtableScanCallback(void *privdata, void *entry) { + scanData *data = (scanData *)privdata; + robj *o = data->o; + list *keys = data->keys; + data->sampled++; + + /* currently only implemented for SET scan */ + serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE); + sds key = (sds)entry; /* Specific for OBJ_SET */ + + /* Filter element if it does not match the pattern. */ + if (data->pattern) { + if (!stringmatchlen(data->pattern, sdslen(data->pattern), key, sdslen(key), 0)) { + return; + } + } + + listAddNodeTail(keys, key); +} + /* Try to parse a SCAN cursor stored at object 'o': * if the cursor is valid, store it as unsigned integer into *cursor and * returns C_OK. Otherwise return C_ERR and send an error to the @@ -1083,7 +1101,6 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { sds typename = NULL; long long type = LLONG_MAX; int patlen = 0, use_pattern = 0, only_keys = 0; - dict *ht; /* Object must be NULL (to iterate keys names), or the type of the object * must be Set, Sorted Set, or Hash. */ @@ -1152,34 +1169,35 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * just return everything inside the object in a single call, setting the * cursor to zero to signal the end of the iteration. */ - /* Handle the case of a hash table. */ - ht = NULL; + /* Handle the case of kvstore, dict or hashtable. 
*/ + dict *dict_table = NULL; + hashtable *hashtable_table = NULL; + int shallow_copied_list_items = 0; if (o == NULL) { - ht = NULL; - } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + shallow_copied_list_items = 1; + } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { - ht = o->ptr; + dict_table = o->ptr; + shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - ht = zs->dict; + dict_table = zs->dict; + /* scanning ZSET allocates temporary strings even though it's a dict */ + shallow_copied_list_items = 0; } list *keys = listCreate(); - /* Set a free callback for the contents of the collected keys list. - * For the main keyspace dict, and when we scan a key that's dict encoded - * (we have 'ht'), we don't need to define free method because the strings - * in the list are just a shallow copy from the pointer in the dictEntry. - * When scanning a key with other encodings (e.g. listpack), we need to - * free the temporary strings we add to that list. - * The exception to the above is ZSET, where we do allocate temporary - * strings even when scanning a dict. */ - if (o && (!ht || o->type == OBJ_ZSET)) { + /* Set a free callback for the contents of the collected keys list if they + * are deep copied temporary strings. We must not free them if they are just + * a shallow copy - a pointer to the actual data in the data structure */ + if (!shallow_copied_list_items) { listSetFreeMethod(keys, (void (*)(void *))sdsfree); } - /* For main dictionary scan or data structure using hashtable. */ - if (!o || ht) { + /* For main hash table scan or scannable data structure. */ + if (!o || dict_table || hashtable_table) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1188,7 +1206,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { /* We pass scanData which have three pointers to the callback: * 1. data.keys: the list to which it will add new elements; - * 2. data.o: the object containing the dictionary so that + * 2. data.o: the object containing the hash table so that * it is possible to fetch more data in a type-dependent way; * 3. data.type: the specified type scan in the db, LLONG_MAX means * type matching is no needed; @@ -1219,8 +1237,10 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * If cursor is empty, we should try exploring next non-empty slot. 
 */
            if (o == NULL) {
                cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data);
+            } else if (dict_table) {
+                cursor = dictScan(dict_table, cursor, dictScanCallback, &data);
             } else {
-                cursor = dictScan(ht, cursor, scanCallback, &data);
+                cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data);
             }
         } while (cursor && maxiterations-- && data.sampled < count);
     } else if (o->type == OBJ_SET) {
diff --git a/src/debug.c b/src/debug.c
index d63d12f762..4efe12e237 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -916,30 +916,35 @@ void debugCommand(client *c) {
         addReplyVerbatim(c, stats, sdslen(stats), "txt");
         sdsfree(stats);
     } else if (!strcasecmp(c->argv[1]->ptr, "htstats-key") && c->argc >= 3) {
-        robj *o;
-        dict *ht = NULL;
         int full = 0;
-
         if (c->argc >= 4 && !strcasecmp(c->argv[3]->ptr, "full")) full = 1;
-        if ((o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr)) == NULL) return;
+        robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr);
+        if (o == NULL) return;

-        /* Get the hash table reference from the object, if possible. */
+        /* Get the dict reference from the object, if possible. */
+        dict *d = NULL;
+        hashtable *ht = NULL;
         switch (o->encoding) {
         case OBJ_ENCODING_SKIPLIST: {
             zset *zs = o->ptr;
-            ht = zs->dict;
+            d = zs->dict;
         } break;
-        case OBJ_ENCODING_HT: ht = o->ptr; break;
+        case OBJ_ENCODING_HT: d = o->ptr; break;
+        case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break;
         }

-        if (ht == NULL) {
-            addReplyError(c, "The value stored at the specified key is not "
-                             "represented using an hash table");
-        } else {
+        if (d != NULL) {
             char buf[4096];
-            dictGetStats(buf, sizeof(buf), ht, full);
+            dictGetStats(buf, sizeof(buf), d, full);
             addReplyVerbatim(c, buf, strlen(buf), "txt");
+        } else if (ht != NULL) {
+            char buf[4096];
+            hashtableGetStats(buf, sizeof(buf), ht, full);
+            addReplyVerbatim(c, buf, strlen(buf), "txt");
+        } else {
+            addReplyError(c, "The value stored at the specified key is not "
+                             "represented using an hash table");
         }
     } else if (!strcasecmp(c->argv[1]->ptr, "change-repl-id") && c->argc == 2) {
         serverLog(LL_NOTICE, "Changing replication IDs after receiving DEBUG change-repl-id");
diff --git a/src/defrag.c b/src/defrag.c
index 8c1ad29de2..8e7fc8449e 100644
--- a/src/defrag.c
+++ b/src/defrag.c
@@ -34,6 +34,7 @@
  */

 #include "server.h"
+#include "hashtable.h"
 #include "script.h"
 #include <stddef.h>

@@ -379,6 +380,20 @@ static void activeDefragSdsDict(dict *d, int val_type) {
     } while (cursor != 0);
 }

+void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) {
+    UNUSED(privdata);
+    sds *sds_ref = (sds *)entry_ref;
+    sds new_sds = activeDefragSds(*sds_ref);
+    if (new_sds != NULL) *sds_ref = new_sds;
+}
+
+void activeDefragSdsHashtable(hashtable *ht) {
+    unsigned long cursor = 0;
+    do {
+        cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF);
+    } while (cursor != 0);
+}
+
 /* Defrag a list of ptr, sds or robj string values */
 static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
     quicklistNode *newnode, *node = *node_ref;
@@ -497,11 +512,9 @@ static void scanCallbackCountScanned(void *privdata, const dictEntry *de) {
 }

 static void scanLaterSet(robj *ob, unsigned long *cursor) {
-    if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return;
-    dict *d = ob->ptr;
-    dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc,
-                                     .defragKey = (dictDefragAllocFunction *)activeDefragSds};
-    *cursor = dictScanDefrag(d, 
*cursor, scanCallbackCountScanned, &defragfns, NULL); + if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } static void scanLaterHash(robj *ob, unsigned long *cursor) { @@ -560,15 +573,16 @@ static void defragHash(robj *ob) { } static void defragSet(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + activeDefragSdsHashtable(ht); + } + /* defrag the hashtable struct and tables */ + hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (newHashtable) ob->ptr = newHashtable; } /* Defrag callback for radix tree iterator, called for each node, @@ -766,7 +780,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { serverPanic("Unknown list encoding"); } } else if (ob->type == OBJ_SET) { - if (ob->encoding == OBJ_ENCODING_HT) { + if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragSet(ob); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; diff --git a/src/hashtable.c b/src/hashtable.c index 9d963b9ddc..11ba360800 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -1023,7 +1023,7 @@ void *hashtableMetadata(hashtable *ht) { } /* Returns the number of entries stored. */ -size_t hashtableSize(hashtable *ht) { +size_t hashtableSize(const hashtable *ht) { return ht->used[0] + ht->used[1]; } @@ -1180,6 +1180,14 @@ hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)) { return ht1; } +/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've + * forked and memory won't be used again. See zmadvise_dontneed() */ +void dismissHashtable(hashtable *ht) { + for (int i = 0; i < 2; i++) { + zmadvise_dontneed(ht->tables[i], numBuckets(ht->bucket_exp[i]) * sizeof(bucket *)); + } +} + /* Returns 1 if an entry was found matching the key. Also points *found to it, * if found is provided. Returns 0 if no matching entry was found. 
*/ int hashtableFind(hashtable *ht, const void *key, void **found) { diff --git a/src/hashtable.h b/src/hashtable.h index 242531df8f..4291cf5a5d 100644 --- a/src/hashtable.h +++ b/src/hashtable.h @@ -108,7 +108,7 @@ void hashtableRelease(hashtable *ht); void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)); hashtableType *hashtableGetType(hashtable *ht); void *hashtableMetadata(hashtable *ht); -size_t hashtableSize(hashtable *ht); +size_t hashtableSize(const hashtable *ht); size_t hashtableBuckets(hashtable *ht); size_t hashtableChainedBuckets(hashtable *ht, int table); size_t hashtableMemUsage(hashtable *ht); @@ -123,6 +123,7 @@ int hashtableTryExpand(hashtable *ht, size_t size); int hashtableExpandIfNeeded(hashtable *ht); int hashtableShrinkIfNeeded(hashtable *ht); hashtable *hashtableDefragTables(hashtable *ht, void *(*defragfn)(void *)); +void dismissHashtable(hashtable *ht); /* Entries */ int hashtableFind(hashtable *ht, const void *key, void **found); diff --git a/src/lazyfree.c b/src/lazyfree.c index 14a4454d7a..4b4c7f06ad 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -116,9 +116,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { if (obj->type == OBJ_LIST && obj->encoding == OBJ_ENCODING_QUICKLIST) { quicklist *ql = obj->ptr; return ql->len; - } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_SET && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; diff --git a/src/module.c b/src/module.c index 9bcf68646e..36283e2c73 100644 --- a/src/module.c +++ b/src/module.c @@ -11017,20 +11017,20 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { +static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { ScanKeyCBData *data = privdata; sds key = dictGetKey(de); robj *o = data->key->value; robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; - if (o->type == OBJ_SET) { - value = NULL; - } else if (o->type == OBJ_HASH) { + if (o->type == OBJ_HASH) { sds val = dictGetVal(de); value = createStringObject(val, sdslen(val)); } else if (o->type == OBJ_ZSET) { double *val = (double *)dictGetVal(de); value = createStringObjectFromLongDouble(*val, 0); + } else { + serverPanic("unexpected object type"); } data->fn(data->key, field, value, data->user_data); @@ -11038,6 +11038,17 @@ static void moduleScanKeyCallback(void *privdata, const dictEntry *de) { if (value) decrRefCount(value); } +static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { + ScanKeyCBData *data = privdata; + robj *o = data->key->value; + serverAssert(o->type == OBJ_SET); + sds key = entry; + robj *field = createStringObject(key, sdslen(key)); + + data->fn(data->key, field, NULL, data->user_data); + decrRefCount(field); +} + /* Scan api that allows a module to scan the elements in a hash, set or sorted set key * * Callback for scan implementation. 
@@ -11091,14 +11102,15 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *ht = NULL; + dict *d = NULL; + hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) ht = o->ptr; + if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict; } else { errno = EINVAL; return 0; @@ -11108,9 +11120,16 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul return 0; } int ret = 1; - if (ht) { + if (d) { + ScanKeyCBData data = {key, privdata, fn}; + cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data); + if (cursor->cursor == 0) { + cursor->done = 1; + ret = 0; + } + } else if (ht) { ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(ht, cursor->cursor, moduleScanKeyCallback, &data); + cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { cursor->done = 1; ret = 0; diff --git a/src/object.c b/src/object.c index ac1c26adf9..15363f31b8 100644 --- a/src/object.c +++ b/src/object.c @@ -429,9 +429,9 @@ robj *createListListpackObject(void) { } robj *createSetObject(void) { - dict *d = dictCreate(&setDictType); - robj *o = createObject(OBJ_SET, d); - o->encoding = OBJ_ENCODING_HT; + hashtable *ht = hashtableCreate(&setHashtableType); + robj *o = createObject(OBJ_SET, ht); + o->encoding = OBJ_ENCODING_HASHTABLE; return o; } @@ -506,7 +506,7 @@ void freeListObject(robj *o) { void freeSetObject(robj *o) { switch (o->encoding) { - case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; + case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break; case OBJ_ENCODING_INTSET: case OBJ_ENCODING_LISTPACK: zfree(o->ptr); break; default: serverPanic("Unknown set encoding type"); @@ -622,23 +622,23 @@ void dismissListObject(robj *o, size_t size_hint) { /* See dismissObject() */ void dismissSetObject(robj *o, size_t size_hint) { - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - serverAssert(dictSize(set) != 0); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + serverAssert(hashtableSize(ht) != 0); /* We iterate all nodes only when average member size is bigger than a * page size, and there's a high chance we'll actually dismiss something. */ - if (size_hint / dictSize(set) >= server.page_size) { - dictEntry *de; - dictIterator *di = dictGetIterator(set); - while ((de = dictNext(di)) != NULL) { - dismissSds(dictGetKey(de)); + if (size_hint / hashtableSize(ht) >= server.page_size) { + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + sds item = next; + dismissSds(item); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } - /* Dismiss hash table memory. 
*/ - dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { dismissMemory(o->ptr, intsetBlobLen((intset *)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -728,7 +728,7 @@ void dismissStreamObject(robj *o, size_t size_hint) { * modifies any keys due to write traffic, it'll cause CoW which consume * physical memory. In the child process, after serializing the key and value, * the data is definitely not accessed again, so to avoid unnecessary CoW, we - * try to release their memory back to OS. see dismissMemory(). + * try to release their memory back to OS. see zmadvise_dontneed(). * * Because of the cost of iterating all node/field/member/entry of complex data * types, we iterate and dismiss them only when approximate average we estimate @@ -1109,6 +1109,7 @@ char *strEncoding(int encoding) { case OBJ_ENCODING_RAW: return "raw"; case OBJ_ENCODING_INT: return "int"; case OBJ_ENCODING_HT: return "hashtable"; + case OBJ_ENCODING_HASHTABLE: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; case OBJ_ENCODING_INTSET: return "intset"; @@ -1160,17 +1161,20 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { serverPanic("Unknown list encoding"); } } else if (o->type == OBJ_SET) { - if (o->encoding == OBJ_ENCODING_HT) { - d = o->ptr; - di = dictGetIterator(d); - asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); - while ((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - elesize += dictEntryMemUsage(de) + sdsAllocSize(ele); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + asize = sizeof(*o) + hashtableMemUsage(ht); + + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next) && samples < sample_size) { + sds element = next; + elesize += sdsAllocSize(element); samples++; } - dictReleaseIterator(di); - if (samples) asize += (double)elesize / samples * dictSize(d); + hashtableResetIterator(&iter); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else if (o->encoding == OBJ_ENCODING_INTSET) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { diff --git a/src/rdb.c b/src/rdb.c index 6e990736bc..5fb77a2897 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -692,7 +692,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_SET: if (o->encoding == OBJ_ENCODING_INTSET) return rdbSaveType(rdb, RDB_TYPE_SET_INTSET); - else if (o->encoding == OBJ_ENCODING_HT) + else if (o->encoding == OBJ_ENCODING_HASHTABLE) return rdbSaveType(rdb, RDB_TYPE_SET); else if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb, RDB_TYPE_SET_LISTPACK); @@ -876,26 +876,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { } } else if (o->type == OBJ_SET) { /* Save a set value */ - if (o->encoding == OBJ_ENCODING_HT) { - dict *set = o->ptr; - dictIterator *di = dictGetIterator(set); - dictEntry *de; + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *set = o->ptr; - if ((n = rdbSaveLen(rdb, dictSize(set))) == -1) { - dictReleaseIterator(di); + if ((n = rdbSaveLen(rdb, hashtableSize(set))) == -1) { return -1; } nwritten += n; - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); + 
hashtableIterator iterator; + hashtableInitIterator(&iterator, set); + void *next; + while (hashtableNext(&iterator, &next)) { + sds ele = next; if ((n = rdbSaveRawString(rdb, (unsigned char *)ele, sdslen(ele))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iterator); return -1; } nwritten += n; } - dictReleaseIterator(di); + hashtableResetIterator(&iterator); } else if (o->encoding == OBJ_ENCODING_INTSET) { size_t l = intsetBlobLen((intset *)o->ptr); @@ -1909,8 +1909,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createSetObject(); /* It's faster to expand the dict to the right size asap in order * to avoid rehashing */ - if (len > DICT_HT_INITIAL_SIZE && dictTryExpand(o->ptr, len) != DICT_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + if (!hashtableTryExpand(o->ptr, len)) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; } @@ -1949,8 +1949,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { * of many small ones. It's OK since lpSafeToAdd doesn't * care about individual elements, only the total size. */ setTypeConvert(o, OBJ_ENCODING_LISTPACK); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1970,8 +1970,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { return NULL; } o->ptr = lpAppend(o->ptr, (unsigned char *)sdsele, elelen); - } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HT, len, 0) != C_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + } else if (setTypeConvertAndExpand(o, OBJ_ENCODING_HASHTABLE, len, 0) != C_OK) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); sdsfree(sdsele); decrRefCount(o); return NULL; @@ -1980,8 +1980,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* This will also be called when the set was just converted * to a regular hash table encoded set. 
*/ - if (o->encoding == OBJ_ENCODING_HT) { - if (dictAdd((dict *)o->ptr, sdsele, NULL) != DICT_OK) { + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableAdd((hashtable *)o->ptr, sdsele)) { rdbReportCorruptRDB("Duplicate set members detected"); decrRefCount(o); sdsfree(sdsele); @@ -2356,7 +2356,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } o->type = OBJ_SET; o->encoding = OBJ_ENCODING_INTSET; - if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (intsetLen(o->ptr) > server.set_max_intset_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_SET_LISTPACK: if (deep_integrity_validation) server.stat_dump_payload_sanitizations++; @@ -2376,7 +2376,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { decrRefCount(o); goto emptykey; } - if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HT); + if (setTypeSize(o) > server.set_max_listpack_entries) setTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; case RDB_TYPE_ZSET_ZIPLIST: { unsigned char *lp = lpNew(encoded_len); diff --git a/src/server.c b/src/server.c index 8e65b1f5cd..9bd7bdd4a4 100644 --- a/src/server.c +++ b/src/server.c @@ -372,6 +372,7 @@ void dictDictDestructor(void *val) { dictRelease((dict *)val); } +/* Returns 1 when keys match */ int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; l1 = sdslen((sds)key1); @@ -380,6 +381,12 @@ int dictSdsKeyCompare(const void *key1, const void *key2) { return memcmp(key1, key2, l1) == 0; } +/* Returns 0 when keys match */ +int hashtableSdsKeyCompare(const void *key1, const void *key2) { + const sds sds1 = (const sds)key1, sds2 = (const sds)key2; + return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); +} + size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint8_t *key_offset) { return sdscopytobuffer(buf, buf_len, (sds)key, key_offset); } @@ -542,17 +549,11 @@ dictType objectKeyHeapPointerValueDictType = { NULL /* allow to expand */ }; -/* Set dictionary type. Keys are SDS strings, values are not used. */ -dictType setDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - dictSdsDestructor, /* key destructor */ - NULL, /* val destructor */ - NULL, /* allow to expand */ - .no_value = 1, /* no values in this dict */ - .keys_are_odd = 1 /* an SDS string is always an odd pointer */ -}; +/* Set hashtable type. 
Items are SDS strings */ +hashtableType setHashtableType = { + .hashFunction = dictSdsHash, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = dictSdsDestructor}; /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ dictType zsetDictType = { @@ -572,11 +573,6 @@ const void *hashtableObjectGetKey(const void *entry) { return objectGetKey(entry); } -int hashtableSdsKeyCompare(const void *key1, const void *key2) { - const sds sds1 = (const sds)key1, sds2 = (const sds)key2; - return sdslen(sds1) != sdslen(sds2) || sdscmp(sds1, sds2); -} - int hashtableObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; return hashtableSdsKeyCompare(o1->ptr, o2->ptr); @@ -645,6 +641,11 @@ dictType sdsReplyDictType = { NULL /* allow to expand */ }; +/* Hashtable type without destructor */ +hashtableType sdsReplyHashtableType = { + .hashFunction = dictSdsCaseHash, + .keyCompare = hashtableSdsKeyCompare}; + /* Keylist hash table type has unencoded Objects as keys and * lists as values. It's used for blocking operations (BLPOP) and to * map swapped keys to a list of clients waiting for this keys to be loaded. */ @@ -6521,27 +6522,7 @@ void sendChildInfo(childInfoType info_type, size_t keys, char *pname) { sendChildInfoGeneric(info_type, keys, -1, pname); } -/* Try to release pages back to the OS directly (bypassing the allocator), - * in an effort to decrease CoW during fork. For small allocations, we can't - * release any full page, so in an effort to avoid getting the size of the - * allocation from the allocator (malloc_size) when we already know it's small, - * we check the size_hint. If the size is not already known, passing a size_hint - * of 0 will lead the checking the real size of the allocation. - * Also please note that the size may be not accurate, so in order to make this - * solution effective, the judgement for releasing memory pages should not be - * too strict. */ -void dismissMemory(void *ptr, size_t size_hint) { - if (ptr == NULL) return; - - /* madvise(MADV_DONTNEED) can not release pages if the size of memory - * is too small, we try to release only for the memory which the size - * is more than half of page size. */ - if (size_hint && size_hint <= server.page_size / 2) return; - - zmadvise_dontneed(ptr); -} - -/* Dismiss big chunks of memory inside a client structure, see dismissMemory() */ +/* Dismiss big chunks of memory inside a client structure, see zmadvise_dontneed() */ void dismissClientMemory(client *c) { /* Dismiss client query buffer and static reply buffer. */ dismissMemory(c->buf, c->buf_usable_size); @@ -6572,7 +6553,7 @@ void dismissClientMemory(client *c) { /* In the child process, we don't need some buffers anymore, and these are * likely to change in the parent when there's heavy write traffic. * We dismiss them right away, to avoid CoW. - * see dismissMemory(). */ + * see zmadvise_dontneed(). */ void dismissMemoryInChild(void) { /* madvise(MADV_DONTNEED) may not work if Transparent Huge Pages is enabled. */ if (server.thp_enabled) return; diff --git a/src/server.h b/src/server.h index e9332233aa..dc4d2e8808 100644 --- a/src/server.h +++ b/src/server.h @@ -83,6 +83,8 @@ typedef long long ustime_t; /* microsecond time type. */ #include "connection.h" /* Connection abstraction */ #include "memory_prefetch.h" +#define dismissMemory zmadvise_dontneed + #define VALKEYMODULE_CORE 1 typedef struct serverObject robj; #include "valkeymodule.h" /* Modules API defines. 
*/ @@ -873,6 +875,7 @@ struct ValkeyModuleDigest { #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */ #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ +#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ @@ -2634,7 +2637,7 @@ typedef struct { robj *subject; int encoding; int ii; /* intset iterator */ - dictIterator *di; + hashtableIterator *hashtable_iterator; unsigned char *lpi; /* listpack iterator */ } setTypeIterator; @@ -2665,7 +2668,7 @@ extern struct valkeyServer server; extern struct sharedObjectsStruct shared; extern dictType objectKeyPointerValueDictType; extern dictType objectKeyHeapPointerValueDictType; -extern dictType setDictType; +extern hashtableType setHashtableType; extern dictType BenchmarkDictType; extern dictType zsetDictType; extern hashtableType kvstoreKeysHashtableType; @@ -2680,6 +2683,7 @@ extern dictType objToDictDictType; extern hashtableType kvstoreChannelHashtableType; extern dictType modulesDictType; extern dictType sdsReplyDictType; +extern hashtableType sdsReplyHashtableType; extern dictType keylistDictType; extern dict *modules; @@ -3374,7 +3378,6 @@ void rejectCommandFormat(client *c, const char *fmt, ...); void *activeDefragAlloc(void *ptr); robj *activeDefragStringOb(robj *ob); void dismissSds(sds s); -void dismissMemory(void *ptr, size_t size_hint); void dismissMemoryInChild(void); #define RESTART_SERVER_NONE 0 diff --git a/src/t_set.c b/src/t_set.c index 997fa2f5c9..4279baf82f 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -28,6 +28,7 @@ */ #include "server.h" +#include "hashtable.h" #include "intset.h" /* Compact integer set structure */ /*----------------------------------------------------------------------------- @@ -50,7 +51,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { /* We may oversize the set by using the hint if the hint is not accurate, * but we will assume this is acceptable to maximize performance. */ robj *o = createSetObject(); - dictExpand(o->ptr, size_hint); + hashtableExpand(o->ptr, size_hint); return o; } @@ -59,7 +60,7 @@ robj *setTypeCreate(sds value, size_t size_hint) { void setTypeMaybeConvert(robj *set, size_t size_hint) { if ((set->encoding == OBJ_ENCODING_LISTPACK && size_hint > server.set_max_listpack_entries) || (set->encoding == OBJ_ENCODING_INTSET && size_hint > server.set_max_intset_entries)) { - setTypeConvertAndExpand(set, OBJ_ENCODING_HT, size_hint, 1); + setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, size_hint, 1); } } @@ -74,7 +75,7 @@ static size_t intsetMaxEntries(void) { /* Converts intset to HT if it contains too many entries. */ static void maybeConvertIntset(robj *subject) { serverAssert(subject->encoding == OBJ_ENCODING_INTSET); - if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HT); + if (intsetLen(subject->ptr) > intsetMaxEntries()) setTypeConvert(subject, OBJ_ENCODING_HASHTABLE); } /* When you know all set elements are integers, call this to convert the set to @@ -91,7 +92,7 @@ static void maybeConvertToIntset(robj *set) { while (setTypeNext(si, &str, &len, &llval) != -1) { if (str) { /* If the element is returned as a string, we may be able to convert - * it to integer. This happens for OBJ_ENCODING_HT. */ + * it to integer. This happens for OBJ_ENCODING_HASHTABLE. 
*/
             serverAssert(string2ll(str, len, (long long *)&llval));
         }
         uint8_t success = 0;
@@ -134,20 +135,21 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
     }
 
     serverAssert(str);
-    if (set->encoding == OBJ_ENCODING_HT) {
+    if (set->encoding == OBJ_ENCODING_HASHTABLE) {
         /* Avoid duping the string if it is an sds string. */
         sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len);
-        dict *ht = set->ptr;
-        void *position = dictFindPositionForInsert(ht, sdsval, NULL);
-        if (position) {
+        hashtable *ht = set->ptr;
+        hashtablePosition position;
+        if (hashtableFindPositionForInsert(ht, sdsval, &position, NULL)) {
             /* Key doesn't already exist in the set. Add it but dup the key. */
             if (sdsval == str) sdsval = sdsdup(sdsval);
-            dictInsertAtPosition(ht, sdsval, position);
+            hashtableInsertAtPosition(ht, sdsval, &position);
+            return 1;
         } else if (sdsval != str) {
             /* String is already a member. Free our temporary sds copy. */
             sdsfree(sdsval);
         }
-        return (position != NULL);
+        return 0;
     } else if (set->encoding == OBJ_ENCODING_LISTPACK) {
         unsigned char *lp = set->ptr;
         unsigned char *p = lpFirst(lp);
@@ -166,8 +168,8 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
             set->ptr = lp;
         } else {
             /* Size limit is reached. Convert to hashtable and add. */
-            setTypeConvertAndExpand(set, OBJ_ENCODING_HT, lpLength(lp) + 1, 1);
-            serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK);
+            setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, lpLength(lp) + 1, 1);
+            serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len)));
         }
         return 1;
     }
@@ -204,10 +206,10 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd
             set->ptr = lp;
             return 1;
         } else {
-            setTypeConvertAndExpand(set, OBJ_ENCODING_HT, intsetLen(set->ptr) + 1, 1);
+            setTypeConvertAndExpand(set, OBJ_ENCODING_HASHTABLE, intsetLen(set->ptr) + 1, 1);
             /* The set *was* an intset and this value is not integer
-             * encodable, so dictAdd should always work. */
-            serverAssert(dictAdd(set->ptr, sdsnewlen(str, len), NULL) == DICT_OK);
+             * encodable, so hashtableAdd should always work. */
+            serverAssert(hashtableAdd(set->ptr, sdsnewlen(str, len)));
             return 1;
         }
     }
@@ -242,9 +244,9 @@ int setTypeRemoveAux(robj *setobj, char *str, size_t len, int64_t llval, int str
         str_is_sds = 0;
     }
 
-    if (setobj->encoding == OBJ_ENCODING_HT) {
+    if (setobj->encoding == OBJ_ENCODING_HASHTABLE) {
         sds sdsval = str_is_sds ?
(sds)str : sdsnewlen(str, len); - int deleted = (dictDelete(setobj->ptr, sdsval) == DICT_OK); + int deleted = hashtableDelete(setobj->ptr, sdsval); if (sdsval != str) sdsfree(sdsval); /* free temp copy */ return deleted; } else if (setobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -298,11 +300,11 @@ int setTypeIsMemberAux(robj *set, char *str, size_t len, int64_t llval, int str_ } else if (set->encoding == OBJ_ENCODING_INTSET) { long long llval; return string2ll(str, len, &llval) && intsetFind(set->ptr, llval); - } else if (set->encoding == OBJ_ENCODING_HT && str_is_sds) { - return dictFind(set->ptr, (sds)str) != NULL; - } else if (set->encoding == OBJ_ENCODING_HT) { + } else if (set->encoding == OBJ_ENCODING_HASHTABLE && str_is_sds) { + return hashtableFind(set->ptr, (sds)str, NULL); + } else if (set->encoding == OBJ_ENCODING_HASHTABLE) { sds sdsval = sdsnewlen(str, len); - int result = dictFind(set->ptr, sdsval) != NULL; + int result = hashtableFind(set->ptr, sdsval, NULL); sdsfree(sdsval); return result; } else { @@ -314,8 +316,8 @@ setTypeIterator *setTypeInitIterator(robj *subject) { setTypeIterator *si = zmalloc(sizeof(setTypeIterator)); si->subject = subject; si->encoding = subject->encoding; - if (si->encoding == OBJ_ENCODING_HT) { - si->di = dictGetIterator(subject->ptr); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + si->hashtable_iterator = hashtableCreateIterator(subject->ptr); } else if (si->encoding == OBJ_ENCODING_INTSET) { si->ii = 0; } else if (si->encoding == OBJ_ENCODING_LISTPACK) { @@ -327,7 +329,7 @@ setTypeIterator *setTypeInitIterator(robj *subject) { } void setTypeReleaseIterator(setTypeIterator *si) { - if (si->encoding == OBJ_ENCODING_HT) dictReleaseIterator(si->di); + if (si->encoding == OBJ_ENCODING_HASHTABLE) hashtableReleaseIterator(si->hashtable_iterator); zfree(si); } @@ -340,7 +342,7 @@ void setTypeReleaseIterator(setTypeIterator *si) { * (str and len) or (llele) depending on whether the value is stored as a string * or as an integer internally. * - * If OBJ_ENCODING_HT is returned, then str points to an sds string and can be + * If OBJ_ENCODING_HASHTABLE is returned, then str points to an sds string and can be * used as such. If OBJ_ENCODING_INTSET, then llele is populated and str is * pointed to NULL. If OBJ_ENCODING_LISTPACK is returned, the value can be * either a string or an integer. If *str is not NULL, then str and len are @@ -353,10 +355,10 @@ void setTypeReleaseIterator(setTypeIterator *si) { * * When there are no more elements -1 is returned. */ int setTypeNext(setTypeIterator *si, char **str, size_t *len, int64_t *llele) { - if (si->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictNext(si->di); - if (de == NULL) return -1; - *str = dictGetKey(de); + if (si->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(si->hashtable_iterator, &next)) return -1; + *str = next; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (si->encoding == OBJ_ENCODING_INTSET) { @@ -406,15 +408,16 @@ sds setTypeNextObject(setTypeIterator *si) { * object. The return value of the function is the object->encoding * field of the object and can be used by the caller to check if the * int64_t pointer or the str and len pointers were populated, as for - * setTypeNext. If OBJ_ENCODING_HT is returned, str is pointed to a + * setTypeNext. If OBJ_ENCODING_HASHTABLE is returned, str is pointed to a * string which is actually an sds string and it can be used as such. 
* * Note that both the str, len and llele pointers should be passed and cannot * be NULL. If str is set to NULL, the value is an integer stored in llele. */ int setTypeRandomElement(robj *setobj, char **str, size_t *len, int64_t *llele) { - if (setobj->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictGetFairRandomKey(setobj->ptr); - *str = dictGetKey(de); + if (setobj->encoding == OBJ_ENCODING_HASHTABLE) { + void *entry = NULL; + hashtableFairRandomEntry(setobj->ptr, &entry); + *str = entry; *len = sdslen(*str); *llele = -123456789; /* Not needed. Defensive. */ } else if (setobj->encoding == OBJ_ENCODING_INTSET) { @@ -457,14 +460,14 @@ robj *setTypePopRandom(robj *set) { obj = createStringObject(str, len); else obj = createStringObjectFromLongLong(llele); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } return obj; } unsigned long setTypeSize(const robj *subject) { - if (subject->encoding == OBJ_ENCODING_HT) { - return dictSize((const dict *)subject->ptr); + if (subject->encoding == OBJ_ENCODING_HASHTABLE) { + return hashtableSize((const hashtable *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_INTSET) { return intsetLen((const intset *)subject->ptr); } else if (subject->encoding == OBJ_ENCODING_LISTPACK) { @@ -474,7 +477,7 @@ unsigned long setTypeSize(const robj *subject) { } } -/* Convert the set to specified encoding. The resulting dict (when converting +/* Convert the set to specified encoding. The resulting hashtable (when converting * to a hash table) is presized to hold the number of elements in the original * set. */ void setTypeConvert(robj *setobj, int enc) { @@ -489,28 +492,28 @@ int setTypeConvertAndExpand(robj *setobj, int enc, unsigned long cap, int panic) setTypeIterator *si; serverAssertWithInfo(NULL, setobj, setobj->type == OBJ_SET && setobj->encoding != enc); - if (enc == OBJ_ENCODING_HT) { - dict *d = dictCreate(&setDictType); + if (enc == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = hashtableCreate(&setHashtableType); sds element; - /* Presize the dict to avoid rehashing */ + /* Presize the hashtable to avoid rehashing */ if (panic) { - dictExpand(d, cap); - } else if (dictTryExpand(d, cap) != DICT_OK) { - dictRelease(d); + hashtableExpand(ht, cap); + } else if (!hashtableTryExpand(ht, cap)) { + hashtableRelease(ht); return C_ERR; } /* To add the elements we extract integers and create Objects */ si = setTypeInitIterator(setobj); while ((element = setTypeNextObject(si)) != NULL) { - serverAssert(dictAdd(d, element, NULL) == DICT_OK); + serverAssert(hashtableAdd(ht, element)); } setTypeReleaseIterator(si); freeSetObject(setobj); /* frees the internals but not setobj itself */ - setobj->encoding = OBJ_ENCODING_HT; - setobj->ptr = d; + setobj->encoding = OBJ_ENCODING_HASHTABLE; + setobj->ptr = ht; } else if (enc == OBJ_ENCODING_LISTPACK) { /* Preallocate the minimum two bytes per element (enc/value + backlen) */ size_t estcap = cap * 2; @@ -568,10 +571,10 @@ robj *setTypeDup(robj *o) { memcpy(new_lp, lp, sz); set = createObject(OBJ_SET, new_lp); set->encoding = OBJ_ENCODING_LISTPACK; - } else if (o->encoding == OBJ_ENCODING_HT) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { set = createSetObject(); - dict *d = o->ptr; - dictExpand(set->ptr, dictSize(d)); + hashtable *ht = o->ptr; + hashtableExpand(set->ptr, hashtableSize(ht)); si = setTypeInitIterator(o); char *str; size_t len; @@ -891,8 +894,8 @@ void spopWithCountCommand(client *c) { if 
(!newset) { newset = str ? createSetListpackObject() : createIntsetObject(); } - setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HT); - setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HT); + setTypeAddAux(newset, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); + setTypeRemoveAux(set, str, len, llele, encoding == OBJ_ENCODING_HASHTABLE); } } @@ -1001,8 +1004,6 @@ void srandmemberWithCountCommand(client *c) { size_t len; int64_t llele; - dict *d; - if (getRangeLongFromObjectOrReply(c, c->argv[2], -LONG_MAX, LONG_MAX, &l, NULL) != C_OK) return; if (l >= 0) { count = (unsigned long)l; @@ -1111,8 +1112,8 @@ void srandmemberWithCountCommand(client *c) { return; } - /* For CASE 3 and CASE 4 we need an auxiliary dictionary. */ - d = dictCreate(&sdsReplyDictType); + /* For CASE 3 and CASE 4 we need an auxiliary hashtable. */ + hashtable *ht = hashtableCreate(&sdsReplyHashtableType); /* CASE 3: * The number of elements inside the set is not greater than @@ -1126,29 +1127,25 @@ void srandmemberWithCountCommand(client *c) { if (count * SRANDMEMBER_SUB_STRATEGY_MUL > size) { setTypeIterator *si; - /* Add all the elements into the temporary dictionary. */ + /* Add all the elements into the temporary hashtable. */ si = setTypeInitIterator(set); - dictExpand(d, size); + hashtableExpand(ht, size); while (setTypeNext(si, &str, &len, &llele) != -1) { - int retval = DICT_ERR; - if (str == NULL) { - retval = dictAdd(d, sdsfromlonglong(llele), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsfromlonglong(llele))); } else { - retval = dictAdd(d, sdsnewlen(str, len), NULL); + serverAssert(hashtableAdd(ht, (void *)sdsnewlen(str, len))); } - serverAssert(retval == DICT_OK); } setTypeReleaseIterator(si); - serverAssert(dictSize(d) == size); + serverAssert(hashtableSize(ht) == size); /* Remove random elements to reach the right count. */ while (size > count) { - dictEntry *de; - de = dictGetFairRandomKey(d); - dictUnlink(d, dictGetKey(de)); - sdsfree(dictGetKey(de)); - dictFreeUnlinkedEntry(d, de); + void *element; + hashtableFairRandomEntry(ht, &element); + hashtableDelete(ht, element); + sdsfree((sds)element); size--; } } @@ -1161,7 +1158,7 @@ void srandmemberWithCountCommand(client *c) { unsigned long added = 0; sds sdsele; - dictExpand(d, count); + hashtableExpand(ht, count); while (added < count) { setTypeRandomElement(set, &str, &len, &llele); if (str == NULL) { @@ -1172,7 +1169,7 @@ void srandmemberWithCountCommand(client *c) { /* Try to add the object to the dictionary. If it already exists * free it, otherwise increment the number of objects we have * in the result dictionary. */ - if (dictAdd(d, sdsele, NULL) == DICT_OK) + if (hashtableAdd(ht, sdsele)) added++; else sdsfree(sdsele); @@ -1181,14 +1178,15 @@ void srandmemberWithCountCommand(client *c) { /* CASE 3 & 4: send the result to the user. 
*/ { - dictIterator *di; - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); addReplyArrayLen(c, count); - di = dictGetIterator(d); - while ((de = dictNext(di)) != NULL) addReplyBulkSds(c, dictGetKey(de)); - dictReleaseIterator(di); - dictRelease(d); + serverAssert(count == hashtableSize(ht)); + void *element; + while (hashtableNext(&iter, &element)) addReplyBulkSds(c, (sds)element); + hashtableResetIterator(&iter); + hashtableRelease(ht); } } @@ -1336,7 +1334,7 @@ void sinterGenericCommand(client *c, while ((encoding = setTypeNext(si, &str, &len, &intobj)) != -1) { for (j = 1; j < setnum; j++) { if (sets[j] == sets[0]) continue; - if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HT)) break; + if (!setTypeIsMemberAux(sets[j], str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE)) break; } /* Only take action when all sets contain the member */ @@ -1355,7 +1353,7 @@ void sinterGenericCommand(client *c, } else { if (str && only_integers) { /* It may be an integer although we got it as a string. */ - if (encoding == OBJ_ENCODING_HT && string2ll(str, len, (long long *)&intobj)) { + if (encoding == OBJ_ENCODING_HASHTABLE && string2ll(str, len, (long long *)&intobj)) { if (dstset->encoding == OBJ_ENCODING_LISTPACK || dstset->encoding == OBJ_ENCODING_INTSET) { /* Adding it as an integer is more efficient. */ str = NULL; @@ -1365,7 +1363,7 @@ void sinterGenericCommand(client *c, only_integers = 0; } } - setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HT); + setTypeAddAux(dstset, str, len, intobj, encoding == OBJ_ENCODING_HASHTABLE); } } } @@ -1467,7 +1465,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke /* For a SET's encoding, according to the factory method setTypeCreate(), currently have 3 types: * 1. OBJ_ENCODING_INTSET * 2. OBJ_ENCODING_LISTPACK - * 3. OBJ_ENCODING_HT + * 3. OBJ_ENCODING_HASHTABLE * 'dstset_encoding' is used to determine which kind of encoding to use when initialize 'dstset'. * * If all sets are all OBJ_ENCODING_INTSET encoding or 'dstkey' is not null, keep 'dstset' @@ -1478,8 +1476,8 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke * the hashtable is more efficient when find and compare than the listpack. The corresponding * time complexity are O(1) vs O(n). */ if (!dstkey && dstset_encoding == OBJ_ENCODING_INTSET && - (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HT)) { - dstset_encoding = OBJ_ENCODING_HT; + (setobj->encoding == OBJ_ENCODING_LISTPACK || setobj->encoding == OBJ_ENCODING_HASHTABLE)) { + dstset_encoding = OBJ_ENCODING_HASHTABLE; } sets[j] = setobj; if (j > 0 && sets[0] == sets[j]) { @@ -1536,7 +1534,7 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } setTypeReleaseIterator(si); } @@ -1556,11 +1554,11 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke for (j = 1; j < setnum; j++) { if (!sets[j]) continue; /* no key is an empty set. */ if (sets[j] == sets[0]) break; /* same set! 
*/ - if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HT)) break; + if (setTypeIsMemberAux(sets[j], str, len, llval, encoding == OBJ_ENCODING_HASHTABLE)) break; } if (j == setnum) { /* There is no other set with this element. Add it. */ - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); @@ -1578,9 +1576,9 @@ void sunionDiffGenericCommand(client *c, robj **setkeys, int setnum, robj *dstke si = setTypeInitIterator(sets[j]); while ((encoding = setTypeNext(si, &str, &len, &llval)) != -1) { if (j == 0) { - cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality += setTypeAddAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } else { - cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HT); + cardinality -= setTypeRemoveAux(dstset, str, len, llval, encoding == OBJ_ENCODING_HASHTABLE); } } setTypeReleaseIterator(si); diff --git a/src/t_zset.c b/src/t_zset.c index 105d57b7c3..e8c5a369b7 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -2069,9 +2069,7 @@ typedef struct { int ii; } is; struct { - dict *dict; - dictIterator *di; - dictEntry *de; + hashtableIterator *iter; } ht; struct { unsigned char *lp; @@ -2126,10 +2124,8 @@ void zuiInitIterator(zsetopsrc *op) { if (op->encoding == OBJ_ENCODING_INTSET) { it->is.is = op->subject->ptr; it->is.ii = 0; - } else if (op->encoding == OBJ_ENCODING_HT) { - it->ht.dict = op->subject->ptr; - it->ht.di = dictGetIterator(op->subject->ptr); - it->ht.de = dictNext(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + it->ht.iter = hashtableCreateIterator(op->subject->ptr); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { it->lp.lp = op->subject->ptr; it->lp.p = lpFirst(it->lp.lp); @@ -2166,8 +2162,8 @@ void zuiClearIterator(zsetopsrc *op) { iterset *it = &op->iter.set; if (op->encoding == OBJ_ENCODING_INTSET) { UNUSED(it); /* skip */ - } else if (op->encoding == OBJ_ENCODING_HT) { - dictReleaseIterator(it->ht.di); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + hashtableReleaseIterator(it->ht.iter); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { UNUSED(it); } else { @@ -2235,13 +2231,11 @@ int zuiNext(zsetopsrc *op, zsetopval *val) { /* Move to next element. */ it->is.ii++; - } else if (op->encoding == OBJ_ENCODING_HT) { - if (it->ht.de == NULL) return 0; - val->ele = dictGetKey(it->ht.de); + } else if (op->encoding == OBJ_ENCODING_HASHTABLE) { + void *next; + if (!hashtableNext(it->ht.iter, &next)) return 0; + val->ele = next; val->score = 1.0; - - /* Move to next element. */ - it->ht.de = dictNext(it->ht.di); } else if (op->encoding == OBJ_ENCODING_LISTPACK) { if (it->lp.p == NULL) return 0; val->estr = lpGetValue(it->lp.p, &val->elen, &val->ell); diff --git a/src/zmalloc.c b/src/zmalloc.c index a696111e47..b1de4f2af1 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -451,15 +451,25 @@ void zmalloc_set_oom_handler(void (*oom_handler)(size_t)) { zmalloc_oom_handler = oom_handler; } -/* Use 'MADV_DONTNEED' to release memory to operating system quickly. - * We do that in a fork child process to avoid CoW when the parent modifies - * these shared pages. 
*/
-void zmadvise_dontneed(void *ptr) {
+/* Try to release pages back to the OS directly using 'MADV_DONTNEED' (bypassing
+ * the allocator) in a fork child process to avoid CoW when the parent modifies
+ * those shared pages. For small allocations, we can't release any full page,
+ * so in an effort to avoid getting the size of the allocation from the
+ * allocator (malloc_size) when we already know it's small, we check the
+ * size_hint. If the size is not already known, passing a size_hint of 0 will
+ * lead to checking the real size of the allocation.
+ * Also please note that the size may not be accurate, so in order to make this
+ * solution effective, the judgement for releasing memory pages should not be
+ * too strict. */
+void zmadvise_dontneed(void *ptr, size_t size_hint) {
 #if defined(USE_JEMALLOC) && defined(__linux__)
+    if (ptr == NULL) return;
+
     static size_t page_size = 0;
     if (page_size == 0) page_size = sysconf(_SC_PAGESIZE);
     size_t page_size_mask = page_size - 1;
 
+    if (size_hint && size_hint / 2 < page_size) return;
     size_t real_size = zmalloc_size(ptr);
     if (real_size < page_size) return;
 
@@ -473,6 +483,7 @@
     }
 #else
     (void)(ptr);
+    (void)(size_hint);
 #endif
 }
 
diff --git a/src/zmalloc.h b/src/zmalloc.h
index 38c2bae864..68b4df63aa 100644
--- a/src/zmalloc.h
+++ b/src/zmalloc.h
@@ -139,7 +139,7 @@ size_t zmalloc_get_smap_bytes_by_field(char *field, long pid);
 size_t zmalloc_get_memory_size(void);
 void zlibc_free(void *ptr);
 void zlibc_trim(void);
-void zmadvise_dontneed(void *ptr);
+void zmadvise_dontneed(void *ptr, size_t size_hint);
 
 #ifndef HAVE_MALLOC_SIZE
 size_t zmalloc_size(void *ptr);
diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index cf7f633a8c..e50faba62b 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -515,10 +515,10 @@ start_server {tags {"info" "external:skip"}} {
         set info_mem [r info memory]
         set mem_stats [r memory stats]
         assert_equal [getInfoProperty $info_mem mem_overhead_db_hashtable_rehashing] {0}
-        # overhead.db.hashtable.lut = memory overhead of hashset including hashset struct and tables
-        set hashset_overhead [dict get $mem_stats overhead.db.hashtable.lut]
-        if {$hashset_overhead < 140} {
-            # 32-bit version (hashset struct + 1 bucket of 64 bytes)
+        # overhead.db.hashtable.lut = memory overhead of hashtable including hashtable struct and tables
+        set hashtable_overhead [dict get $mem_stats overhead.db.hashtable.lut]
+        if {$hashtable_overhead < 140} {
+            # 32-bit version (hashtable struct + 1 bucket of 64 bytes)
             set bits 32
         } else {
             set bits 64
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 944c3d3d98..1871ec9b4d 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -33,6 +33,7 @@ start_server {
             assert_equal {0 1} [r smismember myset bla foo]
             assert_equal {0} [r smismember myset bla]
             assert_equal "bar $initelems($type)" [lsort [r smembers myset]]
+            r memory usage myset
         }
     }
 
@@ -51,6 +52,7 @@ start_server {
             assert_equal {0 1} [r smismember myset 18 16]
             assert_equal {0} [r smismember myset 18]
             assert_equal {16 17} [lsort [r smembers myset]]
+            r memory usage myset
         }
 
     test {SMISMEMBER SMEMBERS SCARD against non set} {
@@ -1029,111 +1031,6 @@ foreach type {single multiple single_multiple} {
         r srem $myset {*}$members
     }
 
-    proc verify_rehashing_completed_key {myset table_size keys} {
-        set htstats [r debug HTSTATS-KEY $myset]
-        assert {![string match {*rehashing target*} $htstats]}
-        return {[string match {*table size: $table_size*number of elements: $keys*} $htstats]}
-    
} - - test "SRANDMEMBER with a dict containing long chain" { - set origin_save [config_get_set save ""] - set origin_max_lp [config_get_set set-max-listpack-entries 0] - set origin_save_delay [config_get_set rdb-key-save-delay 2147483647] - - # 1) Create a hash set with 100000 members. - set members {} - for {set i 0} {$i < 100000} {incr i} { - lappend members [format "m:%d" $i] - } - create_set myset $members - - # 2) Wait for the hash set rehashing to finish. - while {[is_rehashing myset]} { - r srandmember myset 100 - } - - # 3) Turn off the rehashing of this set, and remove the members to 500. - r bgsave - rem_hash_set_top_N myset [expr {[r scard myset] - 500}] - assert_equal [r scard myset] 500 - - # 4) Kill RDB child process to restart rehashing. - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 5) Let the set hash to start rehashing - r spop myset 1 - assert [is_rehashing myset] - - # 6) Verify that when rdb saving is in progress, rehashing will still be performed (because - # the ratio is extreme) by waiting for it to finish during an active bgsave. - r bgsave - - while {[is_rehashing myset]} { - r srandmember myset 1 - } - if {$::verbose} { - puts [r debug HTSTATS-KEY myset full] - } - - set pid1 [get_child_pid 0] - catch {exec kill -9 $pid1} - waitForBgsave r - - # 7) Check that eventually, SRANDMEMBER returns all elements. - array set allmyset {} - foreach ele [r smembers myset] { - set allmyset($ele) 1 - } - unset -nocomplain auxset - set iterations 1000 - while {$iterations != 0} { - incr iterations -1 - set res [r srandmember myset -10] - foreach ele $res { - set auxset($ele) 1 - } - if {[lsort [array names allmyset]] eq - [lsort [array names auxset]]} { - break; - } - } - assert {$iterations != 0} - - # 8) Remove the members to 30 in order to calculate the value of Chi-Square Distribution, - # otherwise we would need more iterations. - rem_hash_set_top_N myset [expr {[r scard myset] - 30}] - assert_equal [r scard myset] 30 - - # Hash set rehashing would be completed while removing members from the `myset` - # We also check the size and members in the hash table. - verify_rehashing_completed_key myset 64 30 - - # Now that we have a hash set with only one long chain bucket. - set htstats [r debug HTSTATS-KEY myset full] - assert {[regexp {different slots: ([0-9]+)} $htstats - different_slots]} - assert {[regexp {max chain length: ([0-9]+)} $htstats - max_chain_length]} - assert {$different_slots == 1 && $max_chain_length == 30} - - # 9) Use positive count (PATH 4) to get 10 elements (out of 30) each time. 
-        unset -nocomplain allkey
-        set iterations 1000
-        while {$iterations != 0} {
-            incr iterations -1
-            set res [r srandmember myset 10]
-            foreach ele $res {
-                lappend allkey $ele
-            }
-        }
-        # validate even distribution of random sampling (df = 29, 73 means 0.00001 probability)
-        assert_lessthan [chi_square_value $allkey] 73
-
-        r config set save $origin_save
-        r config set set-max-listpack-entries $origin_max_lp
-        r config set rdb-key-save-delay $origin_save_delay
-    } {OK} {needs:debug slow}
-
     proc setup_move {} {
         r del myset3{t} myset4{t}
         create_set myset1{t} {1 a b}

From ad242206819059937244eb519fa612936aee4143 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Sun, 15 Dec 2024 12:09:53 +0800
Subject: [PATCH 49/73] Automatic failover vote is not limited by two times the node timeout (#1356)

This is a follow-up to #1305; we decided to apply the same change
to automatic failover as well, that is, to move forward with removing
it for both automatic and manual failovers.

Quote from Ping during the review:
Note that we already debounce transient primary failures with node
timeout, ensuring failover is only triggered after sustained outages.
Election timing is naturally staggered by replica spacing, making the
likelihood of simultaneous elections from replicas of the same shard
very low. The one-vote-per-epoch rule further throttles retries and
ensures orderly elections. On top of that, quorum-based primary failure
confirmation, cluster-state convergence, and slot ownership validation
are all built into the process.

Quote from Madelyn during the review:
It's against the specific primary. It's to prevent double failovers.
If a primary just took over we don't want someone else to try to
take over and give the new primary some amount of time to take over.
I have not seen this issue though, it might have been over optimizing?
The double failure mode, where a node fails and then another node
fails within the nodetimeout also doesn't seem that common either
though.

So the conclusion is that we all agreed to remove it completely;
it will make the code a lot simpler. And if there are other specific
edge cases we are missing, we will fix them in another way.

See discussion #1305 for more information.

Signed-off-by: Binbin
---
 src/cluster_legacy.c                   | 19 ------------
 src/cluster_legacy.h                   |  2 --
 tests/unit/cluster/manual-failover.tcl | 39 ++++++++++++++++++++------
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index d1c6dd0094..418070f69c 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1505,7 +1505,6 @@ clusterNode *createClusterNode(char *nodename, int flags) {
     node->cport = 0;
     node->tls_port = 0;
     node->fail_reports = listCreate();
-    node->voted_time = 0;
     node->orphaned_time = 0;
     node->repl_offset_time = 0;
     node->repl_offset = 0;
@@ -4396,23 +4395,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
         return;
     }
 
-    /* We did not voted for a replica about this primary for two
-     * times the node timeout. This is not strictly needed for correctness
-     * of the algorithm but makes the base case more linear.
-     *
-     * This limitation does not restrict manual failover. If a user initiates
-     * a manual failover, we need to allow it to vote, otherwise the manual
-     * failover may time out.
*/ - if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { - serverLog(LL_WARNING, - "Failover auth denied to %.40s (%s): " - "can't vote for any replica of %.40s (%s) within %lld milliseconds", - node->name, node->human_nodename, - node->replicaof->name, node->replicaof->human_nodename, - (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); - return; - } - /* The replica requesting the vote must have a configEpoch for the claimed * slots that is >= the one of the primaries currently serving the same * slots in the current configuration. */ @@ -4434,7 +4416,6 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. */ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index fb317038d6..d3e1c3459e 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -341,8 +341,6 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual - * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ mstime_t inbound_link_freed_time; /* Last time we freed the inbound link for this node. diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 220ffc3eaf..dbcbb26380 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -189,11 +189,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 set CLUSTER_PACKET_TYPE_NONE -1 - # Setting a large timeout to make sure we hit the voted_time limit. - R 0 config set cluster-node-timeout 150000 - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # Let replica drop FAILOVER_AUTH_ACK so that the election won't # get the enough votes and the election will time out. R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK @@ -229,10 +224,6 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval pause_process [srv 0 pid] wait_for_cluster_state fail - # Setting a large timeout to make sure we hit the voted_time limit. - R 1 config set cluster-node-timeout 150000 - R 2 config set cluster-node-timeout 150000 - # R 3 performs an automatic failover and it will work. 
        R 3 config set cluster-replica-no-failover no

        wait_for_condition 1000 50 {
@@ -272,6 +263,36 @@ start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 } ;# start_cluster
 
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} {
+    test "Automatic failover vote is not limited by two times the node timeout - mixed failover" {
+        R 3 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {slave} &&
+            [s -3 role] eq {master}
+        } else {
+            fail "The first failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        R 0 cluster failover
+        wait_for_condition 1000 50 {
+            [s 0 role] eq {master} &&
+            [s -3 role] eq {slave}
+        } else {
+            fail "The second failover does not happen"
+        }
+        wait_for_cluster_propagation
+
+        # Let R 3 trigger the automatic failover
+        pause_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s -3 role] eq {master}
+        } else {
+            fail "The third failover does not happen"
+        }
+    }
+} ;# start_cluster
+
 start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
     test "Manual failover will reset the on-going election" {
         set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5

From e024b4bd27645a33c5f317792051c5ae3a97fa56 Mon Sep 17 00:00:00 2001
From: Binbin
Date: Mon, 16 Dec 2024 13:43:48 +0800
Subject: [PATCH 50/73] Drop the MEET packet if the link node is in handshake state (#1436)

After #1307 got merged, we noticed an assert happening in
setClusterNodeToInboundClusterLink:
```
=== ASSERTION FAILED ===
==> '!link->node' is not true
```

Since #778, we call setClusterNodeToInboundClusterLink to attach
the node to the link during MEET processing, so if we receive another
MEET packet in a short time while the node is still in the handshake
state, we will hit this assert and crash the server.

If the link is bound to a node and the node is in the handshake state,
and we receive a MEET packet, it may be that the sender sent multiple
MEET packets, so here we drop the MEET to avoid the assert in
setClusterNodeToInboundClusterLink.

The assert happens when the other node sends a MEET packet because it
detects that there is no inbound link: this node creates a new node
in HANDSHAKE state (with a random node name) and responds with a PONG.
The other node receives the PONG and removes the CLUSTER_NODE_MEET
flag. This node is supposed to open an outbound connection to the other
node in the next cron cycle, but before this happens, the other node
re-sends a MEET on the same link because it still detects no inbound
connection.

Note that in getNodeFromLinkAndMsg, the node in the handshake state has
a random name and is not truly "known", so we don't know the sender.
Dropping the MEET packet prevents us from creating a random node,
avoids incorrect link binding, and avoids a duplicate MEET packet
eliminating the handshake state.
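
To make the fix easy to see at a glance, the guard added below in
clusterProcessPacket() reduces to the following shape (a condensed
sketch of the change, with the long explanatory comment trimmed; see
the diff for the full version):
```c
/* A MEET arriving on a link whose node is still mid-handshake can only
 * be a duplicate from the sender, so consume it here instead of
 * tripping the assert later in setClusterNodeToInboundClusterLink(). */
if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
    serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state",
              link->node->name);
    return 1; /* Packet handled; keep the link alive. */
}
```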
Signed-off-by: Binbin
---
 src/cluster_legacy.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 418070f69c..9ddcf6678d 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3003,7 +3003,8 @@ int clusterIsValidPacket(clusterLink *link) {
     }
 
     if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) {
-        serverLog(LL_WARNING, "Dropping packet that matches debug drop filter");
+        serverLog(LL_WARNING, "Dropping packet of type %s that matches debug drop filter",
+                  clusterGetMessageTypeString(type));
         return 0;
     }
 
@@ -3094,7 +3095,7 @@ int clusterProcessPacket(clusterLink *link) {
         if (server.debug_cluster_close_link_on_packet_drop &&
             (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) {
             freeClusterLink(link);
-            serverLog(LL_WARNING, "Closing link for matching packet type %hu", type);
+            serverLog(LL_WARNING, "Closing link for matching packet type %s", clusterGetMessageTypeString(type));
             return 0;
         }
         return 1;
@@ -3110,8 +3111,8 @@ int clusterProcessPacket(clusterLink *link) {
             freeClusterLink(link);
             serverLog(
                 LL_NOTICE,
-                "Closing link for node that sent a lightweight message of type %hu as its first message on the link",
-                type);
+                "Closing link for node that sent a lightweight message of type %s as its first message on the link",
+                clusterGetMessageTypeString(type));
             return 0;
         }
         clusterNode *sender = link->node;
@@ -3120,6 +3121,27 @@ int clusterProcessPacket(clusterLink *link) {
         return 1;
     }
 
+    if (type == CLUSTERMSG_TYPE_MEET && link->node && nodeInHandshake(link->node)) {
+        /* If the link is bound to a node and the node is in the handshake state, and we receive
+         * a MEET packet, it may be that the sender sent multiple MEET packets, so here we drop the
+         * MEET to avoid the assert in setClusterNodeToInboundClusterLink. The assert will happen
+         * when the other node sends a MEET packet because it detects that there is no inbound
+         * link: this node creates a new node in HANDSHAKE state (with a random node name) and
+         * responds with a PONG. The other node receives the PONG and removes the CLUSTER_NODE_MEET
+         * flag. This node is supposed to open an outbound connection to the other node in the next
+         * cron cycle, but before this happens, the other node re-sends a MEET on the same link
+         * because it still detects no inbound connection. We improved the re-send logic of MEET in
+         * #1441; now we only re-send the MEET packet once every handshake timeout period.
+         *
+         * Note that in getNodeFromLinkAndMsg, the node in the handshake state has a random name
+         * and is not truly "known", so we don't know the sender. Dropping the MEET packet prevents
+         * us from creating a random node, avoids incorrect link binding, and avoids a duplicate
+         * MEET packet eliminating the handshake state. */
+        serverLog(LL_NOTICE, "Dropping MEET packet from node %.40s because the node is already in handshake state",
+                  link->node->name);
+        return 1;
+    }
+
     uint16_t flags = ntohs(hdr->flags);
     uint64_t sender_claimed_current_epoch = 0, sender_claimed_config_epoch = 0;
     clusterNode *sender = getNodeFromLinkAndMsg(link, hdr);

From 980a80115908d19c97b241bb9d62ec4f4fb18b2d Mon Sep 17 00:00:00 2001
From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:01:34 -0800
Subject: [PATCH 51/73] Fix the secret for the test bucket. (#1447)

We have set the secret as `AWS_S3_TEST_BUCKET` for the test bucket and
I missed it in the initial review.
Signed-off-by: Roshan Khatri
---
 .github/workflows/build-release-packages.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml
index 6c54971bcd..3f1ca2627b 100644
--- a/.github/workflows/build-release-packages.yml
+++ b/.github/workflows/build-release-packages.yml
@@ -59,8 +59,10 @@ jobs:
         id: check-if-testing
         run: |
           if [[ "${{ github.event_name }}" == "push" ]]; then
+            echo "This is a test workflow -> We will upload to the Test S3 Bucket"
             echo "IS_TEST=true" >> $GITHUB_OUTPUT
           else
+            echo "This is a Release workflow -> We will upload to the Release S3 Bucket"
             echo "IS_TEST=false" >> $GITHUB_OUTPUT
           fi
         shell: bash
@@ -92,7 +94,7 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.x86_64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

   release-build-linux-arm-packages:
@@ -106,5 +108,5 @@ jobs:
       build_matrix: ${{ needs.generate-build-matrix.outputs.arm64-build-matrix }}
       region: us-west-2
     secrets:
-      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
+      bucket_name: ${{ needs.release-build-get-meta.outputs.is_test == 'true' && secrets.AWS_S3_TEST_BUCKET || secrets.AWS_S3_BUCKET }}
       role_to_assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}

From 7892bf808b6f748684af7defa0c0a8611cc4be50 Mon Sep 17 00:00:00 2001
From: xbasel <103044017+xbasel@users.noreply.github.com>
Date: Tue, 17 Dec 2024 18:04:27 +0200
Subject: [PATCH 52/73] Fix test_reclaimFilePageCache to avoid tmpfs (#1379)

Avoid tmpfs as fadvise(FADV_DONTNEED) has no effect on memory-backed
filesystems.

Fixes https://github.com/valkey-io/valkey/issues/897

---------

Signed-off-by: Ran Shidlansik
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Ran Shidlansik
---
 src/unit/test_util.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/unit/test_util.c b/src/unit/test_util.c
index 4558c38c3b..9858318e06 100644
--- a/src/unit/test_util.c
+++ b/src/unit/test_util.c
@@ -6,6 +6,11 @@
 #include "../util.h"
 #include "test_help.h"
 
+#if defined(__linux__)
+#include <sys/statfs.h>  /* statfs() */
+#include <linux/magic.h> /* TMPFS_MAGIC */
+#endif
+
 int test_string2ll(int argc, char **argv, int flags) {
     UNUSED(argc);
     UNUSED(argv);
@@ -291,6 +296,15 @@ int test_reclaimFilePageCache(int argc, char **argv, int flags) {
     if (flags & UNIT_TEST_VALGRIND) return 0;
 
 #if defined(__linux__)
+    struct statfs stats;
+
+    /* Check if /tmp is memory-backed (e.g., tmpfs) */
+    if (statfs("/tmp", &stats) == 0) {
+        if (stats.f_type == TMPFS_MAGIC) { // Memory-backed, fadvise can't reclaim; skip
+            return 0;
+        }
+    }
+
     char *tmpfile = "/tmp/redis-reclaim-cache-test";
     int fd = open(tmpfile, O_RDWR | O_CREAT, 0644);
     TEST_ASSERT(fd >= 0);

From ba25b586d5744e315864790e6920a26830a54c09 Mon Sep 17 00:00:00 2001
From: ranshid <88133677+ranshid@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:07:55 +0200
Subject: [PATCH 53/73] Introduce FORCE_DEFRAG compilation option to allow
 activedefrag to run when the allocator is not jemalloc (#1303)

Introduce a compile-time option to force activedefrag to run even when
jemalloc is not used as the allocator.
This is in order to be able to run tests with defrag enabled while using memory instrumentation tools. fixes: https://github.com/valkey-io/valkey/issues/1241 --------- Signed-off-by: ranshid Signed-off-by: Ran Shidlansik Signed-off-by: Madelyn Olson Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com> Co-authored-by: Madelyn Olson --- .github/workflows/daily.yml | 46 +++++++++++++++++++++++++++++ CMakeLists.txt | 1 + deps/CMakeLists.txt | 4 ++- src/CMakeLists.txt | 6 ++++ src/Makefile | 5 ++++ src/allocator_defrag.c | 59 ++++++++++++++++++++++++++++++++++--- src/allocator_defrag.h | 10 ++++--- src/config.c | 2 +- src/defrag.c | 28 ------------------ src/server.h | 5 ++++ tests/support/server.tcl | 5 ++++ tests/test_helper.tcl | 4 +++ tests/unit/info.tcl | 2 +- 13 files changed, 138 insertions(+), 39 deletions(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index c06d73440d..44386f5ffd 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -689,6 +689,52 @@ jobs: if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate + test-sanitizer-force-defrag: + runs-on: ubuntu-latest + if: | + (github.event_name == 'workflow_dispatch' || + (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || + (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + !contains(github.event.inputs.skipjobs, 'sanitizer') + timeout-minutes: 14400 + strategy: + fail-fast: false + steps: + - name: prep + if: github.event_name == 'workflow_dispatch' + run: | + echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV + echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV + echo "skipjobs: ${{github.event.inputs.skipjobs}}" + echo "skiptests: ${{github.event.inputs.skiptests}}" + echo "test_args: ${{github.event.inputs.test_args}}" + echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}" + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: ${{ env.GITHUB_REPOSITORY }} + ref: ${{ env.GITHUB_HEAD_REF }} + - name: make + run: make all-with-unit-tests OPT=-O3 SANITIZER=address DEBUG_FORCE_DEFRAG=yes USE_JEMALLOC=no SERVER_CFLAGS='-Werror' + - name: testprep + run: | + sudo apt-get update + sudo apt-get install tcl8.6 tclx -y + - name: test + if: true && !contains(github.event.inputs.skiptests, 'valkey') + run: ./runtest --accurate --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: module api test + if: true && !contains(github.event.inputs.skiptests, 'modules') + run: CFLAGS='-Werror' ./runtest-moduleapi --verbose --dump-logs ${{github.event.inputs.test_args}} + - name: sentinel tests + if: true && !contains(github.event.inputs.skiptests, 'sentinel') + run: ./runtest-sentinel ${{github.event.inputs.cluster_test_args}} + - name: cluster tests + if: true && !contains(github.event.inputs.skiptests, 'cluster') + run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} + - name: unittest + if: true && !contains(github.event.inputs.skiptests, 'unittest') + run: ./src/valkey-unit-tests + test-rpm-distros-jemalloc: if: | (github.event_name == 'workflow_dispatch' || diff --git a/CMakeLists.txt b/CMakeLists.txt index 77d0c4e7d8..55b18cb994 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,3 +41,4 @@ unset(BUILD_UNIT_TESTS CACHE) unset(BUILD_TEST_MODULES CACHE) unset(BUILD_EXAMPLE_MODULES CACHE) unset(USE_TLS CACHE) 
+unset(DEBUG_FORCE_DEFRAG CACHE)
diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt
index c904b94031..3f5b04dc22 100644
--- a/deps/CMakeLists.txt
+++ b/deps/CMakeLists.txt
@@ -1,4 +1,6 @@
-add_subdirectory(jemalloc)
+if (USE_JEMALLOC)
+    add_subdirectory(jemalloc)
+endif ()
 add_subdirectory(lua)

 # Set hiredis options. We need to disable the defaults set in the OPTION(..) we do this by setting them in the CACHE
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b87dff3db0..90d7e25cf4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,6 +22,12 @@ if (VALKEY_RELEASE_BUILD)
     set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif ()

+if (DEBUG_FORCE_DEFRAG)
+    message(STATUS "Forcing Active Defrag run on valkey-server")
+    target_compile_definitions(valkey-server PRIVATE DEBUG_FORCE_DEFRAG)
+    target_compile_definitions(valkey-server PRIVATE HAVE_DEFRAG)
+endif ()
+
 if (BUILD_SANITIZER)
     # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input)
     # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS'
diff --git a/src/Makefile b/src/Makefile
index 8552deb3d9..e52f4f08d3 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -130,6 +130,11 @@ ifdef REDIS_LDFLAGS
     SERVER_LDFLAGS := $(REDIS_LDFLAGS)
 endif

+# Special case of forcing defrag to run even though we have no Jemalloc support
+ifeq ($(DEBUG_FORCE_DEFRAG), yes)
+    SERVER_CFLAGS +=-DHAVE_DEFRAG -DDEBUG_FORCE_DEFRAG
+endif
+
 FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS)
 FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG)
 FINAL_LIBS=-lm
diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c
index b2330c95e0..5e805b3044 100644
--- a/src/allocator_defrag.c
+++ b/src/allocator_defrag.c
@@ -43,12 +43,10 @@
 * the other component to ensure both are using the same allocator configuration.
 */

-#include
+#include "server.h"
 #include "serverassert.h"
 #include "allocator_defrag.h"

-#define UNUSED(x) (void)(x)
-
 #if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC)

 #define STRINGIFY_(x) #x
@@ -402,8 +400,56 @@ int allocatorShouldDefrag(void *ptr) {
                                             je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0));
 }

-#else
+/* Utility function to get the fragmentation ratio from jemalloc.
+ * It is critical to do that by comparing only heap maps that belong to
+ * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this
+ * fragmentation ratio in order to decide if a defrag action should be taken
+ * or not, a false detection can cause the defragmenter to waste a lot of CPU
+ * without the possibility of getting any results. */
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    size_t resident, active, allocated, frag_smallbins_bytes;
+    zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL);
+    frag_smallbins_bytes = allocatorDefragGetFragSmallbins();
+    /* Calculate the fragmentation ratio as the proportion of wasted memory in small
+     * bins (which are defraggable) relative to the total allocated memory (including large bins).
+     * This is because otherwise, if most of the memory usage is large bins, we may show high percentage,
+     * despite the fact it's not a lot of memory for the user.
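+     * (Worked example with illustrative numbers rather than measured ones:
+     * if allocated is 100 MiB and frag_smallbins_bytes is 12 MiB, frag_pct
+     * comes out as 12.0; if resident is 130 MiB, rss_pct is 30.0 and
+     * rss_bytes is 30 MiB.)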
     */
+    float frag_pct = (float)frag_smallbins_bytes / allocated * 100;
+    float rss_pct = ((float)resident / allocated) * 100 - 100;
+    size_t rss_bytes = resident - allocated;
+    if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes;
+    serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)",
+              allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes);
+    return frag_pct;
+}

+#elif defined(DEBUG_FORCE_DEFRAG)
+int allocatorDefragInit(void) {
+    return 0;
+}

+void allocatorDefragFree(void *ptr, size_t size) {
+    UNUSED(size);
+    zfree(ptr);
+}

+__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) {
+    return zmalloc(size);
+}

+unsigned long allocatorDefragGetFragSmallbins(void) {
+    return 0;
+}
+
+int allocatorShouldDefrag(void *ptr) {
+    UNUSED(ptr);
+    return 1;
+}
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    *out_frag_bytes = server.active_defrag_ignore_bytes + 1;
+    return server.active_defrag_threshold_upper;
+}
+
+#else
 int allocatorDefragInit(void) {
     return -1;
 }
@@ -423,4 +469,9 @@ int allocatorShouldDefrag(void *ptr) {
     UNUSED(ptr);
     return 0;
 }
+
+float getAllocatorFragmentation(size_t *out_frag_bytes) {
+    UNUSED(out_frag_bytes);
+    return 0;
+}
 #endif
diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h
index 7fb56208b6..7947bef72c 100644
--- a/src/allocator_defrag.h
+++ b/src/allocator_defrag.h
@@ -5,10 +5,11 @@
 #include <jemalloc/jemalloc.h>
 /* We can enable the server defrag capabilities only if we are using Jemalloc
 * and the version that has the experimental.utilization namespace in mallctl. */
-#if defined(JEMALLOC_VERSION_MAJOR) && \
-    (JEMALLOC_VERSION_MAJOR > 5 || \
-     (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \
-     (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))
+#if (defined(JEMALLOC_VERSION_MAJOR) && \
+     (JEMALLOC_VERSION_MAJOR > 5 || \
+      (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \
+      (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1))) || \
+    defined(DEBUG_FORCE_DEFRAG)
 #define HAVE_DEFRAG
 #endif
 #endif
@@ -18,5 +19,6 @@ void allocatorDefragFree(void *ptr, size_t size);
 __attribute__((malloc)) void *allocatorDefragAlloc(size_t size);
 unsigned long allocatorDefragGetFragSmallbins(void);
 int allocatorShouldDefrag(void *ptr);
+float getAllocatorFragmentation(size_t *out_frag_bytes);

 #endif /* __ALLOCATOR_DEFRAG_H */
diff --git a/src/config.c b/src/config.c
index cc0f8d2dd8..e1cee3f95b 100644
--- a/src/config.c
+++ b/src/config.c
@@ -3186,7 +3186,7 @@ standardConfig static_configs[] = {
     createBoolConfig("replica-read-only", "slave-read-only", DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_replica_ro, 1, NULL, NULL),
     createBoolConfig("replica-ignore-maxmemory", "slave-ignore-maxmemory", MODIFIABLE_CONFIG, server.repl_replica_ignore_maxmemory, 1, NULL, NULL),
     createBoolConfig("jemalloc-bg-thread", NULL, MODIFIABLE_CONFIG, server.jemalloc_bg_thread, 1, NULL, updateJemallocBgThread),
-    createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, 0, isValidActiveDefrag, NULL),
+    createBoolConfig("activedefrag", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.active_defrag_enabled, CONFIG_ACTIVE_DEFRAG_DEFAULT, isValidActiveDefrag, NULL),
     createBoolConfig("syslog-enabled", NULL, IMMUTABLE_CONFIG, server.syslog_enabled, 0, NULL, NULL),
     createBoolConfig("cluster-enabled", NULL, IMMUTABLE_CONFIG,
server.cluster_enabled, 0, NULL, NULL), createBoolConfig("appendonly", NULL, MODIFIABLE_CONFIG | DENY_LOADING_CONFIG, server.aof_enabled, 0, NULL, updateAppendonly), diff --git a/src/defrag.c b/src/defrag.c index 8e7fc8449e..6522d9aa7b 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -149,11 +149,6 @@ static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper static list *defrag_later; static unsigned long defrag_later_cursor; - -/* this method was added to jemalloc in order to help us understand which - * pointers are worthwhile moving and which aren't */ -int je_get_defrag_hint(void *ptr); - /* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. * It is the responsibility of the caller to free the old block if a non-NULL value (new block) * is returned. (Returns NULL if no relocation was needed.) @@ -824,29 +819,6 @@ static void dbKeysScanCallback(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Utility function to get the fragmentation ratio from jemalloc. - * It is critical to do that by comparing only heap maps that belong to - * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this - * fragmentation ratio in order to decide if a defrag action should be taken - * or not, a false detection can cause the defragmenter to waste a lot of CPU - * without the possibility of getting any results. */ -static float getAllocatorFragmentation(size_t *out_frag_bytes) { - size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); - frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); - /* Calculate the fragmentation ratio as the proportion of wasted memory in small - * bins (which are defraggable) relative to the total allocated memory (including large bins). - * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, - * despite the fact it's not a lot of memory for the user. */ - float frag_pct = (float)frag_smallbins_bytes / allocated * 100; - float rss_pct = ((float)resident / allocated) * 100 - 100; - size_t rss_bytes = resident - allocated; - if (out_frag_bytes) *out_frag_bytes = frag_smallbins_bytes; - serverLog(LL_DEBUG, "allocated=%zu, active=%zu, resident=%zu, frag=%.2f%% (%.2f%% rss), frag_bytes=%zu (%zu rss)", - allocated, active, resident, frag_pct, rss_pct, frag_smallbins_bytes, rss_bytes); - return frag_pct; -} - /* Defrag scan callback for a pubsub channels hashtable. */ static void defragPubsubScanCallback(void *privdata, void *elemref) { defragPubSubCtx *ctx = privdata; diff --git a/src/server.h b/src/server.h index dc4d2e8808..1aafcaeb57 100644 --- a/src/server.h +++ b/src/server.h @@ -148,6 +148,11 @@ struct hdr_histogram; #define DEFAULT_WAIT_BEFORE_RDB_CLIENT_FREE 60 /* Grace period in seconds for replica main \ * channel to establish psync. */ #define LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT 100 /* Default: 0.1 seconds */ +#if !defined(DEBUG_FORCE_DEFRAG) +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 0 +#else +#define CONFIG_ACTIVE_DEFRAG_DEFAULT 1 +#endif /* Bucket sizes for client eviction pools. Each bucket stores clients with * memory usage of up to twice the size of the bucket below it. 
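 * (Illustrative example, not the actual constants: if the smallest bucket
 * covers clients using up to 16 KiB, the next covers up to 32 KiB, the one
 * after up to 64 KiB, and so on.)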
*/ diff --git a/tests/support/server.tcl b/tests/support/server.tcl index 7257339042..8c545d900a 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -221,6 +221,11 @@ proc tags_acceptable {tags err_return} { return 0 } + if {$::debug_defrag && [lsearch $tags "debug_defrag:skip"] >= 0} { + set err "Not supported on server compiled with DEBUG_FORCE_DEFRAG option" + return 0 + } + if {$::singledb && [lsearch $tags "singledb:skip"] >= 0} { set err "Not supported on singledb" return 0 diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1f0658071a..8a4125e48d 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -92,6 +92,7 @@ set ::large_memory 0 set ::log_req_res 0 set ::force_resp3 0 set ::solo_tests_count 0 +set ::debug_defrag 0 # Set to 1 when we are running in client mode. The server test uses a # server-client model to run tests simultaneously. The server instance @@ -607,6 +608,7 @@ proc print_help_screen {} { "--ignore-encoding Don't validate object encoding." "--ignore-digest Don't use debug digest validations." "--large-memory Run tests using over 100mb." + "--debug-defrag Indicate the test is running against server compiled with DEBUG_FORCE_DEFRAG option" "--help Print this help screen." } "\n"] } @@ -748,6 +750,8 @@ for {set j 0} {$j < [llength $argv]} {incr j} { set ::ignoreencoding 1 } elseif {$opt eq {--ignore-digest}} { set ::ignoredigest 1 + } elseif {$opt eq {--debug-defrag}} { + set ::debug_defrag 1 } elseif {$opt eq {--help}} { print_help_screen exit 0 diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index e50faba62b..a27043fa88 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -10,7 +10,7 @@ proc latency_percentiles_usec {cmd} { return [latencyrstat_percentiles $cmd r] } -start_server {tags {"info" "external:skip"}} { +start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { start_server {} { test {latencystats: disable/enable} { From b66698b8870771eadc46fae8d9ef7027aec50dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 18 Dec 2024 02:45:43 +0100 Subject: [PATCH 54/73] Discover and fix new typos (#1446) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Upgrade `typos` and fix corresponding typos --------- Signed-off-by: Viktor Szépe --- .github/workflows/spell-check.yml | 2 +- src/geohash_helper.c | 2 +- src/server.c | 4 +- src/server.h | 2 +- src/zmalloc.c | 2 +- tests/integration/aof-multi-part.tcl | 84 +++++++++---------- tests/integration/aof.tcl | 4 +- .../integration/dual-channel-replication.tcl | 12 +-- tests/support/aofmanifest.tcl | 4 +- tests/support/test.tcl | 4 +- 10 files changed, 60 insertions(+), 60 deletions(-) diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml index 69d9b9cb6a..14db670b24 100644 --- a/.github/workflows/spell-check.yml +++ b/.github/workflows/spell-check.yml @@ -26,7 +26,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Install typos - uses: taiki-e/install-action@cd5df4de2e75f3b819ba55f780f7bb8cd4a05a41 # v2.32.2 + uses: taiki-e/install-action@fe9759bf4432218c779595708e80a1aadc85cedc # v2.46.10 with: tool: typos diff --git a/src/geohash_helper.c b/src/geohash_helper.c index aa4b4743a6..c05c2f2634 100644 --- a/src/geohash_helper.c +++ b/src/geohash_helper.c @@ -48,7 +48,7 @@ /// @brief The usual PI/180 constant const double DEG_TO_RAD = 0.017453292519943295769236907684886; -/// @brief Earth's quatratic mean radius for WGS-84 +/// 
@brief Earth's quadratic mean radius for WGS-84 const double EARTH_RADIUS_IN_METERS = 6372797.560856; const double MERCATOR_MAX = 20037726.37; diff --git a/src/server.c b/src/server.c index 9bd7bdd4a4..5275fed4b9 100644 --- a/src/server.c +++ b/src/server.c @@ -1702,7 +1702,7 @@ static void sendGetackToReplicas(void) { robj *argv[3]; argv[0] = shared.replconf; argv[1] = shared.getack; - argv[2] = shared.special_asterick; /* Not used argument. */ + argv[2] = shared.special_asterisk; /* Not used argument. */ replicationFeedReplicas(-1, argv, 3); } @@ -2088,7 +2088,7 @@ void createSharedObjects(void) { shared.load = createStringObject("LOAD", 4); shared.createconsumer = createStringObject("CREATECONSUMER", 14); shared.getack = createStringObject("GETACK", 6); - shared.special_asterick = createStringObject("*", 1); + shared.special_asterisk = createStringObject("*", 1); shared.special_equals = createStringObject("=", 1); shared.redacted = makeObjectShared(createStringObject("(redacted)", 10)); diff --git a/src/server.h b/src/server.h index 1aafcaeb57..783871b856 100644 --- a/src/server.h +++ b/src/server.h @@ -1444,7 +1444,7 @@ struct sharedObjectsStruct { *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim, *script, *replconf, *eval, *persist, *set, *pexpireat, *pexpire, *time, *pxat, *absttl, *retrycount, *force, *justid, *entriesread, *lastid, *ping, *setid, *keepttl, *load, *createconsumer, *getack, - *special_asterick, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, + *special_asterisk, *special_equals, *default_username, *redacted, *ssubscribebulk, *sunsubscribebulk, *smessagebulk, *select[PROTO_SHARED_SELECT_CMDS], *integers[OBJ_SHARED_INTEGERS], *mbulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "*\r\n" */ *bulkhdr[OBJ_SHARED_BULKHDR_LEN], /* "$\r\n" */ diff --git a/src/zmalloc.c b/src/zmalloc.c index b1de4f2af1..3abf9a31a0 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -762,7 +762,7 @@ void zlibc_trim(void) { /* For proc_pidinfo() used later in zmalloc_get_smap_bytes_by_field(). * Note that this file cannot be included in zmalloc.h because it includes * a Darwin queue.h file where there is a "LIST_HEAD" macro (!) defined - * conficting with user code. */ + * conflicting with user code. 
 */
#include <libproc.h>
#endif
diff --git a/tests/integration/aof-multi-part.tcl b/tests/integration/aof-multi-part.tcl
index 5c4f24b7d4..9a23031c08 100644
--- a/tests/integration/aof-multi-part.tcl
+++ b/tests/integration/aof-multi-part.tcl
@@ -4,11 +4,11 @@
 set server_path [tmpdir server.multi.aof]
 set aof_dirname "appendonlydir"
 set aof_basename "appendonly.aof"
 set aof_dirpath "$server_path/$aof_dirname"
-set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix"
-set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_sufix$::aof_format_suffix"
-set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix"
-set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_sufix$::aof_format_suffix"
-set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_sufix$::aof_format_suffix"
+set aof_base1_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix"
+set aof_base2_file "$server_path/$aof_dirname/${aof_basename}.2$::base_aof_suffix$::aof_format_suffix"
+set aof_incr1_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix"
+set aof_incr2_file "$server_path/$aof_dirname/${aof_basename}.2$::incr_aof_suffix$::aof_format_suffix"
+set aof_incr3_file "$server_path/$aof_dirname/${aof_basename}.3$::incr_aof_suffix$::aof_format_suffix"
 set aof_manifest_file "$server_path/$aof_dirname/${aof_basename}$::manifest_suffix"
 set aof_old_name_old_path "$server_path/$aof_basename"
 set aof_old_name_new_path "$aof_dirpath/$aof_basename"
@@ -705,7 +705,7 @@ tags {"external:skip"} {
         set client [valkey [srv host] [srv port] 0 $::tls]
         wait_done_loading $client

-        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"]
+        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"]

         assert_aof_manifest_content $aof_manifest_file {
             {file appendonly.aof.1.base.rdb seq 1 type b}
@@ -728,7 +728,7 @@ tags {"external:skip"} {
         set client [valkey [srv host] [srv port] 0 $::tls]
         wait_done_loading $client

-        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"]
+        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"]

         assert_aof_manifest_content $aof_manifest_file {
             {file appendonly.aof.1.base.aof seq 1 type b}
@@ -750,7 +750,7 @@ tags {"external:skip"} {
     start_server_aof [list dir $server_path aof-use-rdb-preamble no] {
         wait_done_loading r

-        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::aof_format_suffix}"]
+        assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::aof_format_suffix}"]

         assert_aof_manifest_content $aof_manifest_file {
             {file appendonly.aof.1.base.aof seq 1 type b}
@@ -827,8 +827,8 @@ tags {"external:skip"} {
            # Check we really have these files
            assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name]
-           assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"]
-           assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"]
+           assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"]
+           assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"]

            r
bgrewriteaof waitForBgrewriteaof r @@ -842,13 +842,13 @@ tags {"external:skip"} { assert_equal 1 [check_file_exist $aof_dirpath $aof_manifest_name] # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -901,11 +901,11 @@ tags {"external:skip"} { {file appendonly.aof.5.incr.aof seq 5 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] stop_write_load $load_handle0 wait_load_handlers_disconnected @@ -936,17 +936,17 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.2${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.3${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.4${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.5${::incr_aof_suffix}${::aof_format_suffix}"] == 0 
} else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.3${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] set d1 [r debug digest] r debug loadaof @@ -965,10 +965,10 @@ tags {"external:skip"} { {file appendonly.aof.4.base.rdb seq 4 type b} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.6${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.7${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -990,13 +990,13 @@ tags {"external:skip"} { # Wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_sufix}${::rdb_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.4${::base_aof_suffix}${::rdb_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] } test "AOF enable/disable auto gc" { @@ -1018,10 +1018,10 @@ tags {"external:skip"} { {file appendonly.aof.3.incr.aof seq 3 type i} } - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] r config set aof-disable-auto-gc no @@ -1033,10 +1033,10 @@ tags {"external:skip"} { # wait bio delete history wait_for_condition 1000 10 { - [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_sufix}${::rdb_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath 
"${aof_basename}.1${::incr_aof_sufix}${::aof_format_suffix}"] == 0 && - [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_sufix}${::aof_format_suffix}"] == 0 + [check_file_exist $aof_dirpath "${aof_basename}.5${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.6${::base_aof_suffix}${::rdb_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.1${::incr_aof_suffix}${::aof_format_suffix}"] == 0 && + [check_file_exist $aof_dirpath "${aof_basename}.2${::incr_aof_suffix}${::aof_format_suffix}"] == 0 } else { fail "Failed to delete history AOF" } @@ -1192,7 +1192,7 @@ tags {"external:skip"} { waitForBgrewriteaof r # Can create New INCR AOF - assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_sufix}${::aof_format_suffix}"] + assert_equal 1 [check_file_exist $aof_dirpath "${aof_basename}.10${::incr_aof_suffix}${::aof_format_suffix}"] assert_aof_manifest_content $aof_manifest_file { {file appendonly.aof.11.base.rdb seq 11 type b} @@ -1248,7 +1248,7 @@ tags {"external:skip"} { # Make sure manifest file is not created assert_equal 0 [check_file_exist $aof_dirpath $aof_manifest_name] # Make sure BASE AOF is not created - assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_sufix}${::rdb_format_suffix}"] + assert_equal 0 [check_file_exist $aof_dirpath "${aof_basename}.1${::base_aof_suffix}${::rdb_format_suffix}"] # Make sure the next AOFRW has started wait_for_condition 1000 50 { diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 33c7c12d4b..3a666bbd15 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -4,8 +4,8 @@ set server_path [tmpdir server.aof] set aof_dirname "appendonlydir" set aof_basename "appendonly.aof" set aof_dirpath "$server_path/$aof_dirname" -set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_sufix$::aof_format_suffix" -set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_sufix$::aof_format_suffix" +set aof_base_file "$server_path/$aof_dirname/${aof_basename}.1$::base_aof_suffix$::aof_format_suffix" +set aof_file "$server_path/$aof_dirname/${aof_basename}.1$::incr_aof_suffix$::aof_format_suffix" set aof_manifest_file "$server_path/$aof_dirname/$aof_basename$::manifest_suffix" tags {"aof external:skip"} { diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index e417dad6c9..8191b9f699 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -355,8 +355,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key1" - wait_for_value_to_propegate_to_replica $primary $replica2 "key1" + wait_for_value_to_propagate_to_replica $primary $replica1 "key1" + wait_for_value_to_propagate_to_replica $primary $replica2 "key1" assert {[s 0 total_forks] eq "1" } } @@ -374,8 +374,8 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica2 replicaof $primary_host $primary_port verify_replica_online $primary 0 500 verify_replica_online $primary 1 500 - wait_for_value_to_propegate_to_replica $primary $replica1 "key2" - wait_for_value_to_propegate_to_replica $primary $replica2 "key2" + wait_for_value_to_propagate_to_replica $primary $replica1 "key2" + wait_for_value_to_propagate_to_replica $primary $replica2 "key2" 
        wait_for_condition 50 1000 {
            [status $replica1 master_link_status] == "up"
        } else {
            fail "Replica is not synced"
        }
@@ -444,7 +444,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
        } else {
            fail "Replica is not synced"
        }
-       wait_for_value_to_propegate_to_replica $primary $replica1 "key3"
+       wait_for_value_to_propagate_to_replica $primary $replica1 "key3"

        # Verify that we did not use dual-channel-replication sync
        assert {[status $primary sync_partial_ok] == $cur_psync}
@@ -483,7 +483,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
        } else {
            fail "Replica is not synced"
        }
-       wait_for_value_to_propegate_to_replica $primary $replica "key1"
+       wait_for_value_to_propagate_to_replica $primary $replica "key1"
        # Confirm the occurrence of a race condition.
        wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1
    }
diff --git a/tests/support/aofmanifest.tcl b/tests/support/aofmanifest.tcl
index 308d1172aa..fc20bacc99 100644
--- a/tests/support/aofmanifest.tcl
+++ b/tests/support/aofmanifest.tcl
@@ -1,5 +1,5 @@
-set ::base_aof_sufix ".base"
-set ::incr_aof_sufix ".incr"
+set ::base_aof_suffix ".base"
+set ::incr_aof_suffix ".incr"
 set ::manifest_suffix ".manifest"
 set ::aof_format_suffix ".aof"
 set ::rdb_format_suffix ".rdb"
diff --git a/tests/support/test.tcl b/tests/support/test.tcl
index 262dc66041..3fd74d0387 100644
--- a/tests/support/test.tcl
+++ b/tests/support/test.tcl
@@ -160,12 +160,12 @@ proc verify_replica_online {master replica_idx max_retry} {
     }
 }

-proc wait_for_value_to_propegate_to_replica {master replica key} {
+proc wait_for_value_to_propagate_to_replica {master replica key} {
     set val [$master get $key]
     wait_for_condition 50 500 {
         ([$replica get $key] eq $val)
     } else {
-        error "Key $key did not propegate. Expected $val but got [$replica get $key]"
+        error "Key $key did not propagate. Expected $val but got [$replica get $key]"
     }
 }

From e203ca35b7f8c162ac9ff5bb55b76bdfc98d244d Mon Sep 17 00:00:00 2001
From: Madelyn Olson
Date: Tue, 17 Dec 2024 17:48:53 -0800
Subject: [PATCH 55/73] Fix undefined behavior detected by ASAN (#1451)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ASan now supports making sure you are passing in the correct pointer type,
which seems useful, but we can't enable it yet since we pass in an incorrect
pointer type in several places. This is most commonly done with generic free
functions, where we simply cast it to the correct type. It's not a lot of
code to clean up, so it seems appropriate to clean it up instead of disabling
the check.

---------

Signed-off-by: Madelyn Olson
Co-authored-by: Viktor Söderqvist
---
 src/acl.c                | 20 ++++++++++----------
 src/adlist.c             |  6 ++++++
 src/adlist.h             |  1 +
 src/call_reply.c         |  2 +-
 src/db.c                 |  2 +-
 src/defrag.c             |  2 +-
 src/eval.c               |  4 ++--
 src/functions.c          |  2 +-
 src/listpack.c           |  6 ++++++
 src/listpack.h           |  1 +
 src/module.c             |  2 +-
 src/networking.c         |  2 +-
 src/replication.c        |  2 +-
 src/t_stream.c           | 19 +++++++++++++++----
 src/unit/test_listpack.c |  2 +-
 src/unit/test_ziplist.c  |  2 +-
 16 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/src/acl.c b/src/acl.c
index cfcf102887..d1f970a805 100644
--- a/src/acl.c
+++ b/src/acl.c
@@ -297,11 +297,6 @@ int ACLListMatchSds(void *a, void *b) {
     return sdscmp(a, b) == 0;
 }

-/* Method to free list elements from ACL users password/patterns lists. */
-void ACLListFreeSds(void *item) {
-    sdsfree(item);
-}
-
 /* Method to duplicate list elements from ACL users password/patterns lists.
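 * (The matching free method is registered below as sdsfreeVoid rather than a
 * cast of sdsfree, since calling a function through a pointer of a mismatched
 * type is the undefined behavior this patch removes.)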
*/ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +369,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +440,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +484,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. */ @@ -2445,12 +2445,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. */ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. * diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/db.c b/src/db.c index 1223d00c8d..e31d7e7f7f 100644 --- a/src/db.c +++ b/src/db.c @@ -1193,7 +1193,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * are deep copied temporary strings. 
We must not free them if they are just * a shallow copy - a pointer to the actual data in the data structure */ if (!shallow_copied_list_items) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); + listSetFreeMethod(keys, sdsfreeVoid); } /* For main hash table scan or scannable data structure. */ diff --git a/src/defrag.c b/src/defrag.c index 6522d9aa7b..e9f40d4fab 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -421,7 +421,7 @@ static void activeDefragQuickListNodes(quicklist *ql) { static void defragLater(robj *obj) { if (!defrag_later) { defrag_later = listCreate(); - listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + listSetFreeMethod(defrag_later, sdsfreeVoid); defrag_later_cursor = 0; } sds key = sdsdup(objectGetKey(obj)); diff --git a/src/eval.c b/src/eval.c index a9c50cdf90..e9fac531f5 100644 --- a/src/eval.c +++ b/src/eval.c @@ -204,7 +204,7 @@ void scriptingInit(int setup) { * and we need to free them respectively. */ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); - listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); + listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -777,7 +777,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/functions.c b/src/functions.c index b694e35252..feb82d4ab7 100644 --- a/src/functions.c +++ b/src/functions.c @@ -348,7 +348,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if (!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryDispose); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); diff --git a/src/listpack.c b/src/listpack.c index 2dfb321f56..76c2f9ea38 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index 36283e2c73..541ae490ab 100644 --- a/src/module.c +++ b/src/module.c @@ -10399,7 +10399,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. 
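 * (The server-info rax below is now released via sdsfreeVoid, the same
 * wrapper pattern applied to the other generic free callbacks in this patch.)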
*/ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } diff --git a/src/networking.c b/src/networking.c index 4d386d6dc4..16147ff0ba 100644 --- a/src/networking.c +++ b/src/networking.c @@ -556,7 +556,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; diff --git a/src/replication.c b/src/replication.c index b5ce77f5e0..3a207a1d0f 100644 --- a/src/replication.c +++ b/src/replication.c @@ -282,7 +282,7 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void *))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { diff --git a/src/t_stream.c b/src/t_stream.c index 79aa080703..17254b58dd 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1 << 30) void streamFreeCG(streamCG *cg); +void streamFreeCGVoid(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, @@ -86,8 +87,8 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { - raxFreeWithCallback(s->rax, (void (*)(void *))lpFree); - if (s->cgroups) raxFreeWithCallback(s->cgroups, (void (*)(void *))streamFreeCG); + raxFreeWithCallback(s->rax, lpFreeVoid); + if (s->cgroups) raxFreeWithCallback(s->cgroups, streamFreeCGVoid); zfree(s); } @@ -2454,6 +2455,11 @@ void streamFreeConsumer(streamConsumer *sc) { zfree(sc); } +/* Used for generic free functions. */ +static void streamFreeConsumerVoid(void *sc) { + streamFreeConsumer((streamConsumer *)sc); +} + /* Create a new consumer group in the context of the stream 's', having the * specified name, last server ID and reads counter. If a consumer group with * the same name already exists NULL is returned, otherwise the pointer to the @@ -2473,11 +2479,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo /* Free a consumer group and all its associated data. */ void streamFreeCG(streamCG *cg) { - raxFreeWithCallback(cg->pel, (void (*)(void *))streamFreeNACK); - raxFreeWithCallback(cg->consumers, (void (*)(void *))streamFreeConsumer); + raxFreeWithCallback(cg->pel, zfree); + raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid); zfree(cg); } +/* Used for generic free functions. */ +void streamFreeCGVoid(void *cg) { + streamFreeCG((streamCG *)cg); +} + /* Lookup the consumer group in the specified stream and returns its * pointer, otherwise if there is no such group, NULL is returned. 
 */
streamCG *streamLookupCG(stream *s, sds groupname) {
diff --git a/src/unit/test_listpack.c b/src/unit/test_listpack.c
index 4838fc8952..0c71da18db 100644
--- a/src/unit/test_listpack.c
+++ b/src/unit/test_listpack.c
@@ -1184,7 +1184,7 @@ int test_listpackStressWithRandom(int argc, char **argv, int flags) {
     for (i = 0; i < iteration; i++) {
         lp = lpNew(0);
         ref = listCreate();
-        listSetFreeMethod(ref, (void (*)(void *))sdsfree);
+        listSetFreeMethod(ref, sdsfreeVoid);
         len = rand() % 256;

         /* Create lists */
diff --git a/src/unit/test_ziplist.c b/src/unit/test_ziplist.c
index d2f7ebe69c..58687d81fc 100644
--- a/src/unit/test_ziplist.c
+++ b/src/unit/test_ziplist.c
@@ -645,7 +645,7 @@ int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **arg
     for (i = 0; i < iteration; i++) {
         zl = ziplistNew();
         ref = listCreate();
-        listSetFreeMethod(ref, (void (*)(void *))sdsfree);
+        listSetFreeMethod(ref, sdsfreeVoid);
         len = rand() % 256;

         /* Create lists */

From 8060c86d2015ea9fdb0afcb4efc88fbf3951b78d Mon Sep 17 00:00:00 2001
From: uriyage <78144248+uriyage@users.noreply.github.com>
Date: Wed, 18 Dec 2024 09:03:30 +0200
Subject: [PATCH 56/73] Offload TLS negotiation to I/O threads (#1338)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## TLS Negotiation Offloading to I/O Threads

### Overview
This PR introduces the ability to offload TLS handshake negotiations to I/O
threads, significantly improving performance under high TLS connection loads.

### Key Changes
- Added infrastructure to offload TLS negotiations to I/O threads
- Refactored SSL event handling to allow I/O threads to modify conn flags.
- Introduced a new connection flag to identify client connections

### Performance Impact
Testing with 650 clients with SET commands and 160 new TLS connections per
second in the background:

#### Throughput Impact of new TLS connections
- **With Offloading**: Minimal impact (1050K → 990K ops/sec)
- **Without Offloading**: Significant drop (1050K → 670K ops/sec)

#### New Connection Rate
- **With Offloading**: 1,757 conn/sec
- **Without Offloading**: 477 conn/sec

### Implementation Details
1. **Main Thread**:
   - Initiates negotiation-offload jobs to I/O threads
   - Adds connections to pending-read clients list (using existing read
     offload mechanism)
   - Post-negotiation handling:
     - Creates read/write events if needed for incomplete negotiations
     - Calls accept handler for completed negotiations
2. **I/O Thread**:
   - Performs TLS negotiation
   - Updates connection flags based on negotiation result

Related issue: https://github.com/valkey-io/valkey/issues/761

---------

Signed-off-by: Uri Yagelnik
Signed-off-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
Co-authored-by: Madelyn Olson
---
 .github/workflows/daily.yml |  38 ++++++++++
 src/connection.h            |   5 +-
 src/io_threads.c            |  52 ++++++++++++++
 src/io_threads.h            |   1 +
 src/networking.c            |   6 ++
 src/server.c                |   2 +
 src/server.h                |   1 +
 src/tls.c                   | 139 ++++++++++++++++++------------------
 8 files changed, 174 insertions(+), 70 deletions(-)

diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml
index 44386f5ffd..e1d577b51b 100644
--- a/.github/workflows/daily.yml
+++ b/.github/workflows/daily.yml
@@ -375,6 +375,44 @@ jobs:
         if: true && !contains(github.event.inputs.skiptests, 'cluster')
         run: ./runtest-cluster --io-threads ${{github.event.inputs.cluster_test_args}}

+  test-ubuntu-tls-io-threads:
+    runs-on: ubuntu-latest
+    if: |
+      (github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') ||
+      (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) &&
+      !contains(github.event.inputs.skipjobs, 'tls') && !contains(github.event.inputs.skipjobs, 'iothreads')
+    timeout-minutes: 14400
+    steps:
+      - name: prep
+        if: github.event_name == 'workflow_dispatch'
+        run: |
+          echo "GITHUB_REPOSITORY=${{github.event.inputs.use_repo}}" >> $GITHUB_ENV
+          echo "GITHUB_HEAD_REF=${{github.event.inputs.use_git_ref}}" >> $GITHUB_ENV
+          echo "skipjobs: ${{github.event.inputs.skipjobs}}"
+          echo "skiptests: ${{github.event.inputs.skiptests}}"
+          echo "test_args: ${{github.event.inputs.test_args}}"
+          echo "cluster_test_args: ${{github.event.inputs.cluster_test_args}}"
+      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          repository: ${{ env.GITHUB_REPOSITORY }}
+          ref: ${{ env.GITHUB_HEAD_REF }}
+      - name: make
+        run: |
+          make BUILD_TLS=yes SERVER_CFLAGS='-Werror'
+      - name: testprep
+        run: |
+          sudo apt-get install tcl8.6 tclx tcl-tls
+          ./utils/gen-test-certs.sh
+      - name: test
+        if: true && !contains(github.event.inputs.skiptests, 'valkey')
+        run: |
+          ./runtest --io-threads --tls --accurate --verbose --tags network --dump-logs ${{github.event.inputs.test_args}}
+      - name: cluster tests
+        if: true && !contains(github.event.inputs.skiptests, 'cluster')
+        run: |
+          ./runtest-cluster --io-threads --tls ${{github.event.inputs.cluster_test_args}}
+
   test-ubuntu-reclaim-cache:
     runs-on: ubuntu-latest
     if: |
diff --git a/src/connection.h b/src/connection.h
index 8a2775ee34..fd7e0910cf 100644
--- a/src/connection.h
+++ b/src/connection.h
@@ -54,8 +54,9 @@ typedef enum {
     CONN_STATE_ERROR
 } ConnectionState;

-#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0) /* Closed scheduled by a handler */
-#define CONN_FLAG_WRITE_BARRIER (1 << 1)   /* Write barrier requested */
+#define CONN_FLAG_CLOSE_SCHEDULED (1 << 0)      /* Closed scheduled by a handler */
+#define CONN_FLAG_WRITE_BARRIER (1 << 1)        /* Write barrier requested */
+#define CONN_FLAG_ALLOW_ACCEPT_OFFLOAD (1 << 2) /* Connection accept can be offloaded to IO threads.
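 * Set in createClient() and required by trySendAcceptToIOThreads() before an
 * accept (currently the TLS handshake) is offloaded to an I/O thread.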
*/ #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" diff --git a/src/io_threads.c b/src/io_threads.c index 3865eb77c3..90f5b88700 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -561,3 +561,55 @@ void trySendPollJobToIOThreads(void) { aeSetPollProtect(server.el, 1); IOJobQueue_push(jq, IOThreadPoll, server.el); } + +static void ioThreadAccept(void *data) { + client *c = (client *)data; + connAccept(c->conn, NULL); + c->io_read_state = CLIENT_COMPLETED_IO; +} + +/* + * Attempts to offload an Accept operation (currently used for TLS accept) for a client + * connection to I/O threads. + * + * Returns: + * C_OK - If the accept operation was successfully queued for processing + * C_ERR - If the connection is not eligible for offloading + * + * Parameters: + * conn - The connection object to perform the accept operation on + */ +int trySendAcceptToIOThreads(connection *conn) { + if (server.io_threads_num <= 1) { + return C_ERR; + } + + if (!(conn->flags & CONN_FLAG_ALLOW_ACCEPT_OFFLOAD)) { + return C_ERR; + } + + client *c = connGetPrivateData(conn); + if (c->io_read_state != CLIENT_IDLE) { + return C_OK; + } + + if (server.active_io_threads_num <= 1) { + return C_ERR; + } + + size_t thread_id = (c->id % (server.active_io_threads_num - 1)) + 1; + IOJobQueue *job_queue = &io_jobs[thread_id]; + + if (IOJobQueue_isFull(job_queue)) { + return C_ERR; + } + + c->io_read_state = CLIENT_PENDING_IO; + c->flag.pending_read = 1; + listLinkNodeTail(server.clients_pending_io_read, &c->pending_read_list_node); + connSetPostponeUpdateState(c->conn, 1); + server.stat_io_accept_offloaded++; + IOJobQueue_push(job_queue, ioThreadAccept, c); + + return C_OK; +} diff --git a/src/io_threads.h b/src/io_threads.h index 8818f08588..a3ff582a77 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -13,5 +13,6 @@ int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); +int trySendAcceptToIOThreads(connection *conn); #endif /* IO_THREADS_H */ diff --git a/src/networking.c b/src/networking.c index 16147ff0ba..9f36f24275 100644 --- a/src/networking.c +++ b/src/networking.c @@ -134,6 +134,7 @@ client *createClient(connection *conn) { if (server.tcpkeepalive) connKeepAlive(conn, server.tcpkeepalive); connSetReadHandler(conn, readQueryFromClient); connSetPrivateData(conn, c); + conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD; } c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size); selectDb(c, 0); @@ -4805,9 +4806,14 @@ int processIOThreadsReadDone(void) { processed++; server.stat_io_reads_processed++; + /* Save the current conn state, as connUpdateState may modify it */ + int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING); connSetPostponeUpdateState(c->conn, 0); connUpdateState(c->conn); + /* In accept state, no client's data was read - stop here. */ + if (in_accept_state) continue; + /* On read error - stop here. 
*/ if (handleReadResult(c) == C_ERR) { continue; diff --git a/src/server.c b/src/server.c index 5275fed4b9..3cdec9fa9b 100644 --- a/src/server.c +++ b/src/server.c @@ -2645,6 +2645,7 @@ void resetServerStats(void) { server.stat_total_reads_processed = 0; server.stat_io_writes_processed = 0; server.stat_io_freed_objects = 0; + server.stat_io_accept_offloaded = 0; server.stat_poll_processed_by_io_threads = 0; server.stat_total_writes_processed = 0; server.stat_client_qbuf_limit_disconnections = 0; @@ -5915,6 +5916,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "io_threaded_reads_processed:%lld\r\n", server.stat_io_reads_processed, "io_threaded_writes_processed:%lld\r\n", server.stat_io_writes_processed, "io_threaded_freed_objects:%lld\r\n", server.stat_io_freed_objects, + "io_threaded_accept_processed:%lld\r\n", server.stat_io_accept_offloaded, "io_threaded_poll_processed:%lld\r\n", server.stat_poll_processed_by_io_threads, "io_threaded_total_prefetch_batches:%lld\r\n", server.stat_total_prefetch_batches, "io_threaded_total_prefetch_entries:%lld\r\n", server.stat_total_prefetch_entries, diff --git a/src/server.h b/src/server.h index 783871b856..841db70614 100644 --- a/src/server.h +++ b/src/server.h @@ -1869,6 +1869,7 @@ struct valkeyServer { long long stat_io_reads_processed; /* Number of read events processed by IO threads */ long long stat_io_writes_processed; /* Number of write events processed by IO threads */ long long stat_io_freed_objects; /* Number of objects freed by IO threads */ + long long stat_io_accept_offloaded; /* Number of offloaded accepts */ long long stat_poll_processed_by_io_threads; /* Total number of poll jobs processed by IO */ long long stat_total_reads_processed; /* Total number of read events processed */ long long stat_total_writes_processed; /* Total number of write events processed */ diff --git a/src/tls.c b/src/tls.c index 48b75553de..11e6143561 100644 --- a/src/tls.c +++ b/src/tls.c @@ -32,6 +32,7 @@ #include "server.h" #include "connhelpers.h" #include "adlist.h" +#include "io_threads.h" #if (USE_OPENSSL == 1 /* BUILD_YES */) || ((USE_OPENSSL == 2 /* BUILD_MODULE */) && (BUILD_TLS_MODULE == 2)) @@ -437,16 +438,13 @@ static ConnectionType CT_TLS; * */ -typedef enum { - WANT_READ = 1, - WANT_WRITE -} WantIOType; - #define TLS_CONN_FLAG_READ_WANT_WRITE (1 << 0) #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) #define TLS_CONN_FLAG_HAS_PENDING (1 << 4) +#define TLS_CONN_FLAG_ACCEPT_ERROR (1 << 5) +#define TLS_CONN_FLAG_ACCEPT_SUCCESS (1 << 6) typedef struct tls_connection { connection c; @@ -514,20 +512,26 @@ static connection *connCreateAcceptedTLS(int fd, void *priv) { return (connection *)conn; } +static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler); static void tlsEventHandler(struct aeEventLoop *el, int fd, void *clientData, int mask); static void updateSSLEvent(tls_connection *conn); +static void clearTLSWantFlags(tls_connection *conn) { + conn->flags &= ~(TLS_CONN_FLAG_WRITE_WANT_READ | TLS_CONN_FLAG_READ_WANT_WRITE); +} + /* Process the return code received from OpenSSL> - * Update the want parameter with expected I/O. + * Update the conn flags with the WANT_READ/WANT_WRITE flags. * Update the connection's error state if a real error has occurred. * Returns an SSL error code, or 0 if no further handling is required. 
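 * (The recorded TLS_CONN_FLAG_*_WANT_* flags are what allows this to run on an
 * I/O thread: the thread only updates connection flags, and the main thread
 * later turns them into read/write events via registerSSLEvent() and
 * updateSSLEvent().)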
*/ -static int handleSSLReturnCode(tls_connection *conn, int ret_value, WantIOType *want) { +static int handleSSLReturnCode(tls_connection *conn, int ret_value) { + clearTLSWantFlags(conn); if (ret_value <= 0) { int ssl_err = SSL_get_error(conn->ssl, ret_value); switch (ssl_err) { - case SSL_ERROR_WANT_WRITE: *want = WANT_WRITE; return 0; - case SSL_ERROR_WANT_READ: *want = WANT_READ; return 0; + case SSL_ERROR_WANT_WRITE: conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; return 0; + case SSL_ERROR_WANT_READ: conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; return 0; case SSL_ERROR_SYSCALL: conn->c.last_errno = errno; if (conn->ssl_error) zfree(conn->ssl_error); @@ -563,11 +567,8 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update } if (ret_value <= 0) { - WantIOType want = 0; int ssl_err; - if (!(ssl_err = handleSSLReturnCode(conn, ret_value, &want))) { - if (want == WANT_READ) conn->flags |= TLS_CONN_FLAG_WRITE_WANT_READ; - if (want == WANT_WRITE) conn->flags |= TLS_CONN_FLAG_READ_WANT_WRITE; + if (!(ssl_err = handleSSLReturnCode(conn, ret_value))) { if (update_event) updateSSLEvent(conn); errno = EAGAIN; return -1; @@ -585,19 +586,17 @@ static int updateStateAfterSSLIO(tls_connection *conn, int ret_value, int update return ret_value; } -static void registerSSLEvent(tls_connection *conn, WantIOType want) { +static void registerSSLEvent(tls_connection *conn) { int mask = aeGetFileEvents(server.el, conn->c.fd); - switch (want) { - case WANT_READ: + if (conn->flags & TLS_CONN_FLAG_WRITE_WANT_READ) { if (mask & AE_WRITABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); if (!(mask & AE_READABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_READABLE, tlsEventHandler, conn); - break; - case WANT_WRITE: + } else if (conn->flags & TLS_CONN_FLAG_READ_WANT_WRITE) { if (mask & AE_READABLE) aeDeleteFileEvent(server.el, conn->c.fd, AE_READABLE); if (!(mask & AE_WRITABLE)) aeCreateFileEvent(server.el, conn->c.fd, AE_WRITABLE, tlsEventHandler, conn); - break; - default: serverAssert(0); break; + } else { + serverAssert(0); } } @@ -650,12 +649,47 @@ static void updateSSLEvent(tls_connection *conn) { if (!need_write && (mask & AE_WRITABLE)) aeDeleteFileEvent(server.el, conn->c.fd, AE_WRITABLE); } +static int TLSHandleAcceptResult(tls_connection *conn, int call_handler_on_error) { + serverAssert(conn->c.state == CONN_STATE_ACCEPTING); + if (conn->flags & TLS_CONN_FLAG_ACCEPT_SUCCESS) { + conn->c.state = CONN_STATE_CONNECTED; + } else if (conn->flags & TLS_CONN_FLAG_ACCEPT_ERROR) { + conn->c.state = CONN_STATE_ERROR; + if (!call_handler_on_error) return C_ERR; + } else { + /* Still pending accept */ + registerSSLEvent(conn); + return C_OK; + } + + /* call accept handler */ + if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_ERR; + conn->c.conn_handler = NULL; + return C_OK; +} + static void updateSSLState(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; + + if (conn->c.state == CONN_STATE_ACCEPTING) { + if (TLSHandleAcceptResult(conn, 1) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; + } + updateSSLEvent(conn); updatePendingData(conn); } +static void TLSAccept(void *_conn) { + tls_connection *conn = (tls_connection *)_conn; + ERR_clear_error(); + int ret = SSL_accept(conn->ssl); + if (ret > 0) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_SUCCESS; + } else if (handleSSLReturnCode(conn, ret)) { + conn->flags |= TLS_CONN_FLAG_ACCEPT_ERROR; + } +} + static void tlsHandleEvent(tls_connection *conn, int mask) { int ret, conn_error; @@ 
-676,10 +710,8 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { } ret = SSL_connect(conn->ssl); if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); - + if (!handleSSLReturnCode(conn, ret)) { + registerSSLEvent(conn); /* Avoid hitting UpdateSSLEvent, which knows nothing * of what SSL_connect() wants and instead looks at our * R/W handlers. @@ -698,27 +730,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: - ERR_clear_error(); - ret = SSL_accept(conn->ssl); - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - /* Avoid hitting UpdateSSLEvent, which knows nothing - * of what SSL_connect() wants and instead looks at our - * R/W handlers. - */ - registerSSLEvent(conn, want); - return; - } - - /* If not handled, it's an error */ - conn->c.state = CONN_STATE_ERROR; - } else { - conn->c.state = CONN_STATE_CONNECTED; - } - - if (!callHandler((connection *)conn, conn->c.conn_handler)) return; - conn->c.conn_handler = NULL; + if (connTLSAccept((connection *)conn, NULL) == C_ERR || conn->c.state != CONN_STATE_CONNECTED) return; break; case CONN_STATE_CONNECTED: { int call_read = ((mask & AE_READABLE) && conn->c.read_handler) || @@ -740,20 +752,17 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { int invert = conn->c.flags & CONN_FLAG_WRITE_BARRIER; if (!invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } /* Fire the writable event. */ if (call_write) { - conn->flags &= ~TLS_CONN_FLAG_WRITE_WANT_READ; if (!callHandler((connection *)conn, conn->c.write_handler)) return; } /* If we have to invert the call, fire the readable event now * after the writable one. 
*/ if (invert && call_read) { - conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } updatePendingData(conn); @@ -845,31 +854,25 @@ static void connTLSClose(connection *conn_) { static int connTLSAccept(connection *_conn, ConnectionCallbackFunc accept_handler) { tls_connection *conn = (tls_connection *)_conn; - int ret; - if (conn->c.state != CONN_STATE_ACCEPTING) return C_ERR; - ERR_clear_error(); - + int call_handler_on_error = 1; /* Try to accept */ - conn->c.conn_handler = accept_handler; - ret = SSL_accept(conn->ssl); - - if (ret <= 0) { - WantIOType want = 0; - if (!handleSSLReturnCode(conn, ret, &want)) { - registerSSLEvent(conn, want); /* We'll fire back */ - return C_OK; - } else { - conn->c.state = CONN_STATE_ERROR; - return C_ERR; - } + if (accept_handler) { + conn->c.conn_handler = accept_handler; + call_handler_on_error = 0; } - conn->c.state = CONN_STATE_CONNECTED; - if (!callHandler((connection *)conn, conn->c.conn_handler)) return C_OK; - conn->c.conn_handler = NULL; + /* We're in IO thread - just call accept and return, the main thread will handle the rest */ + if (!inMainThread()) { + TLSAccept(conn); + return C_OK; + } - return C_OK; + /* Try to offload accept to IO threads */ + if (trySendAcceptToIOThreads(_conn) == C_OK) return C_OK; + + TLSAccept(conn); + return TLSHandleAcceptResult(conn, call_handler_on_error); } static int connTLSConnect(connection *conn_, From 60197b30e266842fc84eb7ee1eead2d27d87e62f Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 09:17:11 -0800 Subject: [PATCH 57/73] Attempt to read secondary error from info test (#1452) The test attempts to write 1MB of data in order to trigger a disconnect. Normally, the data is fully flushed and we get the error on the read (I/O error). However, it's possible we might fail the write, which leaves the client in an inconsistent state. On the next command, we finally process the I/O error on the FD. So, the simple fix is to consume any secondary errors. --------- Signed-off-by: Madelyn Olson --- tests/unit/info.tcl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index a27043fa88..11dc4e5d40 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -391,7 +391,13 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { # set qbuf limit to minimum to test stat set org_qbuf_limit [lindex [r config get client-query-buffer-limit] 1] r config set client-query-buffer-limit 1048576 - catch {r set key [string repeat a 1048576]} + catch {r set key [string repeat a 2048576]} e + # We might get an error on the write path of the previous command, which won't be + # an I/O error based on how the client is designed. We will need to manually consume + # the secondary I/O error. + if {![string match "I/O error*" $e]} { + catch {r read} + } set info [r info stats] assert_equal [getInfoProperty $info client_query_buffer_limit_disconnections] {1} r config set client-query-buffer-limit $org_qbuf_limit From 079f4edf2d7aabd98bd37ffaac608b54dea62b6a Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 18 Dec 2024 22:18:02 -0800 Subject: [PATCH 58/73] Add a hint about the current file for TCL debugging (#1459) There are some tests that fail and give no useful information since they are outside of a test context. Now we will at least get the file we are located in. We can sort of reverse engineer where we are in the test by seeing which tests have finished in a file. 
```
[TIMEOUT]: clients state report follows.
sock6 => (SPAWNED SERVER) pid:30375 - tests/unit/info.tcl
Killing still running Valkey server 30375 - tests/unit/info.tcl
```

Signed-off-by: Madelyn Olson 
---
 tests/support/server.tcl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/support/server.tcl b/tests/support/server.tcl
index 8c545d900a..bd3135e9d9 100644
--- a/tests/support/server.tcl
+++ b/tests/support/server.tcl
@@ -314,7 +314,7 @@ proc spawn_server {config_file stdout stderr args} {
     }
 
     # Tell the test server about this new instance.
-    send_data_packet $::test_server_fd server-spawned $pid
+    send_data_packet $::test_server_fd server-spawned "$pid - $::curfile"
     return $pid
 }

From 97029953a094a2ed27382bd9fed3d55c784834d0 Mon Sep 17 00:00:00 2001
From: Binbin 
Date: Thu, 19 Dec 2024 16:12:34 +0800
Subject: [PATCH 59/73] Minor log fixes when failover auth denied due to slot
 epoch (#1341)

The old reqEpoch mainly refers to requestCurrentEpoch, see:
```
if (requestCurrentEpoch < server.cluster->currentEpoch) {
    serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", node->name,
              node->human_nodename, (unsigned long long)requestCurrentEpoch,
              (unsigned long long)server.cluster->currentEpoch);
    return;
}
```

And here we refer to requestConfigEpoch, which is a bit misleading, so
change the label to reqConfigEpoch to make it clear.

Signed-off-by: Binbin 
---
 src/cluster_legacy.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9ddcf6678d..bbf63d46b9 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4430,7 +4430,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
          * by the replica requesting our vote. Refuse to vote for this replica. */
         serverLog(LL_WARNING,
                   "Failover auth denied to %.40s (%s): "
-                  "slot %d epoch (%llu) > reqEpoch (%llu)",
+                  "slot %d epoch (%llu) > reqConfigEpoch (%llu)",
                   node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch,
                   (unsigned long long)requestConfigEpoch);
         return;
@@ -4721,8 +4721,8 @@ void clusterHandleReplicaFailover(void) {
     if (server.cluster->failover_auth_sent == 0) {
         server.cluster->currentEpoch++;
         server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
-        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.",
-                  (unsigned long long)server.cluster->currentEpoch);
+        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu",
+                  (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
         clusterRequestFailoverAuth();
         server.cluster->failover_auth_sent = 1;
         clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);

From e9a1fe0b320c2f1f262ffa2200321348c08f8849 Mon Sep 17 00:00:00 2001
From: Jungwoo Song <37579681+bluayer@users.noreply.github.com>
Date: Fri, 20 Dec 2024 01:32:31 +0900
Subject: [PATCH 60/73] Support for reading from replicas in valkey-benchmark
 (#1392)

**Background**
When conducting performance tests using `valkey-benchmark`, reading from
replicas was not supported. Consequently, even in cluster mode, all reads
were directed to the primary nodes. This limitation made it challenging
to obtain accurate metrics during workload stress testing for performance
measurement or before a version upgrade.

Related issue: https://github.com/valkey-io/valkey/issues/900

**Changes**
1. 
Replaced the use of `CLUSTER NODES` with `CLUSTER SLOTS` when fetching
cluster configuration. This allows for easier identification of replica
slots.
2. Support for reading from replicas by executing the client in
`READONLY` mode.
3. Support reading from replicas even during slot migrations.
4. Introduced a new CLI option `--rfr` to enable reading from replicas
only or from all cluster nodes. A warning was added to indicate that
write requests might not be handled correctly when using this option.

---------

Signed-off-by: bluayer 
Signed-off-by: bluayer 
Signed-off-by: Jungwoo Song <37579681+bluayer@users.noreply.github.com>
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
---
 src/valkey-benchmark.c | 354 +++++++++++++++++++----------------------
 1 file changed, 168 insertions(+), 186 deletions(-)

diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c
index 57cdd6fc16..1924203ae7 100644
--- a/src/valkey-benchmark.c
+++ b/src/valkey-benchmark.c
@@ -77,6 +77,13 @@ struct benchmarkThread;
 struct clusterNode;
 struct serverConfig;
 
+/* Read from replica options */
+typedef enum readFromReplica {
+    FROM_PRIMARY_ONLY = 0, /* default option */
+    FROM_REPLICA_ONLY,
+    FROM_ALL
+} readFromReplica;
+
 static struct config {
     aeEventLoop *el;
     cliConnInfo conn_info;
@@ -112,6 +119,7 @@ static struct config {
     int num_threads;
     struct benchmarkThread **threads;
     int cluster_mode;
+    readFromReplica read_from_replica;
     int cluster_node_count;
     struct clusterNode **cluster_nodes;
     struct serverConfig *redis_config;
@@ -168,12 +176,6 @@ typedef struct clusterNode {
     int *updated_slots;      /* Used by updateClusterSlotsConfiguration */
     int updated_slots_count; /* Used by updateClusterSlotsConfiguration */
     int replicas_count;
-    sds *migrating; /* An array of sds where even strings are slots and odd
-                     * strings are the destination node IDs. */
-    sds *importing; /* An array of sds where even strings are slots and odd
-                     * strings are the source node IDs. */
-    int migrating_count; /* Length of the migrating array (migrating slots*2) */
-    int importing_count; /* Length of the importing array (importing slots*2) */
     struct serverConfig *redis_config;
 } clusterNode;
 
@@ -228,6 +230,15 @@ static int dictSdsKeyCompare(const void *key1, const void *key2) {
     return memcmp(key1, key2, l1) == 0;
 }
 
+static dictType dtype = {
+    dictSdsHash,       /* hash function */
+    NULL,              /* key dup */
+    dictSdsKeyCompare, /* key compare */
+    NULL,              /* key destructor */
+    NULL,              /* val destructor */
+    NULL               /* allow to expand */
+};
+
 static redisContext *getRedisContext(const char *ip, int port, const char *hostsocket) {
     redisContext *ctx = NULL;
     redisReply *reply = NULL;
@@ -710,6 +721,15 @@ static client createClient(char *cmd, size_t len, client from, int thread_id) {
         c->prefix_pending++;
     }
 
+    if (config.cluster_mode && (config.read_from_replica == FROM_REPLICA_ONLY || config.read_from_replica == FROM_ALL)) {
+        char *buf = NULL;
+        int len;
+        len = redisFormatCommand(&buf, "READONLY");
+        c->obuf = sdscatlen(c->obuf, buf, len);
+        free(buf);
+        c->prefix_pending++;
+    }
+
     c->prefixlen = sdslen(c->obuf);
     /* Append the request itself. 
*/ if (from) { @@ -835,7 +855,15 @@ static void showLatencyReport(void) { printf(" %d bytes payload\n", config.datasize); printf(" keep alive: %d\n", config.keepalive); if (config.cluster_mode) { - printf(" cluster mode: yes (%d primaries)\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf(" cluster mode: yes (%d %s)\n", config.cluster_node_count, node_roles); int m; for (m = 0; m < config.cluster_node_count; m++) { clusterNode *node = config.cluster_nodes[m]; @@ -1009,26 +1037,13 @@ static clusterNode *createClusterNode(char *ip, int port) { node->slots_count = 0; node->updated_slots = NULL; node->updated_slots_count = 0; - node->migrating = NULL; - node->importing = NULL; - node->migrating_count = 0; - node->importing_count = 0; node->redis_config = NULL; return node; } static void freeClusterNode(clusterNode *node) { - int i; if (node->name) sdsfree(node->name); if (node->replicate) sdsfree(node->replicate); - if (node->migrating != NULL) { - for (i = 0; i < node->migrating_count; i++) sdsfree(node->migrating[i]); - zfree(node->migrating); - } - if (node->importing != NULL) { - for (i = 0; i < node->importing_count; i++) sdsfree(node->importing[i]); - zfree(node->importing); - } /* If the node is not the reference node, that uses the address from * config.conn_info.hostip and config.conn_info.hostport, then the node ip has been * allocated by fetchClusterConfiguration, so it must be freed. */ @@ -1056,157 +1071,85 @@ static clusterNode **addClusterNode(clusterNode *node) { return config.cluster_nodes; } -/* TODO: This should be refactored to use CLUSTER SLOTS, the migrating/importing - * information is anyway not used. 
- */ static int fetchClusterConfiguration(void) { int success = 1; redisContext *ctx = NULL; redisReply *reply = NULL; + dict *nodes = NULL; + const char *errmsg = "Failed to fetch cluster configuration"; + size_t i, j; ctx = getRedisContext(config.conn_info.hostip, config.conn_info.hostport, config.hostsocket); if (ctx == NULL) { exit(1); } - clusterNode *firstNode = createClusterNode((char *)config.conn_info.hostip, config.conn_info.hostport); - if (!firstNode) { + + reply = redisCommand(ctx, "CLUSTER SLOTS"); + if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { success = 0; + if (reply) fprintf(stderr, "%s\nCLUSTER SLOTS ERROR: %s\n", errmsg, reply->str); goto cleanup; } - reply = redisCommand(ctx, "CLUSTER NODES"); - success = (reply != NULL); - if (!success) goto cleanup; - success = (reply->type != REDIS_REPLY_ERROR); - if (!success) { - if (config.hostsocket == NULL) { - fprintf(stderr, "Cluster node %s:%d replied with error:\n%s\n", config.conn_info.hostip, - config.conn_info.hostport, reply->str); - } else { - fprintf(stderr, "Cluster node %s replied with error:\n%s\n", config.hostsocket, reply->str); - } - goto cleanup; - } - char *lines = reply->str, *p, *line; - while ((p = strstr(lines, "\n")) != NULL) { - *p = '\0'; - line = lines; - lines = p + 1; - char *name = NULL, *addr = NULL, *flags = NULL, *primary_id = NULL; - int i = 0; - while ((p = strchr(line, ' ')) != NULL) { - *p = '\0'; - char *token = line; - line = p + 1; - switch (i++) { - case 0: name = token; break; - case 1: addr = token; break; - case 2: flags = token; break; - case 3: primary_id = token; break; - } - if (i == 8) break; // Slots - } - if (!flags) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing flags.\n"); - success = 0; - goto cleanup; - } - int myself = (strstr(flags, "myself") != NULL); - int is_replica = (strstr(flags, "slave") != NULL || (primary_id != NULL && primary_id[0] != '-')); - if (is_replica) continue; - if (addr == NULL) { - fprintf(stderr, "Invalid CLUSTER NODES reply: missing addr.\n"); - success = 0; - goto cleanup; - } - clusterNode *node = NULL; - char *ip = NULL; - int port = 0; - char *paddr = strrchr(addr, ':'); - if (paddr != NULL) { - *paddr = '\0'; - ip = addr; - addr = paddr + 1; - /* If internal bus is specified, then just drop it. 
*/ - if ((paddr = strchr(addr, '@')) != NULL) *paddr = '\0'; - port = atoi(addr); - } - if (myself) { - node = firstNode; - if (ip != NULL && strcmp(node->ip, ip) != 0) { - node->ip = sdsnew(ip); - node->port = port; + assert(reply->type == REDIS_REPLY_ARRAY); + nodes = dictCreate(&dtype); + for (i = 0; i < reply->elements; i++) { + redisReply *r = reply->element[i]; + assert(r->type == REDIS_REPLY_ARRAY); + assert(r->elements >= 3); + int from = r->element[0]->integer; + int to = r->element[1]->integer; + sds primary = NULL; + for (j = 2; j < r->elements; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[0]->str != NULL); + assert(nr->element[2]->str != NULL); + + int is_primary = (j == 2); + if (is_primary) primary = sdsnew(nr->element[2]->str); + int is_cluster_option_only = (config.read_from_replica == FROM_PRIMARY_ONLY); + if ((config.read_from_replica == FROM_REPLICA_ONLY && is_primary) || (is_cluster_option_only && !is_primary)) continue; + + sds ip = sdsnew(nr->element[0]->str); + sds name = sdsnew(nr->element[2]->str); + int port = nr->element[1]->integer; + int slot_start = from; + int slot_end = to; + + clusterNode *node = NULL; + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + node = createClusterNode(sdsnew(ip), port); + if (node == NULL) { + success = 0; + goto cleanup; + } else { + node->name = name; + if (!is_primary) node->replicate = sdsdup(primary); + } + } else { + node = dictGetVal(entry); } - } else { - node = createClusterNode(sdsnew(ip), port); - } - if (node == NULL) { - success = 0; - goto cleanup; - } - if (name != NULL) node->name = sdsnew(name); - if (i == 8) { - int remaining = strlen(line); - while (remaining > 0) { - p = strchr(line, ' '); - if (p == NULL) p = line + remaining; - remaining -= (p - line); - - char *slotsdef = line; - *p = '\0'; - if (remaining) { - line = p + 1; - remaining--; - } else - line = p; - char *dash = NULL; - if (slotsdef[0] == '[') { - slotsdef++; - if ((p = strstr(slotsdef, "->-"))) { // Migrating - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds dst = sdsnew(p); - node->migrating_count += 2; - node->migrating = zrealloc(node->migrating, (node->migrating_count * sizeof(sds))); - node->migrating[node->migrating_count - 2] = slot; - node->migrating[node->migrating_count - 1] = dst; - } else if ((p = strstr(slotsdef, "-<-"))) { // Importing - *p = '\0'; - p += 3; - char *closing_bracket = strchr(p, ']'); - if (closing_bracket) *closing_bracket = '\0'; - sds slot = sdsnew(slotsdef); - sds src = sdsnew(p); - node->importing_count += 2; - node->importing = zrealloc(node->importing, (node->importing_count * sizeof(sds))); - node->importing[node->importing_count - 2] = slot; - node->importing[node->importing_count - 1] = src; - } - } else if ((dash = strchr(slotsdef, '-')) != NULL) { - p = dash; - int start, stop; - *p = '\0'; - start = atoi(slotsdef); - stop = atoi(p + 1); - while (start <= stop) { - int slot = start++; - node->slots[node->slots_count++] = slot; - } - } else if (p > slotsdef) { - int slot = atoi(slotsdef); + if (slot_start == slot_end) { + node->slots[node->slots_count++] = slot_start; + } else { + while (slot_start <= slot_end) { + int slot = slot_start++; node->slots[node->slots_count++] = slot; } } + if (node->slots_count == 0) { + fprintf(stderr, "WARNING: Node %s:%d has no slots, skipping...\n", node->ip, node->port); + 
continue; + } + if (entry == NULL) { + dictReplace(nodes, node->name, node); + if (!addClusterNode(node)) { + success = 0; + goto cleanup; + } + } } - if (node->slots_count == 0) { - fprintf(stderr, "WARNING: Primary node %s:%d has no slots, skipping...\n", node->ip, node->port); - continue; - } - if (!addClusterNode(node)) { - success = 0; - goto cleanup; - } + sdsfree(primary); } cleanup: if (ctx) redisFree(ctx); @@ -1214,6 +1157,7 @@ static int fetchClusterConfiguration(void) { if (config.cluster_nodes) freeClusterNodes(); } if (reply) freeReplyObject(reply); + if (nodes) dictRelease(nodes); return success; } @@ -1222,7 +1166,7 @@ static int fetchClusterConfiguration(void) { static int fetchClusterSlotsConfiguration(client c) { UNUSED(c); int success = 1, is_fetching_slots = 0, last_update = 0; - size_t i; + size_t i, j; last_update = atomic_load_explicit(&config.slots_last_update, memory_order_relaxed); if (c->slots_last_update < last_update) { @@ -1236,16 +1180,9 @@ static int fetchClusterSlotsConfiguration(client c) { atomic_store_explicit(&config.is_fetching_slots, 1, memory_order_relaxed); fprintf(stderr, "WARNING: Cluster slots configuration changed, fetching new one...\n"); const char *errmsg = "Failed to update cluster slots configuration"; - static dictType dtype = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* key destructor */ - NULL, /* val destructor */ - NULL /* allow to expand */ - }; + /* printf("[%d] fetchClusterSlotsConfiguration\n", c->thread_id); */ - dict *primaries = dictCreate(&dtype); + dict *nodes = dictCreate(&dtype); redisContext *ctx = NULL; for (i = 0; i < (size_t)config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1263,7 +1200,7 @@ static int fetchClusterSlotsConfiguration(client c) { if (node->updated_slots != NULL) zfree(node->updated_slots); node->updated_slots = NULL; node->updated_slots_count = 0; - dictReplace(primaries, node->name, node); + dictReplace(nodes, node->name, node); } reply = redisCommand(ctx, "CLUSTER SLOTS"); if (reply == NULL || reply->type == REDIS_REPLY_ERROR) { @@ -1279,30 +1216,44 @@ static int fetchClusterSlotsConfiguration(client c) { int from, to, slot; from = r->element[0]->integer; to = r->element[1]->integer; - redisReply *nr = r->element[2]; - assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); - assert(nr->element[2]->str != NULL); - sds name = sdsnew(nr->element[2]->str); - dictEntry *entry = dictFind(primaries, name); - if (entry == NULL) { - success = 0; - fprintf(stderr, - "%s: could not find node with ID %s in current " - "configuration.\n", - errmsg, name); - if (name) sdsfree(name); - goto cleanup; + size_t start, end; + if (config.read_from_replica == FROM_ALL) { + start = 2; + end = r->elements; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + start = 3; + end = r->elements; + } else { + start = 2; + end = 3; + } + + for (j = start; j < end; j++) { + redisReply *nr = r->element[j]; + assert(nr->type == REDIS_REPLY_ARRAY && nr->elements >= 3); + assert(nr->element[2]->str != NULL); + sds name = sdsnew(nr->element[2]->str); + dictEntry *entry = dictFind(nodes, name); + if (entry == NULL) { + success = 0; + fprintf(stderr, + "%s: could not find node with ID %s in current " + "configuration.\n", + errmsg, name); + if (name) sdsfree(name); + goto cleanup; + } + sdsfree(name); + clusterNode *node = dictGetVal(entry); + if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * 
sizeof(int)); + for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } - sdsfree(name); - clusterNode *node = dictGetVal(entry); - if (node->updated_slots == NULL) node->updated_slots = zcalloc(CLUSTER_SLOTS * sizeof(int)); - for (slot = from; slot <= to; slot++) node->updated_slots[node->updated_slots_count++] = slot; } updateClusterSlotsConfiguration(); cleanup: freeReplyObject(reply); redisFree(ctx); - dictRelease(primaries); + dictRelease(nodes); atomic_store_explicit(&config.is_fetching_slots, 0, memory_order_relaxed); return success; } @@ -1460,6 +1411,19 @@ int parseOptions(int argc, char **argv) { config.num_threads = 0; } else if (!strcmp(argv[i], "--cluster")) { config.cluster_mode = 1; + } else if (!strcmp(argv[i], "--rfr")) { + if (argv[++i]) { + if (!strcmp(argv[i], "all")) { + config.read_from_replica = FROM_ALL; + } else if (!strcmp(argv[i], "yes")) { + config.read_from_replica = FROM_REPLICA_ONLY; + } else if (!strcmp(argv[i], "no")) { + config.read_from_replica = FROM_PRIMARY_ONLY; + } else { + goto invalid; + } + } else + goto invalid; } else if (!strcmp(argv[i], "--enable-tracking")) { config.enable_tracking = 1; } else if (!strcmp(argv[i], "--help")) { @@ -1557,6 +1521,14 @@ int parseOptions(int argc, char **argv) { " If the command is supplied on the command line in cluster\n" " mode, the key must contain \"{tag}\". Otherwise, the\n" " command will not be sent to the right cluster node.\n" + " --rfr Enable read from replicas in cluster mode.\n" + " This command must be used with the --cluster option.\n" + " There are three modes for reading from replicas:\n" + " 'no' - sends read requests to primaries only (default) \n" + " 'yes' - sends read requests to replicas only.\n" + " 'all' - sends read requests to all nodes.\n" + " Since write commands will not be accepted by replicas,\n" + " it is recommended to enable read from replicas only for read command tests.\n" " --enable-tracking Send CLIENT TRACKING on before starting benchmark.\n" " -k 1=keep alive 0=reconnect (default 1)\n" " -r Use random keys for SET/GET/INCR, random values for SADD,\n" @@ -1698,6 +1670,7 @@ int main(int argc, char **argv) { config.num_threads = 0; config.threads = NULL; config.cluster_mode = 0; + config.read_from_replica = FROM_PRIMARY_ONLY; config.cluster_node_count = 0; config.cluster_nodes = NULL; config.redis_config = NULL; @@ -1742,7 +1715,15 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster: %d node(s).\n", config.cluster_node_count); exit(1); } - printf("Cluster has %d primary nodes:\n\n", config.cluster_node_count); + const char *node_roles = NULL; + if (config.read_from_replica == FROM_ALL) { + node_roles = "cluster"; + } else if (config.read_from_replica == FROM_REPLICA_ONLY) { + node_roles = "replica"; + } else { + node_roles = "primary"; + } + printf("Cluster has %d %s nodes:\n\n", config.cluster_node_count, node_roles); int i = 0; for (; i < config.cluster_node_count; i++) { clusterNode *node = config.cluster_nodes[i]; @@ -1750,7 +1731,8 @@ int main(int argc, char **argv) { fprintf(stderr, "Invalid cluster node #%d\n", i); exit(1); } - printf("Primary %d: ", i); + const char *node_type = (node->replicate == NULL ? 
"Primary" : "Replica"); + printf("Node %d(%s): ", i, node_type); if (node->name) printf("%s ", node->name); printf("%s:%d\n", node->ip, node->port); node->redis_config = getServerConfig(node->ip, node->port, NULL); From e48317eb347fd1202aa4f65cb533d6727092f8ef Mon Sep 17 00:00:00 2001 From: Roshan Khatri <117414976+roshkhatri@users.noreply.github.com> Date: Thu, 19 Dec 2024 12:32:40 -0800 Subject: [PATCH 61/73] Workflow changes to fix old release binaries (#1461) - Moves `build-config.json` to workflow dir to build old versions with new configs. - Enables contributors to test release Wf on private repo by adding `github.event_name == 'workflow_dispatch' ||` --------- Signed-off-by: Roshan Khatri --- .github/actions/generate-package-build-matrix/action.yml | 4 ++-- .../generate-package-build-matrix}/build-config.json | 0 .github/workflows/build-release-packages.yml | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) rename {utils/releasetools => .github/actions/generate-package-build-matrix}/build-config.json (100%) diff --git a/.github/actions/generate-package-build-matrix/action.yml b/.github/actions/generate-package-build-matrix/action.yml index 7e90f27be5..2494a71118 100644 --- a/.github/actions/generate-package-build-matrix/action.yml +++ b/.github/actions/generate-package-build-matrix/action.yml @@ -24,11 +24,11 @@ runs: - name: Get targets run: | - x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' utils/releasetools/build-config.json) + x86_arch=$(jq -c '[.linux_targets[] | select(.arch=="x86_64")]' .github/actions/generate-package-build-matrix/build-config.json) x86_matrix=$(echo "{ \"distro\" : $x86_arch }" | jq -c .) echo "X86_MATRIX=$x86_matrix" >> $GITHUB_ENV - arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' utils/releasetools/build-config.json) + arm_arch=$(jq -c '[.linux_targets[] | select(.arch=="arm64")]' .github/actions/generate-package-build-matrix/build-config.json) arm_matrix=$(echo "{ \"distro\" : $arm_arch }" | jq -c .) echo "ARM_MATRIX=$arm_matrix" >> $GITHUB_ENV shell: bash diff --git a/utils/releasetools/build-config.json b/.github/actions/generate-package-build-matrix/build-config.json similarity index 100% rename from utils/releasetools/build-config.json rename to .github/actions/generate-package-build-matrix/build-config.json diff --git a/.github/workflows/build-release-packages.yml b/.github/workflows/build-release-packages.yml index 3f1ca2627b..d7ab8e57d6 100644 --- a/.github/workflows/build-release-packages.yml +++ b/.github/workflows/build-release-packages.yml @@ -8,7 +8,7 @@ on: - '.github/workflows/build-release-packages.yml' - '.github/workflows/call-build-linux-arm-packages.yml' - '.github/workflows/call-build-linux-x86-packages.yml' - - 'utils/releasetools/build-config.json' + - '.github/actions/generate-package-build-matrix/build-config.json' workflow_dispatch: inputs: version: @@ -23,7 +23,7 @@ jobs: # This job provides the version metadata from the tag for the other jobs to use. 
release-build-get-meta:
     name: Get metadata to build
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       version: ${{ steps.get_version.outputs.VERSION }}
@@ -69,7 +69,7 @@ jobs:
 
   generate-build-matrix:
     name: Generating build matrix
-    if: github.repository == 'valkey-io/valkey'
+    if: github.event_name == 'workflow_dispatch' || github.repository == 'valkey-io/valkey'
     runs-on: ubuntu-latest
     outputs:
       x86_64-build-matrix: ${{ steps.set-matrix.outputs.x86_64-build-matrix }}

From ca0b0c662a991f84d8e11a0d433e06fdeae6980b Mon Sep 17 00:00:00 2001
From: Binbin 
Date: Fri, 20 Dec 2024 10:14:01 +0800
Subject: [PATCH 62/73] Clear outdated failure reports more accurately (#1184)

There are two changes here:

1. In clusterNodeCleanupFailureReports: only a primary with slots can
post a failure report, so if the reporting primary becomes a replica,
its failure report should be cleared. Keeping the stale report may lead
to inaccurate node failure judgments in some network partition cases,
and it also affects the CLUSTER COUNT-FAILURE-REPORTS command.

2. In clusterProcessGossipSection: this one is less important, but it
lets us print a "node is back online" log that helps us troubleshoot
problems, although it may conflict with change 1 at some points.

Signed-off-by: Binbin 
---
 src/cluster_legacy.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bbf63d46b9..876beef91f 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -1552,9 +1552,14 @@ int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
  * older than the global node timeout. Note that anyway for a node to be
  * flagged as FAIL we need to have a local PFAIL state that is at least
  * older than the global node timeout, so we don't just trust the number
- * of failure reports from other nodes. */
+ * of failure reports from other nodes.
+ *
+ * If the reporting node loses its voting right during this time, we will
+ * also clear its report. */
 void clusterNodeCleanupFailureReports(clusterNode *node) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -1564,7 +1569,11 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
     listRewind(l, &li);
     while ((ln = listNext(&li)) != NULL) {
         fr = ln->value;
-        if (now - fr->time > maxtime) listDelNode(l, ln);
+        if (now - fr->time > maxtime) {
+            listDelNode(l, ln);
+        } else if (!clusterNodeIsVotingPrimary(fr->node)) {
+            listDelNode(l, ln);
+        }
     }
 }
 
@@ -1581,6 +1590,8 @@ void clusterNodeCleanupFailureReports(clusterNode *node) {
  * Otherwise 0 is returned. */
 int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
     list *l = node->fail_reports;
+    if (!listLength(l)) return 0;
+
     listNode *ln;
     listIter li;
     clusterNodeFailReport *fr;
@@ -2254,10 +2265,11 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
         /* Ignore gossips about self. */
         if (node && node != myself) {
             /* We already know this node.
-               Handle failure reports, only when the sender is a voting primary. */
-            if (sender && clusterNodeIsVotingPrimary(sender)) {
+             * Handle failure reports, the report is added only if the sender is a voting primary,
+             * and deletion of a failure report is not restricted. 
*/
+            if (sender) {
                 if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
-                    if (clusterNodeAddFailureReport(node, sender)) {
+                    if (clusterNodeIsVotingPrimary(sender) && clusterNodeAddFailureReport(node, sender)) {
                         serverLog(LL_NOTICE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.", sender->name,
                                   sender->human_nodename, node->name, node->human_nodename);
                     }

From ffef236dbbfd26383262fa222e869814f5608ce5 Mon Sep 17 00:00:00 2001
From: Madelyn Olson 
Date: Thu, 19 Dec 2024 18:14:56 -0800
Subject: [PATCH 63/73] Fix storing the wrong PID in active servers (#1464)

In #1459, I missed that the data was also used to keep track of the PID
files, so if the testing framework crashed it would no longer be able to
clean up the extra servers. So now we properly extract the PID and store
it so we can clean up PIDs.

Signed-off-by: Madelyn Olson 
---
 tests/test_helper.tcl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index 8a4125e48d..54bb923674 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -421,7 +421,8 @@ proc read_from_test_client fd {
     } elseif {$status eq {server-spawning}} {
         set ::active_clients_task($fd) "(SPAWNING SERVER) $data"
     } elseif {$status eq {server-spawned}} {
-        lappend ::active_servers $data
+        set pid [string trim [lindex [split $data "-"] 0]]
+        lappend ::active_servers $pid
         set ::active_clients_task($fd) "(SPAWNED SERVER) pid:$data"
     } elseif {$status eq {server-killing}} {
         set ::active_clients_task($fd) "(KILLING SERVER) pid:$data"

From b56f4f70d2cae11988f2f330b8060e21d78b163b Mon Sep 17 00:00:00 2001
From: Madelyn Olson 
Date: Thu, 19 Dec 2024 18:16:46 -0800
Subject: [PATCH 64/73] Update info.tcl test to revert client output limits
 sooner (#1462)

We set the client output buffer limits to 10 bytes and then execute
`info stats`, which produces more than 10 bytes of output, which can
cause that command to throw an error. I'm not sure why it wasn't
consistently erroring before; it might have been some change related to
the ubuntu upgrade. Issues related to ubuntu-tls are hopefully resolved
now. 
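To make the failure mode concrete, here is a minimal sketch of the old,
problematic ordering (illustrative only, not part of the test suite; it
assumes the usual test client `r` and the saved `$org_outbuf_limit`):

```tcl
# Illustrative sketch (assumed setup): while the 10-byte hard limit is
# still active, the INFO reply itself can exceed the output buffer limit
# and disconnect the test client before the assertion runs.
r config set client-output-buffer-limit "normal 10 0 0"
r set key [string repeat a 100000]   ;# big pending reply
catch {r get key}                    ;# the disconnect the test wants to count
catch {r info stats} err             ;# INFO reply may also blow the 10-byte limit
r config set client-output-buffer-limit $org_outbuf_limit
```

Reverting the limit before calling INFO removes that second failure mode.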
Signed-off-by: Madelyn Olson 
---
 tests/unit/info.tcl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl
index 11dc4e5d40..3295c5e31a 100644
--- a/tests/unit/info.tcl
+++ b/tests/unit/info.tcl
@@ -406,10 +406,10 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} {
             r config set client-output-buffer-limit "normal 10 0 0"
             r set key [string repeat a 100000] ;# to trigger output buffer limit check this needs to be big
             catch {r get key}
+            r config set client-output-buffer-limit $org_outbuf_limit
             set info [r info stats]
             assert_equal [getInfoProperty $info client_output_buffer_limit_disconnections] {1}
-            r config set client-output-buffer-limit $org_outbuf_limit
-        } {OK} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres
+        } {} {logreqres:skip} ;# same as obuf-limits.tcl, skip logreqres
 
     test {clients: pubsub clients} {
         set info [r info clients]

From 1c97317518e40efb4c271ae3b0656cc6e43f0110 Mon Sep 17 00:00:00 2001
From: Madelyn Olson 
Date: Fri, 20 Dec 2024 12:10:48 -0800
Subject: [PATCH 65/73] Resolve bounds checks on cluster_legacy.c (#1463)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are getting a number of errors like:
```
array subscript ‘clusterMsg[0]’ is partly outside array bounds of ‘unsigned char[2272]’
```

Which is basically GCC telling us that we have an object which is longer
than the underlying storage of the allocation. We actually do this a lot,
but GCC is generally not aware of how big the underlying allocation is,
so it doesn't throw this error. We are specifically getting this error
because the msgBlock can be of variable length depending on the type of
message, but GCC assumes it's the longest one possible.

The solution I went with here was to make the message type optional, so
that it wasn't included in the size. I think this also makes some sense,
since it's really just a helper for us to easily cast the object around.

I considered disabling this error, but it is generally pretty useful
since it can catch real issues. Another solution would be to
over-allocate to the largest possible object, which could hurt
performance as we initialize it to zero.

Results: https://github.com/madolson/valkey/actions/runs/12423414811/job/34686899884

This is a slightly cleaned up version of
https://github.com/valkey-io/valkey/pull/1439. I thought I had another
strategy but alas, it didn't work out.

Signed-off-by: Madelyn Olson 
---
 src/cluster_legacy.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 876beef91f..9a23527b30 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -424,9 +424,19 @@ typedef struct {
     union {
         clusterMsg msg;
         clusterMsgLight msg_light;
-    };
+    } data[];
 } clusterMsgSendBlock;
 
+/* Helper function to extract a light message from a send block. */
+static clusterMsgLight *getLightMessageFromSendBlock(clusterMsgSendBlock *msgblock) {
+    return &msgblock->data[0].msg_light;
+}
+
+/* Helper function to extract a normal message from a send block. 
*/ +static clusterMsg *getMessageFromSendBlock(clusterMsgSendBlock *msgblock) { + return &msgblock->data[0].msg; +} + /* ----------------------------------------------------------------------------- * Initialization * -------------------------------------------------------------------------- */ @@ -1288,15 +1298,15 @@ void clusterReset(int hard) { * CLUSTER communication link * -------------------------------------------------------------------------- */ clusterMsgSendBlock *createClusterMsgSendBlock(int type, uint32_t msglen) { - uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, msg); + uint32_t blocklen = msglen + offsetof(clusterMsgSendBlock, data); clusterMsgSendBlock *msgblock = zcalloc(blocklen); msgblock->refcount = 1; msgblock->totlen = blocklen; server.stat_cluster_links_memory += blocklen; if (IS_LIGHT_MESSAGE(type)) { - clusterBuildMessageHdrLight(&msgblock->msg_light, type, msglen); + clusterBuildMessageHdrLight(getLightMessageFromSendBlock(msgblock), type, msglen); } else { - clusterBuildMessageHdr(&msgblock->msg, type, msglen); + clusterBuildMessageHdr(getMessageFromSendBlock(msgblock), type, msglen); } return msgblock; } @@ -3668,7 +3678,7 @@ void clusterWriteHandler(connection *conn) { while (totwritten < NET_MAX_WRITES_PER_EVENT && listLength(link->send_msg_queue) > 0) { listNode *head = listFirst(link->send_msg_queue); clusterMsgSendBlock *msgblock = (clusterMsgSendBlock *)head->value; - clusterMsg *msg = &msgblock->msg; + clusterMsg *msg = getMessageFromSendBlock(msgblock); size_t msg_offset = link->head_msg_send_offset; size_t msg_len = ntohl(msg->totlen); @@ -3853,7 +3863,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { if (!link) { return; } - if (listLength(link->send_msg_queue) == 0 && msgblock->msg.totlen != 0) + if (listLength(link->send_msg_queue) == 0 && getMessageFromSendBlock(msgblock)->totlen != 0) connSetWriteHandlerWithBarrier(link->conn, clusterWriteHandler, 1); listAddNodeTail(link->send_msg_queue, msgblock); @@ -3864,7 +3874,7 @@ void clusterSendMessage(clusterLink *link, clusterMsgSendBlock *msgblock) { server.stat_cluster_links_memory += sizeof(listNode); /* Populate sent messages stats. */ - uint16_t type = ntohs(msgblock->msg.type); + uint16_t type = ntohs(getMessageFromSendBlock(msgblock)->type); if (type < CLUSTERMSG_TYPE_COUNT) server.cluster->stats_bus_messages_sent[type]++; } @@ -4050,7 +4060,7 @@ void clusterSendPing(clusterLink *link, int type) { * sizeof(clusterMsg) or more. 
*/ if (estlen < (int)sizeof(clusterMsg)) estlen = sizeof(clusterMsg); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, estlen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); if (!link->inbound && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); @@ -4195,10 +4205,10 @@ clusterMsgSendBlock *clusterCreatePublishMsgBlock(robj *channel, robj *message, clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(type, msglen); clusterMsgDataPublish *hdr_data_msg; if (is_light) { - clusterMsgLight *hdr_light = &msgblock->msg_light; + clusterMsgLight *hdr_light = getLightMessageFromSendBlock(msgblock); hdr_data_msg = &hdr_light->data.publish.msg; } else { - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr_data_msg = &hdr->data.publish.msg; } hdr_data_msg->channel_len = htonl(channel_len); @@ -4221,7 +4231,7 @@ void clusterSendFail(char *nodename) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataFail); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAIL, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.fail.about.nodename, nodename, CLUSTER_NAMELEN); clusterBroadcastMessage(msgblock); @@ -4237,7 +4247,7 @@ void clusterSendUpdate(clusterLink *link, clusterNode *node) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData) + sizeof(clusterMsgDataUpdate); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_UPDATE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); memcpy(hdr->data.update.nodecfg.nodename, node->name, CLUSTER_NAMELEN); hdr->data.update.nodecfg.configEpoch = htonu64(node->configEpoch); memcpy(hdr->data.update.nodecfg.slots, node->slots, sizeof(node->slots)); @@ -4259,7 +4269,7 @@ void clusterSendModule(clusterLink *link, uint64_t module_id, uint8_t type, cons msglen += sizeof(clusterMsgModule) - 3 + len; clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_MODULE, msglen); - clusterMsg *hdr = &msgblock->msg; + clusterMsg *hdr = getMessageFromSendBlock(msgblock); hdr->data.module.msg.module_id = module_id; /* Already endian adjusted. */ hdr->data.module.msg.type = type; hdr->data.module.msg.len = htonl(len); @@ -4348,11 +4358,10 @@ void clusterRequestFailoverAuth(void) { uint32_t msglen = sizeof(clusterMsg) - sizeof(union clusterMsgData); clusterMsgSendBlock *msgblock = createClusterMsgSendBlock(CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST, msglen); - clusterMsg *hdr = &msgblock->msg; /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. */ - if (server.cluster->mf_end) hdr->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; + if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK; clusterBroadcastMessage(msgblock); clusterMsgSendBlockDecrRefCount(msgblock); } From 6adef8e2f97c3cd2dd4620fb9d5af8d426f1b548 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Sat, 21 Dec 2024 22:09:35 +0000 Subject: [PATCH 66/73] Adds support for scripting engines as Valkey modules (#1277) This PR extends the module API to support the addition of different scripting engines to execute user defined functions. 
The scripting engine can be implemented as a Valkey module, and can be
dynamically loaded with the `loadmodule` config directive, or with the
`MODULE LOAD` command.

This PR also adds an example of a dummy scripting engine module, to show
how to use the new module API. The dummy module is implemented in
`tests/modules/helloscripting.c`.

The current module API only supports loading scripting engines that run
functions using the `FCALL` command.

The additions to the module API are the following:

```c
/* This struct represents a scripting engine function that results from the
 * compilation of a script by the engine implementation. */
struct ValkeyModuleScriptingEngineCompiledFunction

typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    const char *code,
    size_t timeout,
    size_t *out_num_compiled_functions,
    char **err);

typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
    ValkeyModuleCtx *module_ctx,
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
    void *compiled_function,
    ValkeyModuleString **keys,
    size_t nkeys,
    ValkeyModuleString **args,
    size_t nargs);

typedef size_t (*ValkeyModuleScriptingEngineGetUsedMemoryFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
    void *compiled_function);

typedef size_t (*ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx);

typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
    ValkeyModuleScriptingEngineCtx *engine_ctx,
    void *compiled_function);

/* This struct stores the callback functions implemented by the scripting
 * engine to provide the functionality for the `FUNCTION *` commands. */
typedef struct ValkeyModuleScriptingEngineMethodsV1 {
    uint64_t version; /* Version of this structure for ABI compat. */

    /* Library create function callback. When a new script is loaded, this
     * callback will be called with the script code, and returns a list of
     * ValkeyModuleScriptingEngineCompiledFunction objects. */
    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;

    /* The callback function called when `FCALL` command is called on a function
     * registered in this engine. */
    ValkeyModuleScriptingEngineCallFunctionFunc call_function;

    /* Function callback to get current used memory by the engine. */
    ValkeyModuleScriptingEngineGetUsedMemoryFunc get_used_memory;

    /* Function callback to return memory overhead for a given function. */
    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;

    /* Function callback to return memory overhead of the engine. */
    ValkeyModuleScriptingEngineGetEngineMemoryOverheadFunc get_engine_memory_overhead;

    /* Function callback to free the memory of a registered engine function. */
    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
} ValkeyModuleScriptingEngineMethodsV1;

/* Registers a new scripting engine in the server.
 *
 * - `engine_name`: the name of the scripting engine. This name will match
 *   against the engine name specified in the script header using a shebang.
 *
 * - `engine_ctx`: engine specific context pointer.
 *
 * - `engine_methods`: the struct with the scripting engine callback functions
 *   pointers. 
*/
int ValkeyModule_RegisterScriptingEngine(ValkeyModuleCtx *ctx,
                                         const char *engine_name,
                                         void *engine_ctx,
                                         ValkeyModuleScriptingEngineMethods engine_methods);

/* Removes the scripting engine from the server.
 *
 * `engine_name` is the name of the scripting engine.
 *
 */
int ValkeyModule_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name);
```

---------

Signed-off-by: Ricardo Dias 
---
 src/function_lua.c                       | 205 +++++++-----
 src/functions.c                          | 240 ++++++++++++--
 src/functions.h                          |  74 +++--
 src/module.c                             |  76 +++++
 src/module.h                             |  17 +
 src/script.h                             |   2 +
 src/script_lua.c                         |   6 +-
 src/script_lua.h                         |   2 +-
 src/util.c                               |  21 ++
 src/util.h                               |   1 +
 src/valkeymodule.h                       |  99 +++++-
 tests/modules/CMakeLists.txt             |   1 +
 tests/modules/Makefile                   |   3 +-
 tests/modules/helloscripting.c           | 383 +++++++++++++++++++++++
 tests/unit/functions.tcl                 |   4 +-
 tests/unit/moduleapi/scriptingengine.tcl | 126 ++++++++
 16 files changed, 1124 insertions(+), 136 deletions(-)
 create mode 100644 src/module.h
 create mode 100644 tests/modules/helloscripting.c
 create mode 100644 tests/unit/moduleapi/scriptingengine.tcl

diff --git a/src/function_lua.c b/src/function_lua.c
index fa9983bf7e..b535528906 100644
--- a/src/function_lua.c
+++ b/src/function_lua.c
@@ -64,17 +64,14 @@ typedef struct luaFunctionCtx {
 } luaFunctionCtx;
 
 typedef struct loadCtx {
-    functionLibInfo *li;
+    list *functions;
     monotime start_time;
     size_t timeout;
 } loadCtx;
 
-typedef struct registerFunctionArgs {
-    sds name;
-    sds desc;
-    luaFunctionCtx *lua_f_ctx;
-    uint64_t f_flags;
-} registerFunctionArgs;
+static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx,
+                                  engineCtx *engine_ctx,
+                                  void *compiled_function);
 
 /* Hook for FUNCTION LOAD execution.
  * Used to cancel the execution in case of a timeout (500ms).
@@ -93,15 +90,42 @@ static void luaEngineLoadHook(lua_State *lua, lua_Debug *ar) {
     }
 }
 
+static void freeCompiledFunc(ValkeyModuleCtx *module_ctx,
+                             luaEngineCtx *lua_engine_ctx,
+                             void *compiled_func) {
+    /* The lua engine is implemented in the core, and not in a Valkey Module */
+    serverAssert(module_ctx == NULL);
+
+    compiledFunction *func = compiled_func;
+    decrRefCount(func->name);
+    if (func->desc) {
+        decrRefCount(func->desc);
+    }
+    luaEngineFreeFunction(module_ctx, lua_engine_ctx, func->function);
+    zfree(func);
+}
+
 /*
- * Compile a given blob and save it on the registry.
- * Return a function ctx with Lua ref that allows to later retrieve the
- * function from the registry.
+ * Compile the given script code by generating a set of compiled functions.
+ * These functions are also saved into the registry of the Lua environment.
+ *
+ * Returns an array of compiled functions. The `compiledFunction` struct stores
+ * a Lua ref that allows to later retrieve the function from the registry.
+ * The size of the array is returned in the `out_num_compiled_functions`
+ * parameter. 
* * Return NULL on compilation error and set the error to the err variable */ -static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size_t timeout, sds *err) { - int ret = C_ERR; +static compiledFunction **luaEngineCreate(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + + compiledFunction **compiled_functions = NULL; luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; @@ -114,15 +138,15 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_pop(lua, 1); /* pop the metatable */ /* compile the code */ - if (luaL_loadbuffer(lua, blob, sdslen(blob), "@user_function")) { - *err = sdscatprintf(sdsempty(), "Error compiling function: %s", lua_tostring(lua, -1)); + if (luaL_loadbuffer(lua, code, strlen(code), "@user_function")) { + *err = valkey_asprintf("Error compiling function: %s", lua_tostring(lua, -1)); lua_pop(lua, 1); /* pops the error */ goto done; } serverAssert(lua_isfunction(lua, -1)); loadCtx load_ctx = { - .li = li, + .functions = listCreate(), .start_time = getMonotonicUs(), .timeout = timeout, }; @@ -133,13 +157,31 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size if (lua_pcall(lua, 0, 0, 0)) { errorInfo err_info = {0}; luaExtractErrorInformation(lua, &err_info); - *err = sdscatprintf(sdsempty(), "Error registering functions: %s", err_info.msg); + *err = valkey_asprintf("Error registering functions: %s", err_info.msg); lua_pop(lua, 1); /* pops the error */ luaErrorInformationDiscard(&err_info); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + while ((node = listNext(iter)) != NULL) { + freeCompiledFunc(module_ctx, lua_engine_ctx, listNodeValue(node)); + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); goto done; } - ret = C_OK; + compiled_functions = + zcalloc(sizeof(compiledFunction *) * listLength(load_ctx.functions)); + listIter *iter = listGetIterator(load_ctx.functions, AL_START_HEAD); + listNode *node = NULL; + *out_num_compiled_functions = 0; + while ((node = listNext(iter)) != NULL) { + compiledFunction *func = listNodeValue(node); + compiled_functions[*out_num_compiled_functions] = func; + (*out_num_compiled_functions)++; + } + listReleaseIterator(iter); + listRelease(load_ctx.functions); done: /* restore original globals */ @@ -152,19 +194,23 @@ static int luaEngineCreate(void *engine_ctx, functionLibInfo *li, sds blob, size lua_sethook(lua, NULL, 0, 0); /* Disable hook */ luaSaveOnRegistry(lua, REGISTRY_LOAD_CTX_NAME, NULL); - return ret; + return compiled_functions; } /* * Invole the give function with the given keys and args */ -static void luaEngineCall(scriptRunCtx *run_ctx, - void *engine_ctx, +static void luaEngineCall(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + functionCtx *func_ctx, void *compiled_function, robj **keys, size_t nkeys, robj **args, size_t nargs) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -177,25 +223,38 @@ static void luaEngineCall(scriptRunCtx *run_ctx, serverAssert(lua_isfunction(lua, -1)); + scriptRunCtx *run_ctx = (scriptRunCtx *)func_ctx; 
luaCallFunction(run_ctx, lua, keys, nkeys, args, nargs, 0); lua_pop(lua, 1); /* Pop error handler */ } -static size_t luaEngineGetUsedMemoy(void *engine_ctx) { +static engineMemoryInfo luaEngineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + luaEngineCtx *lua_engine_ctx = engine_ctx; - return luaMemory(lua_engine_ctx->lua); + + return (engineMemoryInfo){ + .used_memory = luaMemory(lua_engine_ctx->lua), + .engine_memory_overhead = zmalloc_size(lua_engine_ctx), + }; } -static size_t luaEngineFunctionMemoryOverhead(void *compiled_function) { +static size_t luaEngineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); + return zmalloc_size(compiled_function); } -static size_t luaEngineMemoryOverhead(void *engine_ctx) { - luaEngineCtx *lua_engine_ctx = engine_ctx; - return zmalloc_size(lua_engine_ctx); -} +static void luaEngineFreeFunction(ValkeyModuleCtx *module_ctx, + engineCtx *engine_ctx, + void *compiled_function) { + /* The lua engine is implemented in the core, and not in a Valkey Module */ + serverAssert(module_ctx == NULL); -static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { luaEngineCtx *lua_engine_ctx = engine_ctx; lua_State *lua = lua_engine_ctx->lua; luaFunctionCtx *f_ctx = compiled_function; @@ -203,26 +262,19 @@ static void luaEngineFreeFunction(void *engine_ctx, void *compiled_function) { zfree(f_ctx); } -static void luaRegisterFunctionArgsInitialize(registerFunctionArgs *register_f_args, - sds name, - sds desc, +static void luaRegisterFunctionArgsInitialize(compiledFunction *func, + robj *name, + robj *desc, luaFunctionCtx *lua_f_ctx, uint64_t flags) { - *register_f_args = (registerFunctionArgs){ + *func = (compiledFunction){ .name = name, .desc = desc, - .lua_f_ctx = lua_f_ctx, + .function = lua_f_ctx, .f_flags = flags, }; } -static void luaRegisterFunctionArgsDispose(lua_State *lua, registerFunctionArgs *register_f_args) { - sdsfree(register_f_args->name); - if (register_f_args->desc) sdsfree(register_f_args->desc); - lua_unref(lua, register_f_args->lua_f_ctx->lua_function_ref); - zfree(register_f_args->lua_f_ctx); -} - /* Read function flags located on the top of the Lua stack. * On success, return C_OK and set the flags to 'flags' out parameter * Return C_ERR if encounter an unknown flag. 
*/ @@ -267,10 +319,11 @@ static int luaRegisterFunctionReadFlags(lua_State *lua, uint64_t *flags) { return ret; } -static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadNamedArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; - sds desc = NULL; + robj *name = NULL; + robj *desc = NULL; luaFunctionCtx *lua_f_ctx = NULL; uint64_t flags = 0; if (!lua_istable(lua, 1)) { @@ -287,14 +340,15 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs err = "named argument key given to server.register_function is not a string"; goto error; } + const char *key = lua_tostring(lua, -2); if (!strcasecmp(key, "function_name")) { - if (!(name = luaGetStringSds(lua, -1))) { + if (!(name = luaGetStringObject(lua, -1))) { err = "function_name argument given to server.register_function must be a string"; goto error; } } else if (!strcasecmp(key, "description")) { - if (!(desc = luaGetStringSds(lua, -1))) { + if (!(desc = luaGetStringObject(lua, -1))) { err = "description argument given to server.register_function must be a string"; goto error; } @@ -335,13 +389,17 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs goto error; } - luaRegisterFunctionArgsInitialize(register_f_args, name, desc, lua_f_ctx, flags); + luaRegisterFunctionArgsInitialize(func, + name, + desc, + lua_f_ctx, + flags); return C_OK; error: - if (name) sdsfree(name); - if (desc) sdsfree(desc); + if (name) decrRefCount(name); + if (desc) decrRefCount(desc); if (lua_f_ctx) { lua_unref(lua, lua_f_ctx->lua_function_ref); zfree(lua_f_ctx); @@ -350,11 +408,12 @@ static int luaRegisterFunctionReadNamedArgs(lua_State *lua, registerFunctionArgs return C_ERR; } -static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, + compiledFunction *func) { char *err = NULL; - sds name = NULL; + robj *name = NULL; luaFunctionCtx *lua_f_ctx = NULL; - if (!(name = luaGetStringSds(lua, 1))) { + if (!(name = luaGetStringObject(lua, 1))) { err = "first argument to server.register_function must be a string"; goto error; } @@ -369,17 +428,17 @@ static int luaRegisterFunctionReadPositionalArgs(lua_State *lua, registerFunctio lua_f_ctx = zmalloc(sizeof(*lua_f_ctx)); lua_f_ctx->lua_function_ref = lua_function_ref; - luaRegisterFunctionArgsInitialize(register_f_args, name, NULL, lua_f_ctx, 0); + luaRegisterFunctionArgsInitialize(func, name, NULL, lua_f_ctx, 0); return C_OK; error: - if (name) sdsfree(name); + if (name) decrRefCount(name); luaPushError(lua, err); return C_ERR; } -static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *register_f_args) { +static int luaRegisterFunctionReadArgs(lua_State *lua, compiledFunction *func) { int argc = lua_gettop(lua); if (argc < 1 || argc > 2) { luaPushError(lua, "wrong number of arguments to server.register_function"); @@ -387,33 +446,28 @@ static int luaRegisterFunctionReadArgs(lua_State *lua, registerFunctionArgs *reg } if (argc == 1) { - return luaRegisterFunctionReadNamedArgs(lua, register_f_args); + return luaRegisterFunctionReadNamedArgs(lua, func); } else { - return luaRegisterFunctionReadPositionalArgs(lua, register_f_args); + return luaRegisterFunctionReadPositionalArgs(lua, func); } } static int luaRegisterFunction(lua_State *lua) { - registerFunctionArgs register_f_args = {0}; + compiledFunction *func = 
zcalloc(sizeof(*func));
     loadCtx *load_ctx = luaGetFromRegistry(lua, REGISTRY_LOAD_CTX_NAME);
     if (!load_ctx) {
+        zfree(func);
         luaPushError(lua, "server.register_function can only be called on FUNCTION LOAD command");
         return luaError(lua);
     }
 
-    if (luaRegisterFunctionReadArgs(lua, &register_f_args) != C_OK) {
+    if (luaRegisterFunctionReadArgs(lua, func) != C_OK) {
+        zfree(func);
         return luaError(lua);
     }
 
-    sds err = NULL;
-    if (functionLibCreateFunction(register_f_args.name, register_f_args.lua_f_ctx, load_ctx->li, register_f_args.desc,
-                                  register_f_args.f_flags, &err) != C_OK) {
-        luaRegisterFunctionArgsDispose(lua, &register_f_args);
-        luaPushError(lua, err);
-        sdsfree(err);
-        return luaError(lua);
-    }
+    listAddNodeTail(load_ctx->functions, func);
 
     return 0;
 }
 
@@ -494,16 +548,17 @@ int luaEngineInitEngine(void) {
     lua_enablereadonlytable(lua_engine_ctx->lua, -1, 1); /* protect the new global table */
     lua_replace(lua_engine_ctx->lua, LUA_GLOBALSINDEX);  /* set new global table as the new globals */
-
-    engine *lua_engine = zmalloc(sizeof(*lua_engine));
-    *lua_engine = (engine){
-        .engine_ctx = lua_engine_ctx,
-        .create = luaEngineCreate,
-        .call = luaEngineCall,
-        .get_used_memory = luaEngineGetUsedMemoy,
+    engineMethods lua_engine_methods = {
+        .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION,
+        .create_functions_library = luaEngineCreate,
+        .call_function = luaEngineCall,
         .get_function_memory_overhead = luaEngineFunctionMemoryOverhead,
-        .get_engine_memory_overhead = luaEngineMemoryOverhead,
         .free_function = luaEngineFreeFunction,
+        .get_memory_info = luaEngineGetMemoryInfo,
     };
-    return functionsRegisterEngine(LUA_ENGINE_NAME, lua_engine);
+
+    return functionsRegisterEngine(LUA_ENGINE_NAME,
+                                   NULL,
+                                   lua_engine_ctx,
+                                   &lua_engine_methods);
 }
diff --git a/src/functions.c b/src/functions.c
index feb82d4ab7..0d003f7fac 100644
--- a/src/functions.c
+++ b/src/functions.c
@@ -31,6 +31,7 @@
 #include "sds.h"
 #include "dict.h"
 #include "adlist.h"
+#include "module.h"
 
 #define LOAD_TIMEOUT_MS 500
 
@@ -117,9 +118,28 @@ static dict *engines = NULL;
 /* Libraries Ctx. */
 static functionsLibCtx *curr_functions_lib_ctx = NULL;
 
+static void setupEngineModuleCtx(engineInfo *ei, client *c) {
+    if (ei->engineModule != NULL) {
+        serverAssert(ei->module_ctx != NULL);
+        moduleScriptingEngineInitContext(ei->module_ctx, ei->engineModule, c);
+    }
+}
+
+static void teardownEngineModuleCtx(engineInfo *ei) {
+    if (ei->engineModule != NULL) {
+        serverAssert(ei->module_ctx != NULL);
+        moduleFreeContext(ei->module_ctx);
+    }
+}
+
 static size_t functionMallocSize(functionInfo *fi) {
-    return zmalloc_size(fi) + sdsAllocSize(fi->name) + (fi->desc ? sdsAllocSize(fi->desc) : 0) +
-           fi->li->ei->engine->get_function_memory_overhead(fi->function);
+    setupEngineModuleCtx(fi->li->ei, NULL);
+    size_t size = zmalloc_size(fi) +
+                  sdsAllocSize(fi->name) +
+                  (fi->desc ? 
sdsAllocSize(fi->desc) : 0) + + fi->li->ei->engine->get_function_memory_overhead(fi->li->ei->module_ctx, fi->function); + teardownEngineModuleCtx(fi->li->ei); + return size; } static size_t libraryMallocSize(functionLibInfo *li) { @@ -141,8 +161,12 @@ static void engineFunctionDispose(void *obj) { if (fi->desc) { sdsfree(fi->desc); } + setupEngineModuleCtx(fi->li->ei, NULL); engine *engine = fi->li->ei->engine; - engine->free_function(engine->engine_ctx, fi->function); + engine->free_function(fi->li->ei->module_ctx, + engine->engine_ctx, + fi->function); + teardownEngineModuleCtx(fi->li->ei); zfree(fi); } @@ -233,6 +257,15 @@ functionsLibCtx *functionsLibCtxCreate(void) { return ret; } +void functionsAddEngineStats(engineInfo *ei) { + serverAssert(curr_functions_lib_ctx != NULL); + dictEntry *entry = dictFind(curr_functions_lib_ctx->engines_stats, ei->name); + if (entry == NULL) { + functionsLibEngineStats *stats = zcalloc(sizeof(*stats)); + dictAdd(curr_functions_lib_ctx->engines_stats, ei->name, stats); + } +} + /* * Creating a function inside the given library. * On success, return C_OK. @@ -242,24 +275,34 @@ functionsLibCtx *functionsLibCtxCreate(void) { * the function will verify that the given name is following the naming format * and return an error if its not. */ -int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err) { - if (functionsVerifyName(name) != C_OK) { - *err = sdsnew("Library names can only contain letters, numbers, or underscores(_) and must be at least one " - "character long"); +static int functionLibCreateFunction(robj *name, + void *function, + functionLibInfo *li, + robj *desc, + uint64_t f_flags, + sds *err) { + serverAssert(name->type == OBJ_STRING); + serverAssert(desc == NULL || desc->type == OBJ_STRING); + + if (functionsVerifyName(name->ptr) != C_OK) { + *err = sdsnew("Function names can only contain letters, numbers, or " + "underscores(_) and must be at least one character long"); return C_ERR; } - if (dictFetchValue(li->functions, name)) { + sds name_sds = sdsdup(name->ptr); + if (dictFetchValue(li->functions, name_sds)) { *err = sdsnew("Function already exists in the library"); + sdsfree(name_sds); return C_ERR; } functionInfo *fi = zmalloc(sizeof(*fi)); *fi = (functionInfo){ - .name = name, + .name = name_sds, .function = function, .li = li, - .desc = desc, + .desc = desc ? sdsdup(desc->ptr) : NULL, .f_flags = f_flags, }; @@ -403,11 +446,24 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l return ret; } -/* Register an engine, should be called once by the engine on startup and give the following: +/* Register an engine, should be called once by the engine on startup and give + * the following: * * - engine_name - name of the engine to register - * - engine_ctx - the engine ctx that should be used by the server to interact with the engine */ -int functionsRegisterEngine(const char *engine_name, engine *engine) { + * + * - engine_module - the valkey module that implements this engine + * + * - engine_ctx - the engine ctx that should be used by the server to interact + * with the engine. + * + * - engine_methods - the struct with the scripting engine callback functions + * pointers. 
+ * + */ +int functionsRegisterEngine(const char *engine_name, + ValkeyModule *engine_module, + engineCtx *engine_ctx, + engineMethods *engine_methods) { sds engine_name_sds = sdsnew(engine_name); if (dictFetchValue(engines, engine_name_sds)) { serverLog(LL_WARNING, "Same engine was registered twice"); @@ -415,6 +471,16 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { return C_ERR; } + engine *eng = zmalloc(sizeof(engine)); + *eng = (engine){ + .engine_ctx = engine_ctx, + .create = engine_methods->create_functions_library, + .call = engine_methods->call_function, + .get_function_memory_overhead = engine_methods->get_function_memory_overhead, + .free_function = engine_methods->free_function, + .get_memory_info = engine_methods->get_memory_info, + }; + client *c = createClient(NULL); c->flag.deny_blocking = 1; c->flag.script = 1; @@ -422,15 +488,64 @@ int functionsRegisterEngine(const char *engine_name, engine *engine) { engineInfo *ei = zmalloc(sizeof(*ei)); *ei = (engineInfo){ .name = engine_name_sds, - .engine = engine, + .engineModule = engine_module, + .module_ctx = engine_module ? moduleAllocateContext() : NULL, + .engine = eng, .c = c, }; dictAdd(engines, engine_name_sds, ei); - engine_cache_memory += zmalloc_size(ei) + sdsAllocSize(ei->name) + zmalloc_size(engine) + - engine->get_engine_memory_overhead(engine->engine_ctx); + functionsAddEngineStats(ei); + + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = eng->get_memory_info(ei->module_ctx, + eng->engine_ctx); + engine_cache_memory += zmalloc_size(ei) + + sdsAllocSize(ei->name) + + zmalloc_size(eng) + + mem_info.engine_memory_overhead; + + teardownEngineModuleCtx(ei); + + return C_OK; +} + +/* Removes a scripting engine from the server. + * + * - engine_name - name of the engine to remove + */ +int functionsUnregisterEngine(const char *engine_name) { + sds engine_name_sds = sdsnew(engine_name); + dictEntry *entry = dictFind(engines, engine_name_sds); + if (entry == NULL) { + serverLog(LL_WARNING, "There's no engine registered with name %s", engine_name); + sdsfree(engine_name_sds); + return C_ERR; + } + + engineInfo *ei = dictGetVal(entry); + + dictIterator *iter = dictGetSafeIterator(curr_functions_lib_ctx->libraries); + while ((entry = dictNext(iter))) { + functionLibInfo *li = dictGetVal(entry); + if (li->ei == ei) { + libraryUnlink(curr_functions_lib_ctx, li); + engineLibraryFree(li); + } + } + dictReleaseIterator(iter); + + zfree(ei->engine); + sdsfree(ei->name); + freeClient(ei->c); + if (ei->engineModule != NULL) { + serverAssert(ei->module_ctx != NULL); + zfree(ei->module_ctx); + } + zfree(ei); + sdsfree(engine_name_sds); return C_OK; } @@ -649,11 +764,19 @@ static void fcallCommandGeneric(client *c, int ro) { } scriptRunCtx run_ctx; - if (scriptPrepareForRun(&run_ctx, fi->li->ei->c, c, fi->name, fi->f_flags, ro) != C_OK) return; - - engine->call(&run_ctx, engine->engine_ctx, fi->function, c->argv + 3, numkeys, c->argv + 3 + numkeys, + setupEngineModuleCtx(fi->li->ei, run_ctx.original_client); + + engine->call(fi->li->ei->module_ctx, + engine->engine_ctx, + &run_ctx, + fi->function, + c->argv + 3, + numkeys, + c->argv + 3 + numkeys, c->argc - 3 - numkeys); + + teardownEngineModuleCtx(fi->li->ei); scriptResetRun(&run_ctx); } @@ -953,14 +1076,40 @@ void functionFreeLibMetaData(functionsLibMetaData *md) { if (md->engine) sdsfree(md->engine); } +static void freeCompiledFunctions(engineInfo *ei, + compiledFunction **compiled_functions, + size_t num_compiled_functions, + size_t 
free_function_from_idx) { + setupEngineModuleCtx(ei, NULL); + + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + decrRefCount(func->name); + if (func->desc) { + decrRefCount(func->desc); + } + if (i >= free_function_from_idx) { + ei->engine->free_function(ei->module_ctx, + ei->engine->engine_ctx, + func->function); + } + zfree(func); + } + + zfree(compiled_functions); + + teardownEngineModuleCtx(ei); +} + /* Compile and save the given library, return the loaded library name on success * and NULL on failure. In case on failure the err out param is set with relevant error message */ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout) { dictIterator *iter = NULL; dictEntry *entry = NULL; - functionLibInfo *new_li = NULL; functionLibInfo *old_li = NULL; functionsLibMetaData md = {0}; + functionLibInfo *new_li = NULL; + if (functionExtractLibMetaData(code, &md, err) != C_OK) { return NULL; } @@ -990,10 +1139,47 @@ sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibC } new_li = engineLibraryCreate(md.name, ei, code); - if (engine->create(engine->engine_ctx, new_li, md.code, timeout, err) != C_OK) { + size_t num_compiled_functions = 0; + char *compile_error = NULL; + setupEngineModuleCtx(ei, NULL); + compiledFunction **compiled_functions = + engine->create(ei->module_ctx, + engine->engine_ctx, + md.code, + timeout, + &num_compiled_functions, + &compile_error); + teardownEngineModuleCtx(ei); + if (compiled_functions == NULL) { + serverAssert(num_compiled_functions == 0); + serverAssert(compile_error != NULL); + *err = sdsnew(compile_error); + zfree(compile_error); goto error; } + for (size_t i = 0; i < num_compiled_functions; i++) { + compiledFunction *func = compiled_functions[i]; + int ret = functionLibCreateFunction(func->name, + func->function, + new_li, + func->desc, + func->f_flags, + err); + if (ret == C_ERR) { + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + i); + goto error; + } + } + + freeCompiledFunctions(ei, + compiled_functions, + num_compiled_functions, + num_compiled_functions); + if (dictSize(new_li->functions) == 0) { *err = sdsnew("No functions registered"); goto error; @@ -1063,6 +1249,7 @@ void functionLoadCommand(client *c) { timeout = 0; } if (!(library_name = functionsCreateWithLibraryCtx(code->ptr, replace, &err, curr_functions_lib_ctx, timeout))) { + serverAssert(err != NULL); addReplyErrorSds(c, err); return; } @@ -1080,7 +1267,11 @@ unsigned long functionsMemory(void) { while ((entry = dictNext(iter))) { engineInfo *ei = dictGetVal(entry); engine *engine = ei->engine; - engines_memory += engine->get_used_memory(engine->engine_ctx); + setupEngineModuleCtx(ei, NULL); + engineMemoryInfo mem_info = engine->get_memory_info(ei->module_ctx, + engine->engine_ctx); + engines_memory += mem_info.used_memory; + teardownEngineModuleCtx(ei); } dictReleaseIterator(iter); @@ -1120,12 +1311,11 @@ size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx) { int functionsInit(void) { engines = dictCreate(&engineDictType); + curr_functions_lib_ctx = functionsLibCtxCreate(); + if (luaEngineInitEngine() != C_OK) { return C_ERR; } - /* Must be initialized after engines initialization */ - curr_functions_lib_ctx = functionsLibCtxCreate(); - return C_OK; } diff --git a/src/functions.h b/src/functions.h index b199fbd06e..89e39fdc56 100644 --- a/src/functions.h +++ b/src/functions.h @@ -54,53 +54,68 @@ typedef struct 
functionLibInfo functionLibInfo;
 
+/* ValkeyModule type aliases for scripting engine structs and types. */
+typedef ValkeyModuleScriptingEngineCtx engineCtx;
+typedef ValkeyModuleScriptingEngineFunctionCtx functionCtx;
+typedef ValkeyModuleScriptingEngineCompiledFunction compiledFunction;
+typedef ValkeyModuleScriptingEngineMemoryInfo engineMemoryInfo;
+typedef ValkeyModuleScriptingEngineMethods engineMethods;
+
 typedef struct engine {
     /* engine specific context */
-    void *engine_ctx;
-
-    /* Create function callback, get the engine_ctx, and function code
-     * engine_ctx - opaque struct that was created on engine initialization
-     * li - library information that need to be provided and when add functions
-     * code - the library code
-     * timeout - timeout for the library creation (0 for no timeout)
-     * err - description of error (if occurred)
-     * returns C_ERR on error and set err to be the error message */
-    int (*create)(void *engine_ctx, functionLibInfo *li, sds code, size_t timeout, sds *err);
-
-    /* Invoking a function, r_ctx is an opaque object (from engine POV).
-     * The r_ctx should be used by the engine to interaction with the server,
+    engineCtx *engine_ctx;
+
+    /* Compiles the script code and returns an array of compiled functions
+     * registered in the script.
+     *
+     * Returns NULL on error and sets err to the error message. */
+    compiledFunction **(*create)(
+        ValkeyModuleCtx *module_ctx,
+        engineCtx *engine_ctx,
+        const char *code,
+        size_t timeout,
+        size_t *out_num_compiled_functions,
+        char **err);
+
+    /* Invoking a function, func_ctx is an opaque object (from engine POV).
+     * The func_ctx should be used by the engine to interact with the server,
      * such interaction could be running commands, set resp, or set
      * replication mode */
-    void (*call)(scriptRunCtx *r_ctx,
-                 void *engine_ctx,
+    void (*call)(ValkeyModuleCtx *module_ctx,
+                 engineCtx *engine_ctx,
+                 functionCtx *func_ctx,
                  void *compiled_function,
                  robj **keys,
                  size_t nkeys,
                  robj **args,
                  size_t nargs);
 
-    /* get current used memory by the engine */
-    size_t (*get_used_memory)(void *engine_ctx);
+    /* free the given function */
+    void (*free_function)(ValkeyModuleCtx *module_ctx,
+                          engineCtx *engine_ctx,
+                          void *compiled_function);
 
     /* Return memory overhead for a given function,
      * such memory is not counted as engine memory but as general
     * structs memory that hold different information */
-    size_t (*get_function_memory_overhead)(void *compiled_function);
+    size_t (*get_function_memory_overhead)(ValkeyModuleCtx *module_ctx,
+                                           void *compiled_function);
 
-    /* Return memory overhead for engine (struct size holding the engine)*/
-    size_t (*get_engine_memory_overhead)(void *engine_ctx);
+    /* Get the current used memory by the engine */
+    engineMemoryInfo (*get_memory_info)(ValkeyModuleCtx *module_ctx,
+                                        engineCtx *engine_ctx);
 
-    /* free the given function */
-    void (*free_function)(void *engine_ctx, void *compiled_function);
 } engine;
 
 /* Hold information about an engine.
 * Used on rdb.c so it must be declared here. 
*/
 typedef struct engineInfo {
-    sds name;       /* Name of the engine */
-    engine *engine; /* engine callbacks that allows to interact with the engine */
-    client *c;      /* Client that is used to run commands */
+    sds name;                    /* Name of the engine */
+    ValkeyModule *engineModule;  /* the module that implements the scripting engine */
+    ValkeyModuleCtx *module_ctx; /* Scripting engine module context */
+    engine *engine;              /* engine callbacks that allow interacting with the engine */
+    client *c;                   /* Client that is used to run commands */
 } engineInfo;
 
 /* Hold information about the specific function.
@@ -123,7 +138,12 @@ struct functionLibInfo {
     sds code;  /* Library code */
 };
 
-int functionsRegisterEngine(const char *engine_name, engine *engine_ctx);
+int functionsRegisterEngine(const char *engine_name,
+                            ValkeyModule *engine_module,
+                            void *engine_ctx,
+                            engineMethods *engine_methods);
+int functionsUnregisterEngine(const char *engine_name);
+
 sds functionsCreateWithLibraryCtx(sds code, int replace, sds *err, functionsLibCtx *lib_ctx, size_t timeout);
 unsigned long functionsMemory(void);
 unsigned long functionsMemoryOverhead(void);
@@ -138,8 +158,6 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx);
 void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *));
 void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async);
 
-int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err);
-
 int luaEngineInitEngine(void);
 int functionsInit(void);
 
diff --git a/src/module.c b/src/module.c
index 541ae490ab..db493dd8bc 100644
--- a/src/module.c
+++ b/src/module.c
@@ -62,6 +62,7 @@
 #include "crc16_slottable.h"
 #include "valkeymodule.h"
 #include "io_threads.h"
+#include "functions.h"
 #include <dlfcn.h>
 #include <sys/stat.h>
 #include <sys/wait.h>
@@ -879,6 +880,15 @@ void moduleCallCommandUnblockedHandler(client *c) {
     moduleReleaseTempClient(c);
 }
 
+/* Allocates the memory necessary to hold the ValkeyModuleCtx structure, and
+ * returns the pointer to the allocated memory.
+ *
+ * Used by the scripting engines implementation to cache the context structure.
+ */
+ValkeyModuleCtx *moduleAllocateContext(void) {
+    return (ValkeyModuleCtx *)zcalloc(sizeof(ValkeyModuleCtx));
+}
+
 /* Create a module ctx and keep track of the nesting level.
  *
  * Note: When creating ctx for threads (VM_GetThreadSafeContext and
@@ -921,6 +931,16 @@ void moduleCreateContext(ValkeyModuleCtx *out_ctx, ValkeyModule *module, int ctx
     }
 }
 
+/* Initialize a module context to be used by scripting engines callback
+ * functions.
+ */
+void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx,
+                                      ValkeyModule *module,
+                                      client *client) {
+    moduleCreateContext(out_ctx, module, VALKEYMODULE_CTX_NONE);
+    out_ctx->client = client;
+}
+
 /* This command binds the normal command invocation with commands
  * exported by modules. */
 void ValkeyModuleCommandDispatcher(client *c) {
@@ -13074,6 +13094,60 @@ int VM_RdbSave(ValkeyModuleCtx *ctx, ValkeyModuleRdbStream *stream, int flags) {
     return VALKEYMODULE_OK;
 }
 
+/* Registers a new scripting engine in the server.
+ *
+ * - `module_ctx`: the module context object.
+ *
+ * - `engine_name`: the name of the scripting engine. This name will match
+ *   against the engine name specified in the script header using a shebang.
+ *
+ * - `engine_ctx`: engine specific context pointer.
+ *
+ * - `engine_methods`: the struct with the scripting engine callback functions
+ *   pointers. 
+ * + * Returns VALKEYMODULE_OK if the engine is successfully registered, and + * VALKEYMODULE_ERR in case some failure occurs. In case of a failure, an error + * message is logged. + */ +int VM_RegisterScriptingEngine(ValkeyModuleCtx *module_ctx, + const char *engine_name, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineMethods *engine_methods) { + serverLog(LL_DEBUG, "Registering a new scripting engine: %s", engine_name); + + if (engine_methods->version > VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION) { + serverLog(LL_WARNING, "The engine implementation version is greater " + "than what this server supports. Server ABI " + "Version: %lu, Engine ABI version: %lu", + VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + (unsigned long)engine_methods->version); + return VALKEYMODULE_ERR; + } + + if (functionsRegisterEngine(engine_name, + module_ctx->module, + engine_ctx, + engine_methods) != C_OK) { + return VALKEYMODULE_ERR; + } + + return VALKEYMODULE_OK; +} + +/* Removes the scripting engine from the server. + * + * `engine_name` is the name of the scripting engine. + * + * Returns VALKEYMODULE_OK. + * + */ +int VM_UnregisterScriptingEngine(ValkeyModuleCtx *ctx, const char *engine_name) { + UNUSED(ctx); + functionsUnregisterEngine(engine_name); + return VALKEYMODULE_OK; +} + /* MODULE command. * * MODULE LIST @@ -13944,4 +14018,6 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(RdbStreamFree); REGISTER_API(RdbLoad); REGISTER_API(RdbSave); + REGISTER_API(RegisterScriptingEngine); + REGISTER_API(UnregisterScriptingEngine); } diff --git a/src/module.h b/src/module.h new file mode 100644 index 0000000000..f61ef1e3cb --- /dev/null +++ b/src/module.h @@ -0,0 +1,17 @@ +#ifndef _MODULE_H_ +#define _MODULE_H_ + +/* This header file exposes a set of functions defined in module.c that are + * not part of the module API, but are used by the core to interact with modules + */ + +typedef struct ValkeyModuleCtx ValkeyModuleCtx; +typedef struct ValkeyModule ValkeyModule; + +ValkeyModuleCtx *moduleAllocateContext(void); +void moduleScriptingEngineInitContext(ValkeyModuleCtx *out_ctx, + ValkeyModule *module, + client *client); +void moduleFreeContext(ValkeyModuleCtx *ctx); + +#endif /* _MODULE_H_ */ diff --git a/src/script.h b/src/script.h index 7fff34a40b..194cc8bd05 100644 --- a/src/script.h +++ b/src/script.h @@ -67,6 +67,8 @@ #define SCRIPT_ALLOW_CROSS_SLOT (1ULL << 8) /* Indicate that the current script may access keys from multiple slots */ typedef struct scriptRunCtx scriptRunCtx; +/* This struct stores the necessary information to manage the execution of + * scripts using EVAL and FCALL. */ struct scriptRunCtx { const char *funcname; client *c; diff --git a/src/script_lua.c b/src/script_lua.c index 5093fa944f..29d352d44b 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1258,15 +1258,15 @@ static void luaLoadLibraries(lua_State *lua) { /* Return sds of the string value located on stack at the given index. * Return NULL if the value is not a string. 
*/ -sds luaGetStringSds(lua_State *lua, int index) { +robj *luaGetStringObject(lua_State *lua, int index) { if (!lua_isstring(lua, index)) { return NULL; } size_t len; const char *str = lua_tolstring(lua, index, &len); - sds str_sds = sdsnewlen(str, len); - return str_sds; + robj *str_obj = createStringObject(str, len); + return str_obj; } static int luaProtectedTableError(lua_State *lua) { diff --git a/src/script_lua.h b/src/script_lua.h index 35edf46af6..6c60754bbc 100644 --- a/src/script_lua.h +++ b/src/script_lua.h @@ -67,7 +67,7 @@ typedef struct errorInfo { } errorInfo; void luaRegisterServerAPI(lua_State *lua); -sds luaGetStringSds(lua_State *lua, int index); +robj *luaGetStringObject(lua_State *lua, int index); void luaRegisterGlobalProtectionFunction(lua_State *lua); void luaSetErrorMetatable(lua_State *lua); void luaSetAllowListProtection(lua_State *lua); diff --git a/src/util.c b/src/util.c index 6d99d47e5a..6e44392ce1 100644 --- a/src/util.c +++ b/src/util.c @@ -50,6 +50,7 @@ #include "util.h" #include "sha256.h" #include "config.h" +#include "zmalloc.h" #include "valkey_strtod.h" @@ -1380,3 +1381,23 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...) { va_end(args); return result; } + +/* A printf-like function that returns a freshly allocated string. + * + * This function is similar to asprintf function, but it uses zmalloc for + * allocating the string buffer. */ +char *valkey_asprintf(char const *fmt, ...) { + va_list args; + + va_start(args, fmt); + size_t str_len = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + + char *str = zmalloc(str_len); + + va_start(args, fmt); + vsnprintf(str, str_len, fmt, args); + va_end(args); + + return str; +} diff --git a/src/util.h b/src/util.h index 51eb38f0b4..61095ddb65 100644 --- a/src/util.h +++ b/src/util.h @@ -99,5 +99,6 @@ int snprintf_async_signal_safe(char *to, size_t n, const char *fmt, ...); #endif size_t valkey_strlcpy(char *dst, const char *src, size_t dsize); size_t valkey_strlcat(char *dst, const char *src, size_t dsize); +char *valkey_asprintf(char const *fmt, ...); #endif diff --git a/src/valkeymodule.h b/src/valkeymodule.h index 7c3adfd477..1d99d2ff7a 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -783,6 +783,7 @@ typedef enum { } ValkeyModuleACLLogEntryReason; /* Incomplete structures needed by both the core and modules. */ +typedef struct ValkeyModuleCtx ValkeyModuleCtx; typedef struct ValkeyModuleIO ValkeyModuleIO; typedef struct ValkeyModuleDigest ValkeyModuleDigest; typedef struct ValkeyModuleInfoCtx ValkeyModuleInfoCtx; @@ -794,6 +795,93 @@ typedef void (*ValkeyModuleInfoFunc)(ValkeyModuleInfoCtx *ctx, int for_crash_rep typedef void (*ValkeyModuleDefragFunc)(ValkeyModuleDefragCtx *ctx); typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata); +/* Current ABI version for scripting engine modules. */ +#define VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION 1UL + +/* Type definitions for implementing scripting engines modules. */ +typedef void ValkeyModuleScriptingEngineCtx; +typedef void ValkeyModuleScriptingEngineFunctionCtx; + +/* This struct represents a scripting engine function that results from the + * compilation of a script by the engine implementation. + * + * IMPORTANT: If we ever need to add/remove fields from this struct, we need + * to bump the version number defined in the + * `VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION` constant. 
+ */
+typedef struct ValkeyModuleScriptingEngineCompiledFunction {
+    ValkeyModuleString *name; /* Function name */
+    void *function;           /* Opaque object representing a function, usually it's
+                                 the function's compiled code. */
+    ValkeyModuleString *desc; /* Function description */
+    uint64_t f_flags;         /* Function flags */
+} ValkeyModuleScriptingEngineCompiledFunction;
+
+/* This struct is used to return the memory information of the scripting
+ * engine. */
+typedef struct ValkeyModuleScriptingEngineMemoryInfo {
+    /* The memory used by the scripting engine runtime. */
+    size_t used_memory;
+    /* The memory used by the scripting engine data structures. */
+    size_t engine_memory_overhead;
+} ValkeyModuleScriptingEngineMemoryInfo;
+
+typedef ValkeyModuleScriptingEngineCompiledFunction **(*ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    const char *code,
+    size_t timeout,
+    size_t *out_num_compiled_functions,
+    char **err);
+
+typedef void (*ValkeyModuleScriptingEngineCallFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    ValkeyModuleScriptingEngineFunctionCtx *func_ctx,
+    void *compiled_function,
+    ValkeyModuleString **keys,
+    size_t nkeys,
+    ValkeyModuleString **args,
+    size_t nargs);
+
+typedef size_t (*ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc)(
+    ValkeyModuleCtx *module_ctx,
+    void *compiled_function);
+
+typedef void (*ValkeyModuleScriptingEngineFreeFunctionFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx,
+    void *compiled_function);
+
+typedef ValkeyModuleScriptingEngineMemoryInfo (*ValkeyModuleScriptingEngineGetMemoryInfoFunc)(
+    ValkeyModuleCtx *module_ctx,
+    ValkeyModuleScriptingEngineCtx *engine_ctx);
+
+typedef struct ValkeyModuleScriptingEngineMethodsV1 {
+    uint64_t version; /* Version of this structure for ABI compat. */
+
+    /* Library create function callback. When a new script is loaded, this
+     * callback will be called with the script code, and returns a list of
+     * ValkeyModuleScriptingEngineCompiledFunction objects. */
+    ValkeyModuleScriptingEngineCreateFunctionsLibraryFunc create_functions_library;
+
+    /* Function callback to free the memory of a registered engine function. */
+    ValkeyModuleScriptingEngineFreeFunctionFunc free_function;
+
+    /* The callback function called when `FCALL` command is called on a function
+     * registered in this engine. */
+    ValkeyModuleScriptingEngineCallFunctionFunc call_function;
+
+    /* Function callback to return memory overhead for a given function. */
+    ValkeyModuleScriptingEngineGetFunctionMemoryOverheadFunc get_function_memory_overhead;
+
+    /* Function callback to get the memory used by the engine. */
+    ValkeyModuleScriptingEngineGetMemoryInfoFunc get_memory_info;
+
+} ValkeyModuleScriptingEngineMethodsV1;
+
+#define ValkeyModuleScriptingEngineMethods ValkeyModuleScriptingEngineMethodsV1
+
 /* ------------------------- End of common defines ------------------------ */
 
 /* ----------- The rest of the defines are only for modules ----------------- */
@@ -826,7 +914,6 @@ typedef void (*ValkeyModuleUserChangedFunc)(uint64_t client_id, void *privdata);
 #endif
 
 /* Incomplete structures for compiler checks but opaque access. 
*/
-typedef struct ValkeyModuleCtx ValkeyModuleCtx;
 typedef struct ValkeyModuleCommand ValkeyModuleCommand;
 typedef struct ValkeyModuleCallReply ValkeyModuleCallReply;
 typedef struct ValkeyModuleType ValkeyModuleType;
@@ -1650,6 +1737,14 @@ VALKEYMODULE_API int (*ValkeyModule_RdbSave)(ValkeyModuleCtx *ctx,
                                              ValkeyModuleRdbStream *stream,
                                              int flags) VALKEYMODULE_ATTR;
 
+VALKEYMODULE_API int (*ValkeyModule_RegisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                             const char *engine_name,
+                                                             ValkeyModuleScriptingEngineCtx *engine_ctx,
+                                                             ValkeyModuleScriptingEngineMethods *engine_methods) VALKEYMODULE_ATTR;
+
+VALKEYMODULE_API int (*ValkeyModule_UnregisterScriptingEngine)(ValkeyModuleCtx *module_ctx,
+                                                               const char *engine_name) VALKEYMODULE_ATTR;
+
 #define ValkeyModule_IsAOFClient(id) ((id) == UINT64_MAX)
 
 /* This is included inline inside each Valkey module. */
@@ -2017,6 +2112,8 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in
     VALKEYMODULE_GET_API(RdbStreamFree);
     VALKEYMODULE_GET_API(RdbLoad);
     VALKEYMODULE_GET_API(RdbSave);
+    VALKEYMODULE_GET_API(RegisterScriptingEngine);
+    VALKEYMODULE_GET_API(UnregisterScriptingEngine);
 
     if (ValkeyModule_IsModuleNameBusy && ValkeyModule_IsModuleNameBusy(name)) return VALKEYMODULE_ERR;
     ValkeyModule_SetModuleAttribs(ctx, name, ver, apiver);
diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt
index 0cac0c4cb6..e98a878c9d 100644
--- a/tests/modules/CMakeLists.txt
+++ b/tests/modules/CMakeLists.txt
@@ -40,6 +40,7 @@ list(APPEND MODULES_LIST "moduleauthtwo")
 list(APPEND MODULES_LIST "rdbloadsave")
 list(APPEND MODULES_LIST "crash")
 list(APPEND MODULES_LIST "cluster")
+list(APPEND MODULES_LIST "helloscripting")
 
 foreach (MODULE_NAME ${MODULES_LIST})
     message(STATUS "Building test module: ${MODULE_NAME}")
diff --git a/tests/modules/Makefile b/tests/modules/Makefile
index 82813bb6f7..963546a9ff 100644
--- a/tests/modules/Makefile
+++ b/tests/modules/Makefile
@@ -65,7 +65,8 @@ TEST_MODULES = \
     moduleauthtwo.so \
     rdbloadsave.so \
     crash.so \
-    cluster.so
+    cluster.so \
+    helloscripting.so
 
 .PHONY: all
 
diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c
new file mode 100644
index 0000000000..fdca6c8e91
--- /dev/null
+++ b/tests/modules/helloscripting.c
@@ -0,0 +1,383 @@
+#include "valkeymodule.h"
+
+#include <string.h>
+#include <stdint.h>
+#include <errno.h>
+
+/*
+ * This module implements a very simple stack based scripting language.
+ * Its purpose is only to test the valkey module API to implement scripting
+ * engines.
+ *
+ * The language is called HELLO, and a program in this language is formed by
+ * a list of function definitions.
+ * The language only supports 32-bit integers, and it only allows returning an
+ * integer constant, or returning the value passed as the first argument to the
+ * function.
+ *
+ * Example of a program:
+ *
+ * ```
+ * FUNCTION foo   # declaration of function 'foo'
+ * ARGS 0         # pushes the value in the first argument to the top of the
+ *                # stack
+ * RETURN         # returns the current value on the top of the stack and marks
+ *                # the end of the function declaration
+ *
+ * FUNCTION bar   # declaration of function 'bar'
+ * CONSTI 432     # pushes the value 432 to the top of the stack
+ * RETURN         # returns the current value on the top of the stack and marks
+ *                # the end of the function declaration.
+ * ```
+ */
+
+/*
+ * List of instructions of the HELLO language.
+ */
+typedef enum HelloInstKind {
+    FUNCTION = 0,
+    CONSTI,
+    ARGS,
+    RETURN,
+    _NUM_INSTRUCTIONS, // Not a real instruction. 
+} HelloInstKind; + +/* + * String representations of the instructions above. + */ +const char *HelloInstKindStr[] = { + "FUNCTION", + "CONSTI", + "ARGS", + "RETURN", +}; + +/* + * Struct that represents an instance of an instruction. + * Instructions may have at most one parameter. + */ +typedef struct HelloInst { + HelloInstKind kind; + union { + uint32_t integer; + const char *string; + } param; +} HelloInst; + +/* + * Struct that represents an instance of a function. + * A function is just a list of instruction instances. + */ +typedef struct HelloFunc { + char *name; + HelloInst instructions[256]; + uint32_t num_instructions; +} HelloFunc; + +/* + * Struct that represents an instance of an HELLO program. + * A program is just a list of function instances. + */ +typedef struct HelloProgram { + HelloFunc *functions[16]; + uint32_t num_functions; +} HelloProgram; + +/* + * Struct that represents the runtime context of an HELLO program. + */ +typedef struct HelloLangCtx { + HelloProgram *program; +} HelloLangCtx; + + +static HelloLangCtx *hello_ctx = NULL; + + +static uint32_t str2int(const char *str) { + char *end; + errno = 0; + uint32_t val = (uint32_t)strtoul(str, &end, 10); + ValkeyModule_Assert(errno == 0); + return val; +} + +/* + * Parses the kind of instruction that the current token points to. + */ +static HelloInstKind helloLangParseInstruction(const char *token) { + for (HelloInstKind i = 0; i < _NUM_INSTRUCTIONS; i++) { + if (strcmp(HelloInstKindStr[i], token) == 0) { + return i; + } + } + return _NUM_INSTRUCTIONS; +} + +/* + * Parses the function param. + */ +static void helloLangParseFunction(HelloFunc *func) { + char *token = strtok(NULL, " \n"); + ValkeyModule_Assert(token != NULL); + func->name = ValkeyModule_Alloc(sizeof(char) * strlen(token) + 1); + strcpy(func->name, token); +} + +/* + * Parses an integer parameter. + */ +static void helloLangParseIntegerParam(HelloFunc *func) { + char *token = strtok(NULL, " \n"); + func->instructions[func->num_instructions].param.integer = str2int(token); +} + +/* + * Parses the CONSTI instruction parameter. + */ +static void helloLangParseConstI(HelloFunc *func) { + helloLangParseIntegerParam(func); + func->num_instructions++; +} + +/* + * Parses the ARGS instruction parameter. + */ +static void helloLangParseArgs(HelloFunc *func) { + helloLangParseIntegerParam(func); + func->num_instructions++; +} + +/* + * Parses an HELLO program source code. 
+ */ +static HelloProgram *helloLangParseCode(const char *code, + HelloProgram *program) { + char *_code = ValkeyModule_Alloc(sizeof(char) * strlen(code) + 1); + strcpy(_code, code); + + HelloFunc *currentFunc = NULL; + + char *token = strtok(_code, " \n"); + while (token != NULL) { + HelloInstKind kind = helloLangParseInstruction(token); + + if (currentFunc != NULL) { + currentFunc->instructions[currentFunc->num_instructions].kind = kind; + } + + switch (kind) { + case FUNCTION: + ValkeyModule_Assert(currentFunc == NULL); + currentFunc = ValkeyModule_Alloc(sizeof(HelloFunc)); + memset(currentFunc, 0, sizeof(HelloFunc)); + program->functions[program->num_functions++] = currentFunc; + helloLangParseFunction(currentFunc); + break; + case CONSTI: + ValkeyModule_Assert(currentFunc != NULL); + helloLangParseConstI(currentFunc); + break; + case ARGS: + ValkeyModule_Assert(currentFunc != NULL); + helloLangParseArgs(currentFunc); + break; + case RETURN: + ValkeyModule_Assert(currentFunc != NULL); + currentFunc->num_instructions++; + currentFunc = NULL; + break; + default: + ValkeyModule_Assert(0); + } + + token = strtok(NULL, " \n"); + } + + ValkeyModule_Free(_code); + + return program; +} + +/* + * Executes an HELLO function. + */ +static uint32_t executeHelloLangFunction(HelloFunc *func, + ValkeyModuleString **args, int nargs) { + uint32_t stack[64]; + int sp = 0; + + for (uint32_t pc = 0; pc < func->num_instructions; pc++) { + HelloInst instr = func->instructions[pc]; + switch (instr.kind) { + case CONSTI: + stack[sp++] = instr.param.integer; + break; + case ARGS: + uint32_t idx = instr.param.integer; + ValkeyModule_Assert(idx < (uint32_t)nargs); + size_t len; + const char *argStr = ValkeyModule_StringPtrLen(args[idx], &len); + uint32_t arg = str2int(argStr); + stack[sp++] = arg; + break; + case RETURN: + uint32_t val = stack[--sp]; + ValkeyModule_Assert(sp == 0); + return val; + case FUNCTION: + default: + ValkeyModule_Assert(0); + } + } + + ValkeyModule_Assert(0); + return 0; +} + +static ValkeyModuleScriptingEngineMemoryInfo engineGetMemoryInfo(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx) { + VALKEYMODULE_NOT_USED(module_ctx); + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + ValkeyModuleScriptingEngineMemoryInfo mem_info = {0}; + + if (ctx->program != NULL) { + mem_info.used_memory += ValkeyModule_MallocSize(ctx->program); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + mem_info.used_memory += ValkeyModule_MallocSize(func); + mem_info.used_memory += ValkeyModule_MallocSize(func->name); + } + } + + mem_info.engine_memory_overhead = ValkeyModule_MallocSize(ctx); + if (ctx->program != NULL) { + mem_info.engine_memory_overhead += ValkeyModule_MallocSize(ctx->program); + } + + return mem_info; +} + +static size_t engineFunctionMemoryOverhead(ValkeyModuleCtx *module_ctx, + void *compiled_function) { + VALKEYMODULE_NOT_USED(module_ctx); + HelloFunc *func = (HelloFunc *)compiled_function; + return ValkeyModule_MallocSize(func->name); +} + +static void engineFreeFunction(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + void *compiled_function) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(engine_ctx); + HelloFunc *func = (HelloFunc *)compiled_function; + ValkeyModule_Free(func->name); + func->name = NULL; + ValkeyModule_Free(func); +} + +static ValkeyModuleScriptingEngineCompiledFunction **createHelloLangEngine(ValkeyModuleCtx *module_ctx, + 
ValkeyModuleScriptingEngineCtx *engine_ctx, + const char *code, + size_t timeout, + size_t *out_num_compiled_functions, + char **err) { + VALKEYMODULE_NOT_USED(module_ctx); + VALKEYMODULE_NOT_USED(timeout); + VALKEYMODULE_NOT_USED(err); + + HelloLangCtx *ctx = (HelloLangCtx *)engine_ctx; + + if (ctx->program == NULL) { + ctx->program = ValkeyModule_Alloc(sizeof(HelloProgram)); + memset(ctx->program, 0, sizeof(HelloProgram)); + } else { + ctx->program->num_functions = 0; + } + + ctx->program = helloLangParseCode(code, ctx->program); + + ValkeyModuleScriptingEngineCompiledFunction **compiled_functions = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction *) * ctx->program->num_functions); + + for (uint32_t i = 0; i < ctx->program->num_functions; i++) { + HelloFunc *func = ctx->program->functions[i]; + + ValkeyModuleScriptingEngineCompiledFunction *cfunc = + ValkeyModule_Alloc(sizeof(ValkeyModuleScriptingEngineCompiledFunction)); + *cfunc = (ValkeyModuleScriptingEngineCompiledFunction) { + .name = ValkeyModule_CreateString(NULL, func->name, strlen(func->name)), + .function = func, + .desc = NULL, + .f_flags = 0, + }; + + compiled_functions[i] = cfunc; + } + + *out_num_compiled_functions = ctx->program->num_functions; + + return compiled_functions; +} + +static void +callHelloLangFunction(ValkeyModuleCtx *module_ctx, + ValkeyModuleScriptingEngineCtx *engine_ctx, + ValkeyModuleScriptingEngineFunctionCtx *func_ctx, + void *compiled_function, + ValkeyModuleString **keys, size_t nkeys, + ValkeyModuleString **args, size_t nargs) { + VALKEYMODULE_NOT_USED(engine_ctx); + VALKEYMODULE_NOT_USED(func_ctx); + VALKEYMODULE_NOT_USED(keys); + VALKEYMODULE_NOT_USED(nkeys); + + HelloFunc *func = (HelloFunc *)compiled_function; + uint32_t result = executeHelloLangFunction(func, args, nargs); + + ValkeyModule_ReplyWithLongLong(module_ctx, result); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, + int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "helloengine", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + hello_ctx = ValkeyModule_Alloc(sizeof(HelloLangCtx)); + hello_ctx->program = NULL; + + ValkeyModuleScriptingEngineMethods methods = { + .version = VALKEYMODULE_SCRIPTING_ENGINE_ABI_VERSION, + .create_functions_library = createHelloLangEngine, + .call_function = callHelloLangFunction, + .get_function_memory_overhead = engineFunctionMemoryOverhead, + .free_function = engineFreeFunction, + .get_memory_info = engineGetMemoryInfo, + }; + + ValkeyModule_RegisterScriptingEngine(ctx, + "HELLO", + hello_ctx, + &methods); + + return VALKEYMODULE_OK; +} + +int ValkeyModule_OnUnload(ValkeyModuleCtx *ctx) { + if (ValkeyModule_UnregisterScriptingEngine(ctx, "HELLO") != VALKEYMODULE_OK) { + ValkeyModule_Log(ctx, "error", "Failed to unregister engine"); + return VALKEYMODULE_ERR; + } + + ValkeyModule_Free(hello_ctx->program); + hello_ctx->program = NULL; + ValkeyModule_Free(hello_ctx); + hello_ctx = NULL; + + return VALKEYMODULE_OK; +} diff --git a/tests/unit/functions.tcl b/tests/unit/functions.tcl index 7ddd36dd7d..1636baaf6d 100644 --- a/tests/unit/functions.tcl +++ b/tests/unit/functions.tcl @@ -604,7 +604,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character 
long*} test {LIBRARIES - test registration with empty name} { catch { @@ -613,7 +613,7 @@ start_server {tags {"scripting"}} { } } e set _ $e - } {*Library names can only contain letters, numbers, or underscores(_) and must be at least one character long*} + } {*Function names can only contain letters, numbers, or underscores(_) and must be at least one character long*} test {LIBRARIES - math.random from function load} { catch { diff --git a/tests/unit/moduleapi/scriptingengine.tcl b/tests/unit/moduleapi/scriptingengine.tcl new file mode 100644 index 0000000000..c350633dd8 --- /dev/null +++ b/tests/unit/moduleapi/scriptingengine.tcl @@ -0,0 +1,126 @@ +set testmodule [file normalize tests/modules/helloscripting.so] + +set HELLO_PROGRAM "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 432\nRETURN" + +start_server {tags {"modules"}} { + r module load $testmodule + + r function load $HELLO_PROGRAM + + test {Load script with invalid library name} { + assert_error {ERR Library names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=my-lib\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with existing library} { + assert_error {ERR Library 'mylib' already exists} {r function load $HELLO_PROGRAM} + } + + test {Load script with invalid engine} { + assert_error {ERR Engine 'wasm' not found} {r function load "#!wasm name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no functions} { + assert_error {ERR No functions registered} {r function load "#!hello name=mylib2\n"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function foo already exists} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with no metadata header} { + assert_error {ERR Missing library metadata} {r function load "FUNCTION foo\nARGS 0\nRETURN"} + } + + test {Load script with header without lib name} { + assert_error {ERR Library name was not given} {r function load "#!hello \n"} + } + + test {Load script with header with unknown param} { + assert_error {ERR Invalid metadata value given: nme=mylib} {r function load "#!hello nme=mylib\n"} + } + + test {Load script with header with lib name passed twice} { + assert_error {ERR Invalid metadata value, name argument was given multiple times} {r function load "#!hello name=mylib2 name=mylib3\n"} + } + + test {Load script with invalid function name} { + assert_error {ERR Function names can only contain letters, numbers, or underscores(_) and must be at least one character long} {r function load "#!hello name=mylib2\nFUNCTION foo-bar\nARGS 0\nRETURN"} + } + + test {Load script with duplicate function} { + assert_error {ERR Function already exists in the library} {r function load "#!hello name=mylib2\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION foo\nARGS 0\nRETURN"} + } + + test {Call scripting engine function: calling foo works} { + r fcall foo 0 134 + } {134} + + test {Call scripting engine function: calling bar works} { + r fcall bar 0 + } {432} + + test {Replace function library and call functions} { + set result [r function load replace "#!hello name=mylib\nFUNCTION foo\nARGS 0\nRETURN\nFUNCTION bar\nCONSTI 500\nRETURN"] + assert_equal $result "mylib" + + set result [r fcall foo 0 132] + assert_equal $result 132 + + set result [r fcall bar 0] + assert_equal $result 500 + } + + test {List scripting engine functions} { + r function load replace "#!hello name=mylib\nFUNCTION foobar\nARGS 0\nRETURN" 
+ r function list + } {{library_name mylib engine HELLO functions {{name foobar description {} flags {}}}}} + + test {Load a second library and call a function} { + r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN" + set result [r fcall getarg 0 456] + assert_equal $result 456 + } + + test {Delete all libraries and functions} { + set result [r function flush] + assert_equal $result {OK} + r function list + } {} + + test {Test the deletion of a single library} { + r function load $HELLO_PROGRAM + r function load "#!hello name=mylib2\nFUNCTION getarg\nARGS 0\nRETURN" + + set result [r function delete mylib] + assert_equal $result {OK} + + set result [r fcall getarg 0 446] + assert_equal $result 446 + } + + test {Test dump and restore function library} { + r function load $HELLO_PROGRAM + + set result [r fcall bar 0] + assert_equal $result 432 + + set dump [r function dump] + + set result [r function flush] + assert_equal $result {OK} + + set result [r function restore $dump] + assert_equal $result {OK} + + set result [r fcall getarg 0 436] + assert_equal $result 436 + + set result [r fcall bar 0] + assert_equal $result 432 + } + + test {Unload scripting engine module} { + set result [r module unload helloengine] + assert_equal $result "OK" + } +} From d00c856448e918feb6bff47cf3fbd62dc0f861f5 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 23 Dec 2024 05:57:56 +0800 Subject: [PATCH 67/73] Fix switch case compilation error in the new helloscripting (#1472) It is missing the curly braces for variable declaration after case. Signed-off-by: Binbin --- tests/modules/helloscripting.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/modules/helloscripting.c b/tests/modules/helloscripting.c index fdca6c8e91..c912164bda 100644 --- a/tests/modules/helloscripting.c +++ b/tests/modules/helloscripting.c @@ -213,7 +213,7 @@ static uint32_t executeHelloLangFunction(HelloFunc *func, case CONSTI: stack[sp++] = instr.param.integer; break; - case ARGS: + case ARGS: { uint32_t idx = instr.param.integer; ValkeyModule_Assert(idx < (uint32_t)nargs); size_t len; @@ -221,10 +221,12 @@ static uint32_t executeHelloLangFunction(HelloFunc *func, uint32_t arg = str2int(argStr); stack[sp++] = arg; break; - case RETURN: + } + case RETURN: { uint32_t val = stack[--sp]; ValkeyModule_Assert(sp == 0); return val; + } case FUNCTION: default: ValkeyModule_Assert(0); From 2ee06e79837df46389ae2e23348b4c51ab25315f Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 23 Dec 2024 21:07:15 -0800 Subject: [PATCH 68/73] Remove readability refactor for failover auth to fix clang warning (#1481) As part of #1463, I made a small refactor between the PR and the daily test I submitted to try to improve readability by adding a function to abstract the extraction of the message types. However, that change apparently caused GCC to throw another warning, so reverting the abstraction on just one line. Signed-off-by: Madelyn Olson --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 9a23527b30..3d838dfe06 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4361,7 +4361,7 @@ void clusterRequestFailoverAuth(void) { /* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit * in the header to communicate the nodes receiving the message that * they should authorized the failover even if the primary is working. 
*/
-    if (server.cluster->mf_end) getMessageFromSendBlock(msgblock)->mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
+    if (server.cluster->mf_end) msgblock->data[0].msg.mflags[0] |= CLUSTERMSG_FLAG0_FORCEACK;
     clusterBroadcastMessage(msgblock);
     clusterMsgSendBlockDecrRefCount(msgblock);
 }

From f1b7f3072ce0378a181a26f3dfa5e4526b5d813b Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:13:25 +0200
Subject: [PATCH 69/73] Reduce dual channel testing time (#1477)

- By not waiting `repl-diskless-sync-delay` when we don't have to, we
  can reduce dual channel tests' execution time by ~30%.
- This commit also drops one test which is not required for regular
  sync (`Sync should continue if not all slaves dropped`).
- Skip the dual channel test with master diskless sync disabled, because
  it initiates the same synchronization process as the non-dual channel
  test, making it redundant.

Before:
```
Execution time of different units:
  171 seconds - integration/dual-channel-replication
  305 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
After:
```
Execution time of different units:
  120 seconds - integration/dual-channel-replication
  236 seconds - integration/replication-psync

\o/ All tests passed without errors!
```
Discussed on https://github.com/valkey-io/valkey/pull/1173

---------

Signed-off-by: naglera
---
 .../integration/dual-channel-replication.tcl | 60 ++++++++++---------
 tests/integration/replication-psync.tcl      |  4 ++
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index 8191b9f699..b4b9286d68 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -110,6 +110,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
 
         $primary config set rdb-key-save-delay 200
         $primary config set dual-channel-replication-enabled yes
+        $primary config set repl-diskless-sync-delay 0
 
         $replica config set dual-channel-replication-enabled yes
         $replica config set repl-diskless-sync no
@@ -201,6 +202,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
         # a replication buffer block. 
$primary config set client-output-buffer-limit "replica 1100k 0 0" $primary config set dual-channel-replication-enabled $enable + $primary config set repl-diskless-sync-delay 0 $replica config set dual-channel-replication-enabled $enable test "Toggle dual-channel-replication-enabled: $enable start" { @@ -506,6 +508,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set dual-channel-replication-enabled yes $primary config set repl-backlog-size $backlog_size $primary config set loglevel debug + $primary config set repl-diskless-sync-delay 0 if {$::valgrind} { $primary config set repl-timeout 100 } else { @@ -877,7 +880,6 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } -foreach dualchannel {yes no} { start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] set primary_host [srv 0 host] @@ -893,20 +895,20 @@ start_server {tags {"dual-channel-replication external:skip"}} { # Generating RDB will cost 5s(10000 * 0.0005s) $primary debug populate 10000 primary 1 $primary config set rdb-key-save-delay 500 - $primary config set dual-channel-replication-enabled $dualchannel + $primary config set dual-channel-replication-enabled yes start_server {} { set replica1 [srv 0 client] - $replica1 config set dual-channel-replication-enabled $dualchannel + $replica1 config set dual-channel-replication-enabled yes $replica1 config set loglevel debug start_server {} { set replica2 [srv 0 client] - $replica2 config set dual-channel-replication-enabled $dualchannel + $replica2 config set dual-channel-replication-enabled yes $replica2 config set loglevel debug $replica2 config set repl-timeout 60 set load_handle [start_one_key_write_load $primary_host $primary_port 100 "mykey1"] - test "Sync should continue if not all slaves dropped dual-channel-replication $dualchannel" { + test "Sync should continue if not all slaves dropped" { $replica1 replicaof $primary_host $primary_port $replica2 replicaof $primary_host $primary_port @@ -915,20 +917,17 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == 2 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == 2 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } - catch {$replica1 shutdown nosave} wait_for_condition 50 2000 { [status $replica2 master_link_status] == "up" && [status $primary sync_full] == 2 && - (($dualchannel == "yes" && [status $primary sync_partial_ok] == 2) || $dualchannel == "no") + ([status $primary sync_partial_ok] == 2) } else { fail "Sync session interapted\n sync_full:[status $primary sync_full]\n @@ -942,7 +941,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 - test "Primary abort sync if all slaves dropped dual-channel-replication $dualchannel" { + test "Primary abort sync if all slaves dropped dual-channel-replication" { set cur_psync [status $primary sync_partial_ok] $replica2 replicaof $primary_host $primary_port @@ -951,13 +950,11 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Sync did not 
start" } - if {$dualchannel == "yes"} { - # Wait for both replicas main conns to establish psync - wait_for_condition 50 1000 { - [status $primary sync_partial_ok] == $cur_psync + 1 - } else { - fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" - } + # Wait for both replicas main conns to establish psync + wait_for_condition 50 1000 { + [status $primary sync_partial_ok] == $cur_psync + 1 + } else { + fail "Replicas main conns didn't establish psync [status $primary sync_partial_ok]" } catch {$replica2 shutdown nosave} @@ -971,7 +968,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } } } -} + start_server {tags {"dual-channel-replication external:skip"}} { set primary [srv 0 client] @@ -982,8 +979,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set loglevel debug - $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry - + $primary config set repl-diskless-sync-delay 0 # Generating RDB will cost 500s(1000000 * 0.0001s) $primary debug populate 1000000 primary 1 $primary config set rdb-key-save-delay 100 @@ -1014,6 +1010,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"] assert {$replica_main_conn_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_main_conn_id # Wait for primary to abort the sync wait_for_condition 50 1000 { @@ -1034,6 +1031,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication replica rdb connection disconnected" { + $primary config set repl-diskless-sync-delay 0 $replica replicaof $primary_host $primary_port # Wait for sync session to start wait_for_condition 500 1000 { @@ -1048,6 +1046,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary debug log "killing replica rdb connection $replica_rdb_channel_id" assert {$replica_rdb_channel_id != ""} set loglines [count_log_lines -1] + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10 @@ -1063,6 +1062,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } test "Test dual-channel-replication primary reject set-rdb-client after client killed" { + $primary config set repl-diskless-sync-delay 0 # Ensure replica main channel will not handshake before rdb client is killed $replica debug pause-after-fork 1 $replica replicaof $primary_host $primary_port @@ -1077,6 +1077,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"] assert {$replica_rdb_channel_id != ""} $primary debug log "killing replica rdb connection $replica_rdb_channel_id" + $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry $primary client kill id $replica_rdb_channel_id # Wait for primary to abort the sync wait_and_resume_process 0 @@ -1154,7 +1155,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync yes $primary config set dual-channel-replication-enabled yes $primary config set 
     $primary config set loglevel debug
-    $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
+    $primary config set repl-diskless-sync-delay 0
 
     # Generating RDB will cost 100 sec to generate
     $primary debug populate 10000 primary 1
@@ -1185,6 +1186,7 @@
        set replica_rdb_channel_id [get_client_id_by_last_cmd $primary "sync"]
        assert {$replica_rdb_channel_id != ""}
        set loglines [count_log_lines -1]
+        $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
        $primary client kill id $replica_rdb_channel_id
        # Wait for primary to abort the sync
        wait_for_condition 50 1000 {
@@ -1192,6 +1194,7 @@
        } else {
            fail "Primary did not free repl buf block after sync failure"
        }
+        $primary config set repl-diskless-sync-delay 0
        wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
        # Replica should retry
        wait_for_condition 500 1000 {
@@ -1200,7 +1203,7 @@
            [s -1 rdb_bgsave_in_progress] eq 1
        } else {
            fail "replica didn't retry after connection close"
-            }
+        }
    }
    $replica replicaof no one
    wait_for_condition 500 1000 {
@@ -1218,11 +1221,11 @@
        } else {
            fail "replica didn't start sync session in time"
        }
-
        $primary debug log "killing replica main connection"
        set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
        assert {$replica_main_conn_id != ""}
        set loglines [count_log_lines -1]
+        $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
        $primary client kill id $replica_main_conn_id
        # Wait for primary to abort the sync
        wait_for_condition 50 1000 {
@@ -1230,6 +1233,7 @@
        } else {
            fail "Primary did not free repl buf block after sync failure"
        }
+        $primary config set repl-diskless-sync-delay 0
        wait_for_log_messages -1 {"*Background RDB transfer error*"} $loglines 1000 10
        # Replica should retry
        wait_for_condition 500 1000 {
diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl
index 4c305ebff4..88a33045f0 100644
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@@ -115,6 +115,10 @@ tags {"external:skip"} {
 foreach mdl {no yes} {
     foreach sdl {disabled swapdb} {
         foreach dualchannel {yes no} {
+            # Skip dual channel test with master diskless disabled
+            if {$dualchannel == "yes" && $mdl == "no"} {
+                continue
+            }
             test_psync {no reconnection, just sync} 6 1000000 3600 0 {
             } $mdl $sdl $dualchannel 0
 
From 9f4503ca500fe2b668decb6fb94c377fbce5d0a0 Mon Sep 17 00:00:00 2001
From: Amit Nagler <58042354+naglera@users.noreply.github.com>
Date: Tue, 24 Dec 2024 08:14:32 +0200
Subject: [PATCH 70/73] Add scoped RDB loading context and immediate abort flag (#1173)

This PR introduces a new mechanism for temporarily changing the server's
loading_rio context during RDB loading operations. The new
`RDB_SCOPED_LOADING_RIO` macro allows for a scoped change of the
`server.loading_rio` value, ensuring that it's automatically restored to
its original value when the scope ends.

Introduces a dedicated flag to `rio` to signal immediate abort,
preventing potential use-after-free scenarios during replication
disconnection in dual-channel load.
This ensures proper termination of `rdbLoadRioWithLoadingCtx` when
replication is cancelled due to connection loss on the main connection.

Fixes https://github.com/valkey-io/valkey/issues/1152

---------

Signed-off-by: naglera
Signed-off-by: Madelyn Olson
Signed-off-by: Amit Nagler <58042354+naglera@users.noreply.github.com>
Co-authored-by: Madelyn Olson
Co-authored-by: ranshid <88133677+ranshid@users.noreply.github.com>
---
 src/rdb.c                                    | 15 ++++-
 src/rdb.h                                    |  2 +-
 src/replication.c                            | 15 ++---
 src/rio.h                                    | 16 ++++-
 src/server.c                                 |  1 +
 src/server.h                                 |  1 +
 .../integration/dual-channel-replication.tcl | 62 ++++++++++++++++++-
 7 files changed, 95 insertions(+), 17 deletions(-)
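Before the diffs, a condensed sketch of the save/publish/restore idiom the
patch packages as rdbLoadRioWithLoadingCtxScopedRdb() below; here
`loading_rio` and `load` are stand-ins for `server.loading_rio` and
`rdbLoadRioWithLoadingCtx`:

    typedef struct rio rio;  /* opaque here; the real type lives in rio.h */
    static rio *loading_rio; /* stands in for server.loading_rio */

    static int load_with_scoped_rio(rio *r, int (*load)(rio *)) {
        rio *prev = loading_rio; /* remember the enclosing scope's rio */
        loading_rio = r;         /* publish: handlers may now abort this load */
        int retval = load(r);
        loading_rio = prev;      /* restore on the way out */
        return retval;
    }

Restoring the previous value, rather than resetting to NULL, is what keeps
nested or recursive loads correct.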
diff --git a/src/rdb.c b/src/rdb.c
index 5fb77a2897..a4eb2823fb 100644
--- a/src/rdb.c
+++ b/src/rdb.c
@@ -64,6 +64,7 @@ char *rdbFileBeingLoaded = NULL; /* used for rdb checking on read error */
 extern int rdbCheckMode;
 void rdbCheckError(const char *fmt, ...);
 void rdbCheckSetError(const char *fmt, ...);
+int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx);
 #ifdef __GNUC__
 void rdbReportError(int corruption_error, int linenum, char *reason, ...) __attribute__((format(printf, 3, 4)));
@@ -2991,7 +2992,19 @@ int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, s
 int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi) {
     functionsLibCtx *functions_lib_ctx = functionsLibCtxGetCurrent();
     rdbLoadingCtx loading_ctx = {.dbarray = server.db, .functions_lib_ctx = functions_lib_ctx};
-    int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, &loading_ctx);
+    int retval = rdbLoadRioWithLoadingCtxScopedRdb(rdb, rdbflags, rsi, &loading_ctx);
+    return retval;
+}
+
+/* Wrapper for rdbLoadRioWithLoadingCtx that manages a scoped RDB context.
+ * This method wraps the rdbLoadRioWithLoadingCtx function, providing temporary
+ * RDB context management. It sets a new current loading RDB, calls the wrapped
+ * function, and then restores the previous loading RDB context.
+ */
+int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx) {
+    rio *prev_rio = server.loading_rio;
+    server.loading_rio = rdb;
+    int retval = rdbLoadRioWithLoadingCtx(rdb, rdbflags, rsi, rdb_loading_ctx);
+    server.loading_rio = prev_rio;
     return retval;
 }
diff --git a/src/rdb.h b/src/rdb.h
index e9d53fa398..7342a926b5 100644
--- a/src/rdb.h
+++ b/src/rdb.h
@@ -172,7 +172,7 @@ int rdbLoadBinaryDoubleValue(rio *rdb, double *val);
 int rdbSaveBinaryFloatValue(rio *rdb, float val);
 int rdbLoadBinaryFloatValue(rio *rdb, float *val);
 int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi);
-int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx);
+int rdbLoadRioWithLoadingCtxScopedRdb(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadingCtx *rdb_loading_ctx);
 int rdbFunctionLoad(rio *rdb, int ver, functionsLibCtx *lib_ctx, int rdbflags, sds *err);
 int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi);
 ssize_t rdbSaveFunctions(rio *rdb);
diff --git a/src/replication.c b/src/replication.c
index 3a207a1d0f..f907771e71 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -2254,7 +2254,7 @@ void readSyncBulkPayload(connection *conn) {
        int loadingFailed = 0;
        rdbLoadingCtx loadingCtx = {.dbarray = dbarray, .functions_lib_ctx = functions_lib_ctx};
 
-        if (rdbLoadRioWithLoadingCtx(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) {
+        if (rdbLoadRioWithLoadingCtxScopedRdb(&rdb, RDBFLAGS_REPLICATION, &rsi, &loadingCtx) != C_OK) {
            /* RDB loading failed. */
            serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization DB "
                                  "from socket, check server logs.");
@@ -2831,18 +2831,15 @@ typedef struct replDataBufBlock {
  * Reads replication data from primary into specified repl buffer block */
 int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) {
     int nread = connRead(conn, data_block->buf + data_block->used, read);
-    if (nread == -1) {
-        if (connGetState(conn) != CONN_STATE_CONNECTED) {
-            dualChannelServerLog(LL_NOTICE, "Error reading from primary: %s", connGetLastError(conn));
+    if (nread <= 0) {
+        if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) {
+            dualChannelServerLog(LL_WARNING, "Provisional primary closed connection");
+            /* Signal ongoing RDB load to terminate gracefully */
+            if (server.loading_rio) rioCloseASAP(server.loading_rio);
             cancelReplicationHandshake(1);
         }
         return C_ERR;
     }
-    if (nread == 0) {
-        dualChannelServerLog(LL_VERBOSE, "Provisional primary closed connection");
-        cancelReplicationHandshake(1);
-        return C_ERR;
-    }
     data_block->used += nread;
     server.stat_total_reads_processed++;
     return read - nread;
diff --git a/src/rio.h b/src/rio.h
index ee0f27aa7e..d5c3263e79 100644
--- a/src/rio.h
+++ b/src/rio.h
@@ -39,6 +39,7 @@
 
 #define RIO_FLAG_READ_ERROR (1 << 0)
 #define RIO_FLAG_WRITE_ERROR (1 << 1)
+#define RIO_FLAG_CLOSE_ASAP (1 << 2) /* Rio was closed asynchronously during the current rio operation. */
 
 #define RIO_TYPE_FILE (1 << 0)
 #define RIO_TYPE_BUFFER (1 << 1)
@@ -115,7 +116,7 @@ typedef struct _rio rio;
  * if needed. */
 static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
-    if (r->flags & RIO_FLAG_WRITE_ERROR) return 0;
+    if (r->flags & RIO_FLAG_WRITE_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0;
     while (len) {
         size_t bytes_to_write = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
@@ -132,7 +133,7 @@ static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
 }
 
 static inline size_t rioRead(rio *r, void *buf, size_t len) {
-    if (r->flags & RIO_FLAG_READ_ERROR) return 0;
+    if (r->flags & RIO_FLAG_READ_ERROR || r->flags & RIO_FLAG_CLOSE_ASAP) return 0;
     while (len) {
         size_t bytes_to_read = (r->max_processing_chunk && r->max_processing_chunk < len) ? r->max_processing_chunk : len;
@@ -156,6 +157,10 @@ static inline int rioFlush(rio *r) {
     return r->flush(r);
 }
 
+static inline void rioCloseASAP(rio *r) {
+    r->flags |= RIO_FLAG_CLOSE_ASAP;
+}
+
 /* This function allows to know if there was a read error in any past
  * operation, since the rio stream was created or since the last call
  * to rioClearError(). */
@@ -168,8 +173,13 @@ static inline int rioGetWriteError(rio *r) {
     return (r->flags & RIO_FLAG_WRITE_ERROR) != 0;
 }
 
+/* Like rioGetReadError() but for async close errors. */
+static inline int rioGetAsyncCloseError(rio *r) {
+    return (r->flags & RIO_FLAG_CLOSE_ASAP) != 0;
+}
+
 static inline void rioClearErrors(rio *r) {
-    r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR);
+    r->flags &= ~(RIO_FLAG_READ_ERROR | RIO_FLAG_WRITE_ERROR | RIO_FLAG_CLOSE_ASAP);
 }
 
 void rioInitWithFile(rio *r, FILE *fp);
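Putting the pieces together: readIntoReplDataBlock() (changed above) calls
rioCloseASAP() on server.loading_rio, the next rioRead()/rioWrite() on that
rio returns 0, and the load unwinds instead of touching a freed connection.
A minimal sketch of how a loading loop can then tell a requested abort apart
from an ordinary read error (EOF handling simplified; `process_chunk` is a
hypothetical stand-in for real record parsing):

    #include "rio.h" /* for rio, rioRead, rioGetAsyncCloseError, rioGetReadError */

    static void process_chunk(unsigned char *buf, size_t len) { (void)buf; (void)len; }

    static int load_records(rio *r) {
        unsigned char buf[4096];
        while (rioRead(r, buf, sizeof(buf)) != 0) {
            process_chunk(buf, sizeof(buf)); /* parse and apply one chunk */
        }
        if (rioGetAsyncCloseError(r)) return -1; /* aborted via rioCloseASAP() */
        return rioGetReadError(r) ? -1 : 0;      /* real read error vs. clean end */
    }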
diff --git a/src/server.c b/src/server.c
index 3cdec9fa9b..8f2ddf75df 100644
--- a/src/server.c
+++ b/src/server.c
@@ -2218,6 +2218,7 @@ void initServerConfig(void) {
     server.fsynced_reploff_pending = 0;
     server.rdb_client_id = -1;
     server.loading_process_events_interval_ms = LOADING_PROCESS_EVENTS_INTERVAL_DEFAULT;
+    server.loading_rio = NULL;
 
     /* Replication partial resync backlog */
     server.repl_backlog = NULL;
diff --git a/src/server.h b/src/server.h
index 841db70614..2f8b917267 100644
--- a/src/server.h
+++ b/src/server.h
@@ -2088,6 +2088,7 @@ struct valkeyServer {
         int dbid;
     } repl_provisional_primary;
     client *cached_primary;     /* Cached primary to be reused for PSYNC. */
+    rio *loading_rio;           /* Pointer to the rio object currently used for loading data. */
     int repl_syncio_timeout;    /* Timeout for synchronous I/O calls */
     int repl_state;             /* Replication status if the instance is a replica */
     int repl_rdb_channel_state; /* State of the replica's rdb channel during dual-channel-replication */
diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl
index b4b9286d68..3adf9ce9fd 100644
--- a/tests/integration/dual-channel-replication.tcl
+++ b/tests/integration/dual-channel-replication.tcl
@@ -1158,8 +1158,8 @@ start_server {tags {"dual-channel-replication external:skip"}} {
     $primary config set repl-diskless-sync-delay 0
 
     # Generating RDB will cost 100 sec to generate
-    $primary debug populate 10000 primary 1
-    $primary config set rdb-key-save-delay 10000
+    $primary debug populate 100000 primary 1
+    $primary config set rdb-key-save-delay 1000
 
     start_server {} {
         set replica [srv 0 client]
@@ -1222,7 +1222,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
            fail "replica didn't start sync session in time"
        }
        $primary debug log "killing replica main connection"
-        set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
+        set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
        assert {$replica_main_conn_id != ""}
        set loglines [count_log_lines -1]
        $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
@@ -1247,3 +1247,59 @@ start_server {tags {"dual-channel-replication external:skip"}} {
        stop_write_load $load_handle
    }
 }
+
+
+start_server {tags {"dual-channel-replication external:skip"}} {
+    set primary [srv 0 client]
+    set primary_host [srv 0 host]
+    set primary_port [srv 0 port]
+
+    $primary config set repl-diskless-sync yes
+    $primary config set dual-channel-replication-enabled yes
+    $primary config set repl-diskless-sync-delay 5; # allow catch failed sync before retry
+
+    # Generating RDB will take 100 sec to generate
+    $primary debug populate 1000000 primary 1
+    $primary config set rdb-key-save-delay -10
+
+    start_server {} {
+        set replica [srv 0 client]
+        set replica_host [srv 0 host]
+        set replica_port [srv 0 port]
+        set replica_log [srv 0 stdout]
+
+        $replica config set dual-channel-replication-enabled yes
+        $replica config set loglevel debug
+        $replica config set repl-timeout 10
+        $replica config set repl-diskless-load flush-before-load
+
+        test "Replica notice main-connection killed during rdb load callback" {; # https://github.com/valkey-io/valkey/issues/1152
+            set loglines [count_log_lines 0]
+            $replica replicaof $primary_host $primary_port
+            # Wait for sync session to start
+            wait_for_condition 500 1000 {
+                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
+                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
+                [s -1 rdb_bgsave_in_progress] eq 1
+            } else {
+                fail "replica didn't start sync session in time"
+            }
+            wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10
+            $primary set key val
+            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
+            $primary debug log "killing replica main connection $replica_main_conn_id"
+            assert {$replica_main_conn_id != ""}
+            set loglines [count_log_lines 0]
+            $primary config set rdb-key-save-delay 0; # disable delay to allow next sync to succeed
+            $primary client kill id $replica_main_conn_id
+            # Wait for primary to abort the sync
+            wait_for_condition 50 1000 {
+                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
+            } else {
+                fail "Primary did not free repl buf block after sync failure"
+            }
+            wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10
+            verify_replica_online $primary 0 500
+        }
+    }
+}
From da92c1d6c8f3dd58f3b03da3c468a5fab162fd8f Mon Sep 17 00:00:00 2001
From: Binbin
Date: Wed, 25 Dec 2024 10:57:42 +0800
Subject: [PATCH 71/73] Document all command flags near serverCommand (#1474)

These flags are not documented here.

Signed-off-by: Binbin
---
 src/server.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/server.h b/src/server.h
index 2f8b917267..1ed6219117 100644
--- a/src/server.h
+++ b/src/server.h
@@ -248,6 +248,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT];
 #define CMD_ALLOW_BUSY ((1ULL << 26))
 #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */
 #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28)
+/* Command flags. Please don't forget to add command flag documentation in struct
+ * serverCommand in this file. */
 
 /* Command flags that describe ACLs categories. */
 #define ACL_CATEGORY_KEYSPACE (1ULL << 0)
@@ -2471,6 +2473,8 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc,
  * CMD_DENYOOM: May increase memory usage once called. Don't allow if out
  * of memory.
  *
+ * CMD_MODULE: Command exported by module.
+ *
  * CMD_ADMIN: Administrative command, like SAVE or SHUTDOWN.
  *
  * CMD_PUBSUB: Pub/Sub related command.
@@ -2517,11 +2521,22 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc,
  *
  * CMD_NO_MANDATORY_KEYS: This key arguments for this command are optional.
  *
+ * CMD_PROTECTED: The command is a protected command, see enable-debug-command for more details.
+ *
+ * CMD_MODULE_GETKEYS: Use the modules getkeys interface.
+ *
+ * CMD_MODULE_NO_CLUSTER: Deny on cluster.
+ *
  * CMD_NO_MULTI: The command is not allowed inside a transaction
  *
+ * CMD_MOVABLE_KEYS: The legacy range spec doesn't cover all keys. Populated by
+ * populateCommandLegacyRangeSpec.
+ *
  * CMD_ALLOW_BUSY: The command can run while another command is running for
  * a long time (timedout script, module command that yields)
  *
+ * CMD_MODULE_GETCHANNELS: Use the modules getchannels interface.
+ *
  * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire)
  * arbitrary key (i.e not provided in argv)
  *
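Since the patch documents the flag bits in prose only, a tiny self-contained
illustration of how such flags are combined and tested may help; the two flag
values mirror the defines above, while `mini_command` is a simplified stand-in
for struct serverCommand, not the real type:

    #include <stdint.h>
    #include <stdio.h>

    #define CMD_ALLOW_BUSY ((1ULL << 26))
    #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28)

    struct mini_command {
        const char *name;
        uint64_t flags; /* bitwise OR of CMD_* bits */
    };

    int main(void) {
        struct mini_command cmd = {"example", CMD_ALLOW_BUSY | CMD_TOUCHES_ARBITRARY_KEYS};
        if (cmd.flags & CMD_TOUCHES_ARBITRARY_KEYS)
            printf("%s may lazy-expire keys not named in argv\n", cmd.name);
        return 0;
    }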
"Primary did not free repl buf block after sync failure" + } + wait_for_log_messages 0 {"*Failed trying to load the PRIMARY synchronization DB from socket*"} $loglines 1000 10 + verify_replica_online $primary 0 500 + } + } +} From da92c1d6c8f3dd58f3b03da3c468a5fab162fd8f Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 25 Dec 2024 10:57:42 +0800 Subject: [PATCH 71/73] Document all command flags near serverCommand (#1474) These flags are not documented here. Signed-off-by: Binbin --- src/server.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/server.h b/src/server.h index 2f8b917267..1ed6219117 100644 --- a/src/server.h +++ b/src/server.h @@ -248,6 +248,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define CMD_ALLOW_BUSY ((1ULL << 26)) #define CMD_MODULE_GETCHANNELS (1ULL << 27) /* Use the modules getchannels interface. */ #define CMD_TOUCHES_ARBITRARY_KEYS (1ULL << 28) +/* Command flags. Please don't forget to add command flag documentation in struct + * serverCommand in this file. */ /* Command flags that describe ACLs categories. */ #define ACL_CATEGORY_KEYSPACE (1ULL << 0) @@ -2471,6 +2473,8 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * CMD_DENYOOM: May increase memory usage once called. Don't allow if out * of memory. * + * CMD_MODULE: Command exported by module. + * * CMD_ADMIN: Administrative command, like SAVE or SHUTDOWN. * * CMD_PUBSUB: Pub/Sub related command. @@ -2517,11 +2521,22 @@ typedef int serverGetKeysProc(struct serverCommand *cmd, robj **argv, int argc, * * CMD_NO_MANDATORY_KEYS: This key arguments for this command are optional. * + * CMD_PROTECTED: The command is a protected command, see enable-debug-command for more details. + * + * CMD_MODULE_GETKEYS: Use the modules getkeys interface. + * + * CMD_MODULE_NO_CLUSTER: Deny on cluster. + * * CMD_NO_MULTI: The command is not allowed inside a transaction * + * CMD_MOVABLE_KEYS: The legacy range spec doesn't cover all keys. Populated by + * populateCommandLegacyRangeSpec. + * * CMD_ALLOW_BUSY: The command can run while another command is running for * a long time (timedout script, module command that yields) * + * CMD_MODULE_GETCHANNELS: Use the modules getchannels interface. + * * CMD_TOUCHES_ARBITRARY_KEYS: The command may touch (and cause lazy-expire) * arbitrary key (i.e not provided in argv) * From bb325bde355040a91b6d1237fe6965dd4650b2ec Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Wed, 25 Dec 2024 04:58:49 +0200 Subject: [PATCH 72/73] Fix restore replica output bytes stat update (#1486) This PR fixes the missing stat update for `total_net_repl_output_bytes` that was removed during the refactoring in PR #758. The metric was not being updated when writing to replica connections. Changes: - Restored the stat update in postWriteToClient for replica connections - Added integration test to verify the metric is properly updated Signed-off-by: Uri Yagelnik Co-authored-by: Binbin --- src/networking.c | 2 ++ tests/integration/replication.tcl | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/networking.c b/src/networking.c index 9f36f24275..d93046a603 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2231,6 +2231,8 @@ int postWriteToClient(client *c) { server.stat_total_writes_processed++; if (getClientType(c) != CLIENT_TYPE_REPLICA) { _postWriteToClient(c); + } else { + server.stat_net_repl_output_bytes += c->nwritten > 0 ? 
diff --git a/src/networking.c b/src/networking.c
index 9f36f24275..d93046a603 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -2231,6 +2231,8 @@ int postWriteToClient(client *c) {
     server.stat_total_writes_processed++;
     if (getClientType(c) != CLIENT_TYPE_REPLICA) {
         _postWriteToClient(c);
+    } else {
+        server.stat_net_repl_output_bytes += c->nwritten > 0 ? c->nwritten : 0;
     }
 
     if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) {
diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl
index 1b5b0c030a..6d3c4e934f 100644
--- a/tests/integration/replication.tcl
+++ b/tests/integration/replication.tcl
@@ -194,6 +194,30 @@ start_server {tags {"repl external:skip"}} {
            }
            assert_match {*calls=1,*,rejected_calls=0,failed_calls=1*} [cmdrstat blpop $B]
        }
+
+        test {Replica output bytes metric} {
+            # reset stats
+            $A config resetstat
+
+            set info [$A info stats]
+            set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"]
+            assert_equal $replica_bytes_output 0
+
+            # sent set command to primary
+            $A set key value
+
+            # wait for command propagation
+            wait_for_condition 50 100 {
+                [$B get key] eq {value}
+            } else {
+                fail "Replica did not receive the command"
+            }
+
+            # get the new stats
+            set info [$A info stats]
+            set replica_bytes_output [getInfoProperty $info "total_net_repl_output_bytes"]
+            assert_morethan $replica_bytes_output 0
+        }
    }
 }
From 8b40341295eaed356adae3de811deae1a9ab4b29 Mon Sep 17 00:00:00 2001
From: gmbnomis
Date: Fri, 27 Dec 2024 00:55:20 +0100
Subject: [PATCH 73/73] Fix JSON description of SET command (#1473)

In the `arguments` section, the `arguments` key is only used for
arguments of type `block` or `oneof`. Consequently, the `arguments`
given for `IFEQ` are ignored by the server.

However, they lead to strange results when rendering the command's page
for the web documentation. Fix this by removing `arguments` for `IFEQ`.

Signed-off-by: Simon Baatz
---
 src/commands/set.json | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/commands/set.json b/src/commands/set.json
index 3d3800f11d..601bd676a2 100644
--- a/src/commands/set.json
+++ b/src/commands/set.json
@@ -111,14 +111,7 @@
                     "type": "string",
                     "token": "IFEQ",
                     "since": "8.1.0",
-                    "summary": "Sets the key's value only if the current value matches the specified comparison value.",
-                    "arguments": [
-                        {
-                            "name": "comparison-value",
-                            "type": "string",
-                            "summary": "The value to compare with the current key's value before setting."
-                        }
-                    ]
+                    "summary": "Sets the key's value only if the current value matches the specified comparison value."
                 }
             ]
         },