From 9dced28095ddf76b77614d42299324a906dcd4d8 Mon Sep 17 00:00:00 2001 From: proost Date: Sun, 12 Jan 2025 17:54:32 +0900 Subject: [PATCH 01/31] feat: add key expiration misses stat Signed-off-by: proost --- src/db.c | 9 ++++++++- src/server.c | 2 ++ src/server.h | 2 +- tests/unit/info.tcl | 31 +++++++++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/db.c b/src/db.c index 9a53e6b4d1..c2d1eea99b 100644 --- a/src/db.c +++ b/src/db.c @@ -101,6 +101,7 @@ void updateLFU(robj *val) { robj *lookupKey(serverDb *db, robj *key, int flags) { int dict_index = getKVStoreIndexForKey(key->ptr); robj *val = dbFindWithDictIndex(db, key->ptr, dict_index); + int is_expired = 0; if (val) { /* Forcing deletion of expired keys on a replica makes the replica * inconsistent with the primary. We forbid it on readonly replicas, but @@ -117,6 +118,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { if (expireIfNeededWithDictIndex(db, key, val, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. 
*/ val = NULL; + is_expired = 1; } } @@ -141,7 +143,12 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { /* TODO: Use separate hits stats for WRITE */ } else { if (!(flags & (LOOKUP_NONOTIFY | LOOKUP_WRITE))) notifyKeyspaceEvent(NOTIFY_KEY_MISS, "keymiss", key, db->id); - if (!(flags & (LOOKUP_NOSTATS | LOOKUP_WRITE))) server.stat_keyspace_misses++; + if (!(flags & (LOOKUP_NOSTATS | LOOKUP_WRITE))) { + server.stat_keyspace_misses++; + if (is_expired) { + server.stat_keyspace_expiration_misses++; + } + } /* TODO: Use separate misses stats and notify event for WRITE */ } diff --git a/src/server.c b/src/server.c index 0b88e68056..a353deef0d 100644 --- a/src/server.c +++ b/src/server.c @@ -2628,6 +2628,7 @@ void resetServerStats(void) { server.stat_total_eviction_exceeded_time = 0; server.stat_last_eviction_exceeded_time = 0; server.stat_keyspace_misses = 0; + server.stat_keyspace_expiration_misses = 0; server.stat_keyspace_hits = 0; server.stat_active_defrag_hits = 0; server.stat_active_defrag_misses = 0; @@ -5893,6 +5894,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "current_eviction_exceeded_time:%lld\r\n", current_eviction_exceeded_time / 1000, "keyspace_hits:%lld\r\n", server.stat_keyspace_hits, "keyspace_misses:%lld\r\n", server.stat_keyspace_misses, + "keyspace_expiration_misses:%lld\r\n", server.stat_keyspace_expiration_misses, "pubsub_channels:%llu\r\n", kvstoreSize(server.pubsub_channels), "pubsub_patterns:%lu\r\n", dictSize(server.pubsub_patterns), "pubsubshard_channels:%llu\r\n", kvstoreSize(server.pubsubshard_channels), diff --git a/src/server.h b/src/server.h index eb7d344a93..66f56ac279 100644 --- a/src/server.h +++ b/src/server.h @@ -932,7 +932,6 @@ typedef struct readyList { #define SELECTOR_FLAG_ALLCOMMANDS (1 << 2) /* The user can run all commands. */ #define SELECTOR_FLAG_ALLCHANNELS (1 << 3) /* The user can mention any Pub/Sub \ channel. */ - typedef struct { sds name; /* The username as an SDS string. 
*/ uint32_t flags; /* See USER_FLAG_* */ @@ -1671,6 +1670,7 @@ struct valkeyServer { monotime stat_last_eviction_exceeded_time; /* Timestamp of current eviction start, unit us */ long long stat_keyspace_hits; /* Number of successful lookups of keys */ long long stat_keyspace_misses; /* Number of failed lookups of keys */ + long long stat_keyspace_expiration_misses; /* Number of failed lookups of keys due to expiration */ long long stat_active_defrag_hits; /* number of allocations moved */ long long stat_active_defrag_misses; /* number of allocations scanned but not moved */ long long stat_active_defrag_key_hits; /* number of keys with moved allocations */ diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 4a638cac80..159d3d88a6 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -504,6 +504,37 @@ start_server {tags {"info" "external:skip" "debug_defrag:skip"}} { $r2 close wait_for_watched_clients_count 0 } + + test {stats: keyspace misses} { + # disable active expire cycle + r debug set-active-expire 0 + + # clear stats before test + r config resetstat + + # test keyspace misses + r set k1 "v1" PX 1 + + # before expiration, k1 is in db + set before [r info stats] + + # wait for key to expire + after 100 + + # after expiration, k1 is not in db + r get k1 + r get k2 + set after [r info stats] + + # check keyspace misses + assert_equal [getInfoProperty $before keyspace_misses] 0 + assert_equal [getInfoProperty $before keyspace_expiration_misses] 0 + assert_equal [getInfoProperty $after keyspace_misses] 2 + assert_equal [getInfoProperty $after keyspace_expiration_misses] 1 + + # rollback active expire cycle + r debug set-active-expire 1 + } } } From 6f58024bd22e8f539c47f5d582f9ea14e49d0dff Mon Sep 17 00:00:00 2001 From: proost Date: Sun, 12 Jan 2025 17:55:25 +0900 Subject: [PATCH 02/31] chore: rollback header file Signed-off-by: proost --- src/server.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server.h b/src/server.h index 
66f56ac279..a8d486db05 100644 --- a/src/server.h +++ b/src/server.h @@ -932,6 +932,7 @@ typedef struct readyList { #define SELECTOR_FLAG_ALLCOMMANDS (1 << 2) /* The user can run all commands. */ #define SELECTOR_FLAG_ALLCHANNELS (1 << 3) /* The user can mention any Pub/Sub \ channel. */ + typedef struct { sds name; /* The username as an SDS string. */ uint32_t flags; /* See USER_FLAG_* */ From 35a5622688401061f8f8f2272d722e2549515bc8 Mon Sep 17 00:00:00 2001 From: proost Date: Fri, 17 Jan 2025 23:24:29 +0900 Subject: [PATCH 03/31] feat: add notify lazy expired Signed-off-by: proost --- src/db.c | 13 ++++++++----- src/notify.c | 2 ++ src/server.h | 1 + tests/unit/pubsub.tcl | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/db.c b/src/db.c index c2d1eea99b..53337c371d 100644 --- a/src/db.c +++ b/src/db.c @@ -101,7 +101,7 @@ void updateLFU(robj *val) { robj *lookupKey(serverDb *db, robj *key, int flags) { int dict_index = getKVStoreIndexForKey(key->ptr); robj *val = dbFindWithDictIndex(db, key->ptr, dict_index); - int is_expired = 0; + bool is_expired = false; if (val) { /* Forcing deletion of expired keys on a replica makes the replica * inconsistent with the primary. We forbid it on readonly replicas, but @@ -118,7 +118,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { if (expireIfNeededWithDictIndex(db, key, val, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. 
*/ val = NULL; - is_expired = 1; + is_expired = true; } } @@ -1839,13 +1839,16 @@ long long getExpire(serverDb *db, robj *key) { return getExpireWithDictIndex(db, key, dict_index); } -void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int dict_index) { +void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int dict_index, bool lazy) { mstime_t expire_latency; latencyStartMonitor(expire_latency); dbGenericDeleteWithDictIndex(db, keyobj, server.lazyfree_lazy_expire, DB_FLAG_KEY_EXPIRED, dict_index); latencyEndMonitor(expire_latency); latencyAddSampleIfNeeded("expire-del", expire_latency); notifyKeyspaceEvent(NOTIFY_EXPIRED, "expired", keyobj, db->id); + if (lazy) { + notifyKeyspaceEvent(NOTIFY_LAZY_EXPIRED, "lazyexpired", keyobj, db->id); + } signalModifiedKey(NULL, db, keyobj); propagateDeletion(db, keyobj, server.lazyfree_lazy_expire); server.stat_expiredkeys++; @@ -1854,7 +1857,7 @@ void deleteExpiredKeyAndPropagateWithDictIndex(serverDb *db, robj *keyobj, int d /* Delete the specified expired key and propagate expire. */ void deleteExpiredKeyAndPropagate(serverDb *db, robj *keyobj) { int dict_index = getKVStoreIndexForKey(keyobj->ptr); - deleteExpiredKeyAndPropagateWithDictIndex(db, keyobj, dict_index); + deleteExpiredKeyAndPropagateWithDictIndex(db, keyobj, dict_index, false); } /* Delete the specified expired key from overwriting and propagate the DEL or UNLINK. 
*/ @@ -2014,7 +2017,7 @@ static keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, robj *val, key = createStringObject(key->ptr, sdslen(key->ptr)); } /* Delete the key */ - deleteExpiredKeyAndPropagateWithDictIndex(db, key, dict_index); + deleteExpiredKeyAndPropagateWithDictIndex(db, key, dict_index, true); if (static_key) { decrRefCount(key); } diff --git a/src/notify.c b/src/notify.c index d10d7dd9b9..570207fcc8 100644 --- a/src/notify.c +++ b/src/notify.c @@ -59,6 +59,7 @@ int keyspaceEventsStringToFlags(char *classes) { case 'm': flags |= NOTIFY_KEY_MISS; break; case 'd': flags |= NOTIFY_MODULE; break; case 'n': flags |= NOTIFY_NEW; break; + case 'X': flags |= NOTIFY_LAZY_EXPIRED; break; default: return -1; } } @@ -91,6 +92,7 @@ sds keyspaceEventsFlagsToString(int flags) { if (flags & NOTIFY_KEYSPACE) res = sdscatlen(res, "K", 1); if (flags & NOTIFY_KEYEVENT) res = sdscatlen(res, "E", 1); if (flags & NOTIFY_KEY_MISS) res = sdscatlen(res, "m", 1); + if (flags & NOTIFY_LAZY_EXPIRED) res = sdscatlen(res, "X", 1); return res; } diff --git a/src/server.h b/src/server.h index a8d486db05..11f0482fee 100644 --- a/src/server.h +++ b/src/server.h @@ -632,6 +632,7 @@ typedef enum { #define NOTIFY_LOADED (1 << 12) /* module only key space notification, indicate a key loaded from rdb */ #define NOTIFY_MODULE (1 << 13) /* d, module key space notification */ #define NOTIFY_NEW (1 << 14) /* n, new key notification */ +#define NOTIFY_LAZY_EXPIRED (1 << 15) /* X, lazy expire key notification */ #define NOTIFY_ALL \ (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | \ NOTIFY_EVICTED | NOTIFY_STREAM | NOTIFY_MODULE) /* A flag */ diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 24b78b6e5a..7164a67f63 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -425,6 +425,48 @@ start_server {tags {"pubsub network"}} { $rd1 close } + test "Keyspace notifications: lazy expired events" { + # 
disable active expire + r debug set-active-expire 0 + r config set notify-keyspace-events KEX + r del foo + set rd1 [valkey_deferring_client] + assert_equal {1} [psubscribe $rd1 *] + r set foo bar PX 1 + wait_for_condition 50 100 { + [r exists foo] == 0 + } else { + fail "Key does not lazy expire?!" + } + assert_equal "pmessage * __keyspace@${db}__:foo lazyexpired" [$rd1 read] + assert_equal "pmessage * __keyevent@${db}__:lazyexpired foo" [$rd1 read] + $rd1 close + # enable active expire + r debug set-active-expire 1 + } + + test "Keyspace notifications: expired and lazy expired events" { + # disable active expire + r debug set-active-expire 0 + r config set notify-keyspace-events KEXx + r del foo + set rd1 [valkey_deferring_client] + assert_equal {1} [psubscribe $rd1 *] + r set foo bar PX 1 + wait_for_condition 50 100 { + [r exists foo] == 0 + } else { + fail "Key does not lazy expire?!" + } + assert_equal "pmessage * __keyspace@${db}__:foo expired" [$rd1 read] + assert_equal "pmessage * __keyevent@${db}__:expired foo" [$rd1 read] + assert_equal "pmessage * __keyspace@${db}__:foo lazyexpired" [$rd1 read] + assert_equal "pmessage * __keyevent@${db}__:lazyexpired foo" [$rd1 read] + $rd1 close + # enable active expire + r debug set-active-expire 1 + } + test "Keyspace notifications: evicted events" { r config set notify-keyspace-events Ee r config set maxmemory-policy allkeys-lru From bc217808564bfd6fd346159f4278adef74ddee99 Mon Sep 17 00:00:00 2001 From: Rain Valentine Date: Wed, 8 Jan 2025 09:34:02 -0800 Subject: [PATCH 04/31] Replace dict with new hashtable: sorted set datatype (#1427) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR replaces dict with hashtable in the ZSET datatype. Instead of mapping key to score as dict did, the hashtable maps key to a node in the skiplist, which contains the score. 
This takes advantage of hashtable performance improvements and saves 15 bytes per set item - 24 bytes overhead before, 9 bytes after. Closes #1096 --------- Signed-off-by: Rain Valentine Signed-off-by: Viktor Söderqvist Co-authored-by: Viktor Söderqvist Signed-off-by: proost --- src/aof.c | 21 +- src/db.c | 44 +++- src/debug.c | 26 +-- src/defrag.c | 117 +++++----- src/evict.c | 6 +- src/geo.c | 2 +- src/module.c | 24 ++- src/object.c | 19 +- src/rdb.c | 4 +- src/server.c | 16 +- src/server.h | 11 +- src/sort.c | 21 +- src/t_zset.c | 593 +++++++++++++++++++++++---------------------------- 13 files changed, 420 insertions(+), 484 deletions(-) diff --git a/src/aof.c b/src/aof.c index fddadf1675..5dc12db61e 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1890,30 +1890,29 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { } } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; - - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); - double *score = dictGetVal(de); - + hashtableIterator iter; + hashtableInitIterator(&iter, zs->ht); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; if (count == 0) { int cmd_items = (items > AOF_REWRITE_ITEMS_PER_CMD) ? 
AOF_REWRITE_ITEMS_PER_CMD : items; if (!rioWriteBulkCount(r, '*', 2 + cmd_items * 2) || !rioWriteBulkString(r, "ZADD", 4) || !rioWriteBulkObject(r, key)) { - dictReleaseIterator(di); + hashtableResetIterator(&iter); return 0; } } - if (!rioWriteBulkDouble(r, *score) || !rioWriteBulkString(r, ele, sdslen(ele))) { - dictReleaseIterator(di); + sds ele = node->ele; + if (!rioWriteBulkDouble(r, node->score) || !rioWriteBulkString(r, ele, sdslen(ele))) { + hashtableResetIterator(&iter); return 0; } if (++count == AOF_REWRITE_ITEMS_PER_CMD) count = 0; items--; } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown sorted zset encoding"); } diff --git a/src/db.c b/src/db.c index 53337c371d..40315cddaf 100644 --- a/src/db.c +++ b/src/db.c @@ -1011,13 +1011,6 @@ void dictScanCallback(void *privdata, const dictEntry *de) { if (!data->only_keys) { val = dictGetVal(de); } - } else if (o->type == OBJ_ZSET) { - key = sdsdup(keysds); - if (!data->only_keys) { - char buf[MAX_LONG_DOUBLE_CHARS]; - int len = ld2string(buf, sizeof(buf), *(double *)dictGetVal(de), LD_STR_AUTO); - val = sdsnewlen(buf, len); - } } else { serverPanic("Type not handled in dict SCAN callback."); } @@ -1028,13 +1021,26 @@ void dictScanCallback(void *privdata, const dictEntry *de) { void hashtableScanCallback(void *privdata, void *entry) { scanData *data = (scanData *)privdata; + sds val = NULL; + sds key = NULL; + robj *o = data->o; list *keys = data->keys; data->sampled++; - /* currently only implemented for SET scan */ - serverAssert(o && o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE); - sds key = (sds)entry; /* Specific for OBJ_SET */ + /* This callback is only used for scanning elements within a key (hash + * fields, set elements, etc.) so o must be set here. 
*/ + serverAssert(o != NULL); + + /* get key */ + if (o->type == OBJ_SET) { + key = (sds)entry; + } else if (o->type == OBJ_ZSET) { + zskiplistNode *node = (zskiplistNode *)entry; + key = node->ele; + } else { + serverPanic("Type not handled in hashset SCAN callback."); + } /* Filter element if it does not match the pattern. */ if (data->pattern) { @@ -1043,7 +1049,23 @@ void hashtableScanCallback(void *privdata, void *entry) { } } + if (o->type == OBJ_SET) { + /* no value, key used by reference */ + } else if (o->type == OBJ_ZSET) { + /* zset data is copied */ + zskiplistNode *node = (zskiplistNode *)entry; + key = sdsdup(node->ele); + if (!data->only_keys) { + char buf[MAX_LONG_DOUBLE_CHARS]; + int len = ld2string(buf, sizeof(buf), node->score, LD_STR_AUTO); + val = sdsnewlen(buf, len); + } + } else { + serverPanic("Type not handled in hashset SCAN callback."); + } + listAddNodeTail(keys, key); + if (val) listAddNodeTail(keys, val); } /* Try to parse a SCAN cursor stored at object 'o': @@ -1191,7 +1213,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - dict_table = zs->dict; + hashtable_table = zs->ht; /* scanning ZSET allocates temporary strings even though it's a dict */ shallow_copied_list_items = 0; } diff --git a/src/debug.c b/src/debug.c index 7e52874e30..c80ff5af39 100644 --- a/src/debug.c +++ b/src/debug.c @@ -206,20 +206,20 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) } } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - dictIterator *di = dictGetIterator(zs->dict); - dictEntry *de; + hashtableIterator iter; + hashtableInitIterator(&iter, zs->ht); - while ((de = dictNext(di)) != NULL) { - sds sdsele = dictGetKey(de); - double *score = dictGetVal(de); - const int len = fpconv_dtoa(*score, buf); + void *next; + while (hashtableNext(&iter, &next)) { 
+ zskiplistNode *node = next; + const int len = fpconv_dtoa(node->score, buf); buf[len] = '\0'; memset(eledigest, 0, 20); - mixDigest(eledigest, sdsele, sdslen(sdsele)); + mixDigest(eledigest, node->ele, sdslen(node->ele)); mixDigest(eledigest, buf, strlen(buf)); xorDigest(digest, eledigest, 20); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown sorted set encoding"); } @@ -284,13 +284,11 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) * a different digest. */ void computeDatasetDigest(unsigned char *final) { unsigned char digest[20]; - robj *o; - int j; uint32_t aux; memset(final, 0, 20); /* Start with a clean result */ - for (j = 0; j < server.dbnum; j++) { + for (int j = 0; j < server.dbnum; j++) { serverDb *db = server.db + j; if (kvstoreSize(db->keys) == 0) continue; kvstoreIterator *kvs_it = kvstoreIteratorInit(db->keys); @@ -300,7 +298,9 @@ void computeDatasetDigest(unsigned char *final) { mixDigest(final, &aux, sizeof(aux)); /* Iterate this DB writing every entry */ - while (kvstoreIteratorNext(kvs_it, (void **)&o)) { + void *next; + while (kvstoreIteratorNext(kvs_it, &next)) { + robj *o = next; sds key; robj *keyobj; @@ -929,7 +929,7 @@ void debugCommand(client *c) { switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: { zset *zs = o->ptr; - d = zs->dict; + ht = zs->ht; } break; case OBJ_ENCODING_HT: d = o->ptr; break; case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break; diff --git a/src/defrag.c b/src/defrag.c index a755db559a..103730ee14 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -297,55 +297,45 @@ static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode } } -/* Defrag helper for sorted set. - * Update the robj pointer, defrag the skiplist struct and return the new score - * reference. We may not access oldele pointer (not even the pointer stored in - * the skiplist), as it was already freed. 
Newele may be null, in which case we - * only need to defrag the skiplist, but not update the obj pointer. - * When return value is non-NULL, it is the score reference that must be updated - * in the dict record. */ -static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; - int i; - sds ele = newele ? newele : oldele; - - /* find the skiplist node referring to the object that was moved, - * and all pointers that need to be updated if we'll end up moving the skiplist node. */ - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && x->level[i].forward->ele != oldele && /* make sure not to access the - ->obj pointer if it matches - oldele */ - (x->level[i].forward->score < score || - (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) - x = x->level[i].forward; +/* Hashtable scan callback for sorted set. It defragments a single skiplist + * node, updates skiplist pointers, and updates the hashtable pointer to the + * node. */ +static void activeDefragZsetNode(void *privdata, void *entry_ref) { + zskiplist *zsl = privdata; + zskiplistNode **node_ref = (zskiplistNode **)entry_ref; + zskiplistNode *node = *node_ref; + + /* defragment node internals */ + sds newsds = activeDefragSds(node->ele); + if (newsds) node->ele = newsds; + + const double score = node->score; + const sds ele = node->ele; + + /* find skiplist pointers that need to be updated if we end up moving the + * skiplist node. 
*/ + zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { + /* stop when we've reached the end of this level or the next node comes + * after our target in sorted order */ + zskiplistNode *next = x->level[i].forward; + while (next && + (next->score < score || + (next->score == score && sdscmp(next->ele, ele) < 0))) { + x = next; + next = x->level[i].forward; + } update[i] = x; } - - /* update the robj pointer inside the skip list record. */ - x = x->level[0].forward; - serverAssert(x && score == x->score && x->ele == oldele); - if (newele) x->ele = newele; + /* should have arrived at intended node */ + serverAssert(x->level[0].forward == node); /* try to defrag the skiplist record itself */ - newx = activeDefragAlloc(x); - if (newx) { - zslUpdateNode(zsl, x, newx, update); - return &newx->score; - } - return NULL; -} - -/* Defrag helper for sorted set. - * Defrag a single dict entry key name, and corresponding skiplist struct */ -static void activeDefragZsetEntry(zset *zs, dictEntry *de) { - sds newsds; - double *newscore; - sds sdsele = dictGetKey(de); - if ((newsds = activeDefragSds(sdsele))) dictSetKey(zs->dict, de, newsds); - newscore = zslDefrag(zs->zsl, *(double *)dictGetVal(de), sdsele, newsds); - if (newscore) { - dictSetVal(zs->dict, de, newscore); + zskiplistNode *newnode = activeDefragAlloc(node); + if (newnode) { + zslUpdateNode(zsl, node, newnode, update); + *node_ref = newnode; /* update hashtable pointer */ } } @@ -472,24 +462,15 @@ static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { return bookmark_failed ? 
1 : 0; } -typedef struct { - zset *zs; -} scanLaterZsetData; - -static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { - dictEntry *de = (dictEntry *)_de; - scanLaterZsetData *data = privdata; - activeDefragZsetEntry(data->zs, de); +static void scanLaterZsetCallback(void *privdata, void *element_ref) { + activeDefragZsetNode(privdata, element_ref); server.stat_active_defrag_scanned++; } static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; - dict *d = zs->dict; - scanLaterZsetData data = {zs}; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc}; - *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, &defragfns, &data); + *cursor = hashtableScanDefrag(zs->ht, *cursor, scanLaterZsetCallback, zs->zsl, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } /* Used as hashtable scan callback when all we need is to defrag the hashtable @@ -533,27 +514,27 @@ static void defragQuicklist(robj *ob) { } static void defragZsetSkiplist(robj *ob) { + serverAssert(ob->type == OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST); zset *zs = (zset *)ob->ptr; + zset *newzs; zskiplist *newzsl; - dict *newdict; - dictEntry *de; struct zskiplistNode *newheader; - serverAssert(ob->type == OBJ_ZSET && ob->encoding == OBJ_ENCODING_SKIPLIST); if ((newzs = activeDefragAlloc(zs))) ob->ptr = zs = newzs; if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; - if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) + + hashtable *newtable; + if ((newtable = hashtableDefragTables(zs->ht, activeDefragAlloc))) zs->ht = newtable; + + if (hashtableSize(zs->ht) > server.active_defrag_max_scan_fields) defragLater(ob); else { - dictIterator *di = dictGetIterator(zs->dict); - while ((de = dictNext(di)) != NULL) { - activeDefragZsetEntry(zs, de); - } - 
dictReleaseIterator(di); + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(zs->ht, cursor, activeDefragZsetNode, zs->zsl, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); } - /* defrag the dict struct and tables */ - if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } static void defragHash(robj *ob) { diff --git a/src/evict.c b/src/evict.c index eecd000a4b..d4bfade4fc 100644 --- a/src/evict.c +++ b/src/evict.c @@ -642,9 +642,9 @@ int performEvictions(void) { kvs = db->expires; } int slot = kvstoreGetFairRandomHashtableIndex(kvs); - int found = kvstoreHashtableRandomEntry(kvs, slot, (void **)&valkey); - if (found) { - bestkey = objectGetKey(valkey); + void *entry; + if (kvstoreHashtableRandomEntry(kvs, slot, &entry)) { + bestkey = objectGetKey((robj *)entry); bestdbid = j; break; } diff --git a/src/geo.c b/src/geo.c index 75654f85a5..65f17c81db 100644 --- a/src/geo.c +++ b/src/geo.c @@ -774,7 +774,7 @@ void georadiusGeneric(client *c, int srcKeyIndex, int flags) { if (maxelelen < elelen) maxelelen = elelen; totelelen += elelen; znode = zslInsert(zs->zsl, score, gp->member); - serverAssert(dictAdd(zs->dict, gp->member, &znode->score) == DICT_OK); + serverAssert(hashtableAdd(zs->ht, znode)); gp->member = NULL; } diff --git a/src/module.c b/src/module.c index 7388dc6a20..58555839f2 100644 --- a/src/module.c +++ b/src/module.c @@ -11096,12 +11096,10 @@ static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { robj *o = data->key->value; robj *field = createStringObject(key, sdslen(key)); robj *value = NULL; + if (o->type == OBJ_HASH) { sds val = dictGetVal(de); value = createStringObject(val, sdslen(val)); - } else if (o->type == OBJ_ZSET) { - double *val = (double *)dictGetVal(de); - value = createStringObjectFromLongDouble(*val, 0); } else { serverPanic("unexpected object type"); } @@ -11114,12 +11112,24 @@ static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { static void 
moduleScanKeyHashtableCallback(void *privdata, void *entry) { ScanKeyCBData *data = privdata; robj *o = data->key->value; - serverAssert(o->type == OBJ_SET); - sds key = entry; + robj *value = NULL; + sds key = NULL; + + if (o->type == OBJ_SET) { + key = entry; + /* no value */ + } else if (o->type == OBJ_ZSET) { + zskiplistNode *node = (zskiplistNode *)entry; + key = node->ele; + value = createStringObjectFromLongDouble(node->score, 0); + } else { + serverPanic("unexpected object type"); + } robj *field = createStringObject(key, sdslen(key)); - data->fn(data->key, field, NULL, data->user_data); + data->fn(data->key, field, value, data->user_data); decrRefCount(field); + if (value) decrRefCount(value); } /* Scan api that allows a module to scan the elements in a hash, set or sorted set key @@ -11183,7 +11193,7 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul } else if (o->type == OBJ_HASH) { if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; } else if (o->type == OBJ_ZSET) { - if (o->encoding == OBJ_ENCODING_SKIPLIST) d = ((zset *)o->ptr)->dict; + if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->ht; } else { errno = EINVAL; return 0; diff --git a/src/object.c b/src/object.c index 637b25e30c..86eefe43a3 100644 --- a/src/object.c +++ b/src/object.c @@ -461,7 +461,7 @@ robj *createZsetObject(void) { zset *zs = zmalloc(sizeof(*zs)); robj *o; - zs->dict = dictCreate(&zsetDictType); + zs->ht = hashtableCreate(&zsetHashtableType); zs->zsl = zslCreate(); o = createObject(OBJ_ZSET, zs); o->encoding = OBJ_ENCODING_SKIPLIST; @@ -519,7 +519,7 @@ void freeZsetObject(robj *o) { switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: zs = o->ptr; - dictRelease(zs->dict); + hashtableRelease(zs->ht); zslFree(zs->zsl); zfree(zs); break; @@ -665,10 +665,7 @@ void dismissZsetObject(robj *o, size_t size_hint) { } } - /* Dismiss hash table memory. 
*/ - dict *d = zs->dict; - dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(zs->ht); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr)); } else { @@ -1187,18 +1184,18 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); } else if (o->encoding == OBJ_ENCODING_SKIPLIST) { - d = ((zset *)o->ptr)->dict; + hashtable *ht = ((zset *)o->ptr)->ht; zskiplist *zsl = ((zset *)o->ptr)->zsl; zskiplistNode *znode = zsl->header->level[0].forward; - asize = sizeof(*o) + sizeof(zset) + sizeof(zskiplist) + sizeof(dict) + - (sizeof(struct dictEntry *) * dictBuckets(d)) + zmalloc_size(zsl->header); + asize = sizeof(*o) + sizeof(zset) + sizeof(zskiplist) + + hashtableMemUsage(ht) + zmalloc_size(zsl->header); while (znode != NULL && samples < sample_size) { elesize += sdsAllocSize(znode->ele); - elesize += dictEntryMemUsage(NULL) + zmalloc_size(znode); + elesize += zmalloc_size(znode); samples++; znode = znode->level[0].forward; } - if (samples) asize += (double)elesize / samples * dictSize(d); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else { serverPanic("Unknown sorted set encoding"); } diff --git a/src/rdb.c b/src/rdb.c index 32c9021669..6a2ec78d71 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2005,7 +2005,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o = createZsetObject(); zs = o->ptr; - if (zsetlen > DICT_HT_INITIAL_SIZE && dictTryExpand(zs->dict, zsetlen) != DICT_OK) { + if (!hashtableTryExpand(zs->ht, zsetlen)) { rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)zsetlen); decrRefCount(o); return NULL; @@ -2048,7 +2048,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int 
*error) { totelelen += sdslen(sdsele); znode = zslInsert(zs->zsl, score, sdsele); - if (dictAdd(zs->dict, sdsele, &znode->score) != DICT_OK) { + if (!hashtableAdd(zs->ht, znode)) { rdbReportCorruptRDB("Duplicate zset fields detected"); decrRefCount(o); /* no need to free 'sdsele', will be released by zslFree together with 'o' */ diff --git a/src/server.c b/src/server.c index a353deef0d..59766cccda 100644 --- a/src/server.c +++ b/src/server.c @@ -556,14 +556,16 @@ hashtableType setHashtableType = { .keyCompare = hashtableSdsKeyCompare, .entryDestructor = dictSdsDestructor}; +const void *zsetHashtableGetKey(const void *element) { + const zskiplistNode *node = element; + return node->ele; +} + /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ -dictType zsetDictType = { - dictSdsHash, /* hash function */ - NULL, /* key dup */ - dictSdsKeyCompare, /* key compare */ - NULL, /* Note: SDS string shared & freed by skiplist */ - NULL, /* val destructor */ - NULL, /* allow to expand */ +hashtableType zsetHashtableType = { + .hashFunction = dictSdsHash, + .entryGetKey = zsetHashtableGetKey, + .keyCompare = hashtableSdsKeyCompare, }; uint64_t hashtableSdsHash(const void *key) { diff --git a/src/server.h b/src/server.h index 11f0482fee..2293f3baf1 100644 --- a/src/server.h +++ b/src/server.h @@ -1324,7 +1324,7 @@ typedef struct zskiplist { } zskiplist; typedef struct zset { - dict *dict; + hashtable *ht; zskiplist *zsl; } zset; @@ -2540,7 +2540,7 @@ extern dictType objectKeyPointerValueDictType; extern dictType objectKeyHeapPointerValueDictType; extern hashtableType setHashtableType; extern dictType BenchmarkDictType; -extern dictType zsetDictType; +extern hashtableType zsetHashtableType; extern hashtableType kvstoreKeysHashtableType; extern hashtableType kvstoreExpiresHashtableType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; @@ -3086,8 +3086,6 @@ typedef struct { zskiplist *zslCreate(void); void zslFree(zskiplist *zsl); zskiplistNode 
*zslInsert(zskiplist *zsl, double score, sds ele); -unsigned char *zzlInsert(unsigned char *zl, sds ele, double score); -int zslDelete(zskiplist *zsl, double score, sds ele, zskiplistNode **node); zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n); double zzlGetScore(unsigned char *sptr); void zzlNext(unsigned char *zl, unsigned char **eptr, unsigned char **sptr); @@ -3098,9 +3096,7 @@ unsigned long zsetLength(const robj *zobj); void zsetConvert(robj *zobj, int encoding); void zsetConvertToListpackIfNeeded(robj *zobj, size_t maxelelen, size_t totelelen); int zsetScore(robj *zobj, sds member, double *score); -unsigned long zslGetRank(zskiplist *zsl, double score, sds o); int zsetAdd(robj *zobj, double score, sds ele, int in_flags, int *out_flags, double *newscore); -long zsetRank(robj *zobj, sds ele, int reverse, double *score); int zsetDel(robj *zobj, sds ele); robj *zsetDup(robj *o); void genericZpopCommand(client *c, @@ -3567,10 +3563,11 @@ unsigned long LFUDecrAndReturn(robj *o); int performEvictions(void); void startEvictionTimeProc(void); -/* Keys hashing / comparison functions for dict.c hash tables. */ +/* Keys hashing/comparison functions for dict.c and hashtable.c hash tables. 
*/ uint64_t dictSdsHash(const void *key); uint64_t dictSdsCaseHash(const void *key); int dictSdsKeyCompare(const void *key1, const void *key2); +int hashtableSdsKeyCompare(const void *key1, const void *key2); int dictSdsKeyCaseCompare(const void *key1, const void *key2); void dictSdsDestructor(void *val); void dictListDestructor(void *val); diff --git a/src/sort.c b/src/sort.c index b1723daff0..7af96141e8 100644 --- a/src/sort.c +++ b/src/sort.c @@ -330,7 +330,7 @@ void sortCommandGeneric(client *c, int readonly) { switch (sortval->type) { case OBJ_LIST: vectorlen = listTypeLength(sortval); break; case OBJ_SET: vectorlen = setTypeSize(sortval); break; - case OBJ_ZSET: vectorlen = dictSize(((zset *)sortval->ptr)->dict); break; + case OBJ_ZSET: vectorlen = hashtableSize(((zset *)sortval->ptr)->ht); break; default: vectorlen = 0; serverPanic("Bad SORT type"); /* Avoid GCC warning */ } @@ -423,7 +423,7 @@ void sortCommandGeneric(client *c, int readonly) { /* Check if starting point is trivial, before doing log(N) lookup. 
*/ if (desc) { - long zsetlen = dictSize(((zset *)sortval->ptr)->dict); + long zsetlen = hashtableSize(((zset *)sortval->ptr)->ht); ln = zsl->tail; if (start > 0) ln = zslGetElementByRank(zsl, zsetlen - start); @@ -445,19 +445,18 @@ void sortCommandGeneric(client *c, int readonly) { end -= start; start = 0; } else if (sortval->type == OBJ_ZSET) { - dict *set = ((zset *)sortval->ptr)->dict; - dictIterator *di; - dictEntry *setele; - sds sdsele; - di = dictGetIterator(set); - while ((setele = dictNext(di)) != NULL) { - sdsele = dictGetKey(setele); - vector[j].obj = createStringObject(sdsele, sdslen(sdsele)); + hashtable *ht = ((zset *)sortval->ptr)->ht; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; + vector[j].obj = createStringObject(node->ele, sdslen(node->ele)); vector[j].u.score = 0; vector[j].u.cmpobj = NULL; j++; } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown type"); } diff --git a/src/t_zset.c b/src/t_zset.c index e8c5a369b7..77c96613b7 100644 --- a/src/t_zset.c +++ b/src/t_zset.c @@ -69,10 +69,10 @@ int zslLexValueGteMin(sds value, zlexrangespec *spec); int zslLexValueLteMax(sds value, zlexrangespec *spec); void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap); -zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); +static zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); -static inline unsigned long zslGetNodeSpanAtLevel(zskiplistNode *x, int level) { +static inline unsigned long zslGetNodeSpanAtLevel(const zskiplistNode *x, int level) { /* We use the level 0 span in order to hold the node height, so in case the span is requested on * level 0 and this is not the last node we return 1 and 0 otherwise. 
For the rest of the levels we just return * the recorded span in that level. */ @@ -98,7 +98,7 @@ static inline void zslDecrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned x->level[level].span -= decr; } -static inline unsigned long zslGetNodeHeight(zskiplistNode *x) { +static inline unsigned long zslGetNodeHeight(const zskiplistNode *x) { /* Since the span at level 0 is always 1 (or 0 for the last node), this * field is instead used for storing the height of the node. */ return x->level[0].span; @@ -112,7 +112,7 @@ static inline void zslSetNodeHeight(zskiplistNode *x, int height) { /* Create a skiplist node with the specified number of levels. * The SDS string 'ele' is referenced by the node after the call. */ -zskiplistNode *zslCreateNode(int height, double score, sds ele) { +static zskiplistNode *zslCreateNode(int height, double score, sds ele) { zskiplistNode *zn = zmalloc(sizeof(*zn) + height * sizeof(struct zskiplistLevel)); zn->score = score; zn->ele = ele; @@ -141,7 +141,7 @@ zskiplist *zslCreate(void) { /* Free the specified skiplist node. The referenced SDS string representation * of the element is freed too, unless node->ele is set to NULL before calling * this function. */ -void zslFreeNode(zskiplistNode *node) { +static void zslFreeNode(zskiplistNode *node) { sdsfree(node->ele); zfree(node); } @@ -163,29 +163,44 @@ void zslFree(zskiplist *zsl) { * The return value of this function is between 1 and ZSKIPLIST_MAXLEVEL * (both inclusive), with a powerlaw-alike distribution where higher * levels are less likely to be returned. */ -int zslRandomLevel(void) { +static int zslRandomLevel(void) { static const int threshold = ZSKIPLIST_P * RAND_MAX; int level = 1; while (random() < threshold) level += 1; return (level < ZSKIPLIST_MAXLEVEL) ? level : ZSKIPLIST_MAXLEVEL; } -/* Insert a new node in the skiplist. Assumes the element does not already - * exist (up to the caller to enforce that). The skiplist takes ownership - * of the passed SDS string 'ele'. 
*/ -zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; +/* Compares node and score/ele; defines zset ordering. Return value: + * positive if a comes after b. + * negative if a comes before b. + * 0 if a's score and ele are both equal to b's. */ +static int zslCompareNodes(const zskiplistNode *a, const zskiplistNode *b) { + if (a == b) return 0; + + /* null indicates end of list - ordered after any score/ele */ + if (a == NULL) return 1; + if (b == NULL) return -1; + + if (a->score > b->score) return 1; + if (a->score < b->score) return -1; + + return sdscmp(a->ele, b->ele); +} + +/* Insert a node in the skiplist. Assumes the element does not already exist in + * the skiplist (up to the caller to enforce that). The skiplist takes ownership + * of the passed node. */ +static zskiplistNode *zslInsertNode(zskiplist *zsl, zskiplistNode *node) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; unsigned long rank[ZSKIPLIST_MAXLEVEL]; - int i, level; + const int level = zslGetNodeHeight(node); - serverAssert(!isnan(score)); - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { + serverAssert(!isnan(node->score)); + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { /* store rank that is crossed to reach the insert position */ rank[i] = i == (zsl->level - 1) ? 
0 : rank[i + 1]; - while (x->level[i].forward && - (x->level[i].forward->score < score || - (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { + while (zslCompareNodes(x->level[i].forward, node) < 0) { rank[i] += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } @@ -193,11 +208,10 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { } /* we assume the element is not already inside, since we allow duplicated * scores, reinserting the same element should never happen since the - * caller of zslInsert() should test in the hash table if the element is + * caller should test in the hash table if the element is * already inside or not. */ - level = zslRandomLevel(); if (level > zsl->level) { - for (i = zsl->level; i < level; i++) { + for (int i = zsl->level; i < level; i++) { rank[i] = 0; update[i] = zsl->header; zslSetNodeSpanAtLevel(update[i], i, zsl->length); @@ -205,33 +219,42 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { zsl->level = level; zslSetNodeHeight(zsl->header, level); } - x = zslCreateNode(level, score, ele); - for (i = 0; i < level; i++) { - x->level[i].forward = update[i]->level[i].forward; - update[i]->level[i].forward = x; + for (int i = 0; i < level; i++) { + node->level[i].forward = update[i]->level[i].forward; + update[i]->level[i].forward = node; /* update span covered by update[i] as x is inserted here */ - zslSetNodeSpanAtLevel(x, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); + zslSetNodeSpanAtLevel(node, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); zslSetNodeSpanAtLevel(update[i], i, (rank[0] - rank[i]) + 1); } /* increment span for untouched levels */ - for (i = level; i < zsl->level; i++) { + for (int i = level; i < zsl->level; i++) { zslIncrNodeSpanAtLevel(update[i], i, 1); } - x->backward = (update[0] == zsl->header) ? 
NULL : update[0]; - if (x->level[0].forward) - x->level[0].forward->backward = x; + node->backward = (update[0] == zsl->header) ? NULL : update[0]; + if (node->level[0].forward) + node->level[0].forward->backward = node; else - zsl->tail = x; + zsl->tail = node; zsl->length++; - return x; + return node; +} + +/* Insert a new node in the skiplist. Assumes the element does not already + * exist (up to the caller to enforce that). The skiplist takes ownership + * of the passed SDS string 'ele'. */ +zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { + const int level = zslRandomLevel(); + zskiplistNode *node = zslCreateNode(level, score, ele); + zslInsertNode(zsl, node); + return node; } /* Internal function used by zslDelete, zslDeleteRangeByScore and * zslDeleteRangeByRank. */ -void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { +static void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == x) { @@ -250,91 +273,64 @@ void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { zsl->length--; } -/* Delete an element with matching score/element from the skiplist. - * The function returns 1 if the node was found and deleted, otherwise - * 0 is returned. - * - * If 'node' is NULL the deleted node is freed by zslFreeNode(), otherwise - * it is not freed (but just unlinked) and *node is set to the node pointer, - * so that it is possible for the caller to reuse the node (including the - * referenced SDS string at node->ele). 
*/ -int zslDelete(zskiplist *zsl, double score, sds ele, zskiplistNode **node) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; - int i; - - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && - (x->level[i].forward->score < score || - (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { +/* Delete specified node from the skiplist. */ +static void zslDelete(zskiplist *zsl, zskiplistNode *node) { + zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { + while (zslCompareNodes(x->level[i].forward, node) < 0) { x = x->level[i].forward; } update[i] = x; } - /* We may have multiple elements with the same score, what we need - * is to find the element with both the right score and object. */ - x = x->level[0].forward; - if (x && score == x->score && sdscmp(x->ele, ele) == 0) { - zslDeleteNode(zsl, x, update); - if (!node) - zslFreeNode(x); - else - *node = x; - return 1; - } - return 0; /* not found */ + + /* We should have arrived at the correct node */ + serverAssert(x->level[0].forward == node); + + zslDeleteNode(zsl, node, update); + zslFreeNode(node); } /* Update the score of an element inside the sorted set skiplist. - * Note that the element must exist and must match 'score'. - * This function does not update the score in the hash table side, the - * caller should take care of it. + * Note that the element must exist in the skiplist. * * Note that this function attempts to just update the node, in case after - * the score update, the node would be exactly at the same position. + * the score update, the node would be exactly at the same position. If the old + * node can be kept it returns NULL. * Otherwise the skiplist is modified by removing and re-adding a new - * element, which is more costly. - * - * The function returns the updated element skiplist node pointer. 
*/ -zskiplistNode *zslUpdateScore(zskiplist *zsl, double curscore, sds ele, double newscore) { - zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; - int i; - - /* We need to seek to element to update to start: this is useful anyway, - * we'll have to update or remove it. */ - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && - (x->level[i].forward->score < curscore || - (x->level[i].forward->score == curscore && sdscmp(x->level[i].forward->ele, ele) < 0))) { + * element, which is more costly. A pointer to the new node is returned. */ +static zskiplistNode *zslUpdateScore(zskiplist *zsl, zskiplistNode *node, double newscore) { + /* If the node, after the score update, would be still exactly + * at the same position, we can just update the score without + * actually removing and re-inserting the element in the skiplist. + * (TODO: The check can be extended to check also equality of the + * score, but then we'll also need to compare the key order). */ + if ((node->backward == NULL || node->backward->score < newscore) && + (node->level[0].forward == NULL || node->level[0].forward->score > newscore)) { + node->score = newscore; + return NULL; + } + + /* We need to remove the node from the skiplist and insert a new one */ + zskiplistNode *update[ZSKIPLIST_MAXLEVEL]; + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { + while (zslCompareNodes(x->level[i].forward, node) < 0) { x = x->level[i].forward; } update[i] = x; } + /* We assume that the node exists in the skiplist */ + serverAssert(x->level[0].forward == node); - /* Jump to our element: note that this function assumes that the - * element with the matching score exists. 
*/ - x = x->level[0].forward; - serverAssert(x && curscore == x->score && sdscmp(x->ele, ele) == 0); - - /* If the node, after the score update, would be still exactly - * at the same position, we can just update the score without - * actually removing and re-inserting the element in the skiplist. */ - if ((x->backward == NULL || x->backward->score < newscore) && - (x->level[0].forward == NULL || x->level[0].forward->score > newscore)) { - x->score = newscore; - return x; - } - - /* No way to reuse the old node: we need to remove and insert a new - * one at a different place. */ - zslDeleteNode(zsl, x, update); - zskiplistNode *newnode = zslInsert(zsl, newscore, x->ele); - /* We reused the old node x->ele SDS string, free the node now - * since zslInsert created a new one. */ - x->ele = NULL; - zslFreeNode(x); - return newnode; + zslDeleteNode(zsl, node, update); + /* update pointer inside hashtable with new node */ + zskiplistNode *new_node = zslInsert(zsl, newscore, node->ele); + /* We reused the old node->ele SDS string, free the node now + * since zslInsert created a new node */ + node->ele = NULL; + zslFreeNode(node); + return new_node; } int zslValueGteMin(double value, zrangespec *spec) { @@ -442,7 +438,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { * range->maxex). When inclusive a score >= min && score <= max is deleted. * Note that this function takes the reference to the hash table view of the * sorted set, in order to remove the elements from the hash table too. 
*/ -unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, dict *dict) { +static unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, hashtable *ht) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; unsigned long removed = 0; int i; @@ -460,7 +456,7 @@ unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, dict *dic while (x && zslValueLteMax(x->score, range)) { zskiplistNode *next = x->level[0].forward; zslDeleteNode(zsl, x, update); - dictDelete(dict, x->ele); + hashtableDelete(ht, x->ele); zslFreeNode(x); /* Here is where x->ele is actually released. */ removed++; x = next; @@ -468,7 +464,7 @@ unsigned long zslDeleteRangeByScore(zskiplist *zsl, zrangespec *range, dict *dic return removed; } -unsigned long zslDeleteRangeByLex(zskiplist *zsl, zlexrangespec *range, dict *dict) { +static unsigned long zslDeleteRangeByLex(zskiplist *zsl, zlexrangespec *range, hashtable *ht) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; unsigned long removed = 0; int i; @@ -487,7 +483,7 @@ unsigned long zslDeleteRangeByLex(zskiplist *zsl, zlexrangespec *range, dict *di while (x && zslLexValueLteMax(x->ele, range)) { zskiplistNode *next = x->level[0].forward; zslDeleteNode(zsl, x, update); - dictDelete(dict, x->ele); + hashtableDelete(ht, x->ele); zslFreeNode(x); /* Here is where x->ele is actually released. */ removed++; x = next; @@ -497,7 +493,7 @@ unsigned long zslDeleteRangeByLex(zskiplist *zsl, zlexrangespec *range, dict *di /* Delete all the elements with rank between start and end from the skiplist. * Start and end are inclusive. 
Note that start and end need to be 1-based */ -unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) { +static unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, hashtable *ht) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x; unsigned long traversed = 0, removed = 0; int i; @@ -516,7 +512,7 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned while (x && traversed <= end) { zskiplistNode *next = x->level[0].forward; zslDeleteNode(zsl, x, update); - dictDelete(dict, x->ele); + hashtableDelete(ht, x->ele); zslFreeNode(x); removed++; traversed++; @@ -525,46 +521,46 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned return removed; } -/* Find the rank for an element by both score and key. +/* Find the rank for a specific skiplist node. * Returns 0 when the element cannot be found, rank otherwise. * Note that the rank is 1-based due to the span of zsl->header to the * first element. */ -unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { - zskiplistNode *x; +static unsigned long zslGetRank(zskiplist *zsl, const zskiplistNode *node) { unsigned long rank = 0; - int i; - x = zsl->header; - for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && - (x->level[i].forward->score < score || - (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) <= 0))) { + /* Count up nodes that come before */ + zskiplistNode *x = zsl->header; + for (int i = zsl->level - 1; i >= 0; i--) { + while (zslCompareNodes(x->level[i].forward, node) <= 0) { rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } - /* x might be equal to zsl->header, so test if obj is non-NULL */ - if (x->ele && x->score == score && sdscmp(x->ele, ele) == 0) { + if (x == node) { return rank; } } return 0; } -/* Find the rank for a specific skiplist node. 
*/ -unsigned long zslGetRankByNode(zskiplist *zsl, zskiplistNode *x) { - int i = zslGetNodeHeight(x) - 1; - unsigned long rank = zslGetNodeSpanAtLevel(x, i); - while (x->level[zslGetNodeHeight(x) - 1].forward) { - x = x->level[zslGetNodeHeight(x) - 1].forward; - rank += zslGetNodeSpanAtLevel(x, zslGetNodeHeight(x) - 1); +/* Find the rank for a specific skiplist node. Alternate method that counts + * nodes after the one specified and subtracts from list length. + * TODO investigate whether both rank methods are needed and compare perf */ +static unsigned long zslGetRankCountingNodesAfter(zskiplist *zsl, const zskiplistNode *node) { + int highest_node_span = zslGetNodeHeight(node) - 1; + unsigned long count_after_node = zslGetNodeSpanAtLevel(node, highest_node_span); + while (node->level[highest_node_span].forward) { + node = node->level[highest_node_span].forward; + highest_node_span = zslGetNodeHeight(node) - 1; + count_after_node += zslGetNodeSpanAtLevel(node, highest_node_span); } - rank = zsl->length - rank; + + unsigned long rank = zsl->length - count_after_node; return rank; } /* Finds an element by its rank from start node. The rank argument needs to be 1-based. */ -zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { +static zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { zskiplistNode *x; unsigned long traversed = 0; int i; @@ -639,7 +635,7 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { * * If the string is not a valid range C_ERR is returned, and the value * of *dest and *ex is undefined. 
*/ -int zslParseLexRangeItem(robj *item, sds *dest, int *ex) { +static int zslParseLexRangeItem(robj *item, sds *dest, int *ex) { char *c = item->ptr; switch (c[0]) { @@ -695,7 +691,7 @@ int zslParseLexRange(robj *min, robj *max, zlexrangespec *spec) { /* This is just a wrapper to sdscmp() that is able to * handle shared.minstring and shared.maxstring as the equivalent of * -inf and +inf for strings */ -int sdscmplex(sds a, sds b) { +static int sdscmplex(sds a, sds b) { if (a == b) return 0; if (a == shared.minstring || b == shared.maxstring) return -1; if (a == shared.maxstring || b == shared.minstring) return 1; @@ -711,7 +707,7 @@ int zslLexValueLteMax(sds value, zlexrangespec *spec) { } /* Returns if there is a part of the zset is in the lex range. */ -int zslIsInLexRange(zskiplist *zsl, zlexrangespec *range) { +static int zslIsInLexRange(zskiplist *zsl, zlexrangespec *range) { zskiplistNode *x; /* Test for ranges that will always be empty. */ @@ -806,7 +802,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { * Listpack-backed sorted set API *----------------------------------------------------------------------------*/ -double zzlStrtod(unsigned char *vstr, unsigned int vlen) { +static double zzlStrtod(unsigned char *vstr, unsigned int vlen) { char buf[128]; if (vlen > sizeof(buf) - 1) vlen = sizeof(buf) - 1; memcpy(buf, vstr, vlen); @@ -849,7 +845,7 @@ sds lpGetObject(unsigned char *sptr) { } /* Compare element in sorted set with given element. 
*/ -int zzlCompareElements(unsigned char *eptr, unsigned char *cstr, unsigned int clen) { +static int zzlCompareElements(unsigned char *eptr, unsigned char *cstr, unsigned int clen) { unsigned char *vstr; unsigned int vlen; long long vlong; @@ -869,7 +865,7 @@ int zzlCompareElements(unsigned char *eptr, unsigned char *cstr, unsigned int cl return cmp; } -unsigned int zzlLength(unsigned char *zl) { +static unsigned int zzlLength(unsigned char *zl) { return lpLength(zl) / 2; } @@ -1077,7 +1073,7 @@ unsigned char *zzlLastInLexRange(unsigned char *zl, zlexrangespec *range) { return NULL; } -unsigned char *zzlFind(unsigned char *lp, sds ele, double *score) { +static unsigned char *zzlFind(unsigned char *lp, sds ele, double *score) { unsigned char *eptr, *sptr; if ((eptr = lpFirst(lp)) == NULL) return NULL; @@ -1096,11 +1092,11 @@ unsigned char *zzlFind(unsigned char *lp, sds ele, double *score) { /* Delete (element,score) pair from listpack. Use local copy of eptr because we * don't want to modify the one given as argument. */ -unsigned char *zzlDelete(unsigned char *zl, unsigned char *eptr) { +static unsigned char *zzlDelete(unsigned char *zl, unsigned char *eptr) { return lpDeleteRangeWithEntry(zl, &eptr, 2); } -unsigned char *zzlInsertAt(unsigned char *zl, unsigned char *eptr, sds ele, double score) { +static unsigned char *zzlInsertAt(unsigned char *zl, unsigned char *eptr, sds ele, double score) { unsigned char *sptr; char scorebuf[MAX_D2STRING_CHARS]; int scorelen = 0; @@ -1128,7 +1124,7 @@ unsigned char *zzlInsertAt(unsigned char *zl, unsigned char *eptr, sds ele, doub /* Insert (element,score) pair in listpack. This function assumes the element is * not yet present in the list. 
*/ -unsigned char *zzlInsert(unsigned char *zl, sds ele, double score) { +static unsigned char *zzlInsert(unsigned char *zl, sds ele, double score) { unsigned char *eptr = lpSeek(zl, 0), *sptr; double s; @@ -1160,7 +1156,7 @@ unsigned char *zzlInsert(unsigned char *zl, sds ele, double score) { return zl; } -unsigned char *zzlDeleteRangeByScore(unsigned char *zl, zrangespec *range, unsigned long *deleted) { +static unsigned char *zzlDeleteRangeByScore(unsigned char *zl, zrangespec *range, unsigned long *deleted) { unsigned char *eptr, *sptr; double score; unsigned long num = 0; @@ -1187,7 +1183,7 @@ unsigned char *zzlDeleteRangeByScore(unsigned char *zl, zrangespec *range, unsig return zl; } -unsigned char *zzlDeleteRangeByLex(unsigned char *zl, zlexrangespec *range, unsigned long *deleted) { +static unsigned char *zzlDeleteRangeByLex(unsigned char *zl, zlexrangespec *range, unsigned long *deleted) { unsigned char *eptr, *sptr; unsigned long num = 0; @@ -1225,13 +1221,6 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig * Common sorted set API *----------------------------------------------------------------------------*/ -/* Utility function used for mapping the hashtable entry to the matching skiplist node. - * For example, this is used in case of ZRANK query. 
*/ -static inline zskiplistNode *zsetGetSLNodeByEntry(dictEntry *de) { - char *score_ref = ((char *)dictGetVal(de)); - return (zskiplistNode *)(score_ref - offsetof(zskiplistNode, score)); -} - unsigned long zsetLength(const robj *zobj) { unsigned long length = 0; if (zobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -1259,7 +1248,7 @@ robj *zsetTypeCreate(size_t size_hint, size_t val_len_hint) { robj *zobj = createZsetObject(); zset *zs = zobj->ptr; - dictExpand(zs->dict, size_hint); + hashtableExpand(zs->ht, size_hint); return zobj; } @@ -1297,11 +1286,11 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap) { if (encoding != OBJ_ENCODING_SKIPLIST) serverPanic("Unknown target encoding"); zs = zmalloc(sizeof(*zs)); - zs->dict = dictCreate(&zsetDictType); + zs->ht = hashtableCreate(&zsetHashtableType); zs->zsl = zslCreate(); /* Presize the dict to avoid rehashing */ - dictExpand(zs->dict, cap); + hashtableExpand(zs->ht, cap); eptr = lpSeek(zl, 0); if (eptr != NULL) { @@ -1318,7 +1307,7 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap) { ele = sdsnewlen((char *)vstr, vlen); node = zslInsert(zs->zsl, score, ele); - serverAssert(dictAdd(zs->dict, ele, &node->score) == DICT_OK); + serverAssert(hashtableAdd(zs->ht, node)); zzlNext(zl, &eptr, &sptr); } @@ -1333,7 +1322,7 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap) { /* Approach similar to zslFree(), since we want to free the skiplist at * the same time as creating the listpack. 
*/ zs = zobj->ptr; - dictRelease(zs->dict); + hashtableRelease(zs->ht); node = zs->zsl->header->level[0].forward; zfree(zs->zsl->header); zfree(zs->zsl); @@ -1377,9 +1366,10 @@ int zsetScore(robj *zobj, sds member, double *score) { if (zzlFind(zobj->ptr, member, score) == NULL) return C_ERR; } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; - dictEntry *de = dictFind(zs->dict, member); - if (de == NULL) return C_ERR; - *score = *(double *)dictGetVal(de); + void *entry; + if (!hashtableFind(zs->ht, member, &entry)) return C_ERR; + zskiplistNode *setElement = entry; + *score = setElement->score; } else { serverPanic("Unknown sorted set encoding"); } @@ -1504,18 +1494,17 @@ int zsetAdd(robj *zobj, double score, sds ele, int in_flags, int *out_flags, dou * converted the key to skiplist. */ if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; - zskiplistNode *znode; - dictEntry *de; - de = dictFind(zs->dict, ele); - if (de != NULL) { + void **node_ref_in_hashtable = hashtableFindRef(zs->ht, ele); + if (node_ref_in_hashtable != NULL) { /* NX? Return, same element already exists. */ if (nx) { *out_flags |= ZADD_OUT_NOP; return 1; } - curscore = *(double *)dictGetVal(de); + zskiplistNode *old_node = *node_ref_in_hashtable; + curscore = old_node->score; /* Prepare the score for the increment if needed. */ if (incr) { @@ -1536,18 +1525,17 @@ int zsetAdd(robj *zobj, double score, sds ele, int in_flags, int *out_flags, dou /* Remove and re-insert when score changes. */ if (score != curscore) { - znode = zslUpdateScore(zs->zsl, curscore, ele, score); - /* Note that we did not removed the original element from - * the hash table representing the sorted set, so we just - * update the score. */ - dictSetVal(zs->dict, de, &znode->score); /* Update score ptr. 
*/ + zskiplistNode *new_node = zslUpdateScore(zs->zsl, old_node, score); + /* Note that this assignment updates the node pointer stored in + * the hashtable */ + if (new_node) *node_ref_in_hashtable = new_node; *out_flags |= ZADD_OUT_UPDATED; } return 1; } else if (!xx) { ele = sdsdup(ele); - znode = zslInsert(zs->zsl, score, ele); - serverAssert(dictAdd(zs->dict, ele, &znode->score) == DICT_OK); + zskiplistNode *new_node = zslInsert(zs->zsl, score, ele); + serverAssert(hashtableAdd(zs->ht, new_node)); *out_flags |= ZADD_OUT_ADDED; if (newscore) *newscore = score; return 1; @@ -1561,34 +1549,20 @@ int zsetAdd(robj *zobj, double score, sds ele, int in_flags, int *out_flags, dou return 0; /* Never reached. */ } -/* Deletes the element 'ele' from the sorted set encoded as a skiplist+dict, +/* Deletes the element 'ele' from the sorted set encoded as a skiplist+hashtable, * returning 1 if the element existed and was deleted, 0 otherwise (the - * element was not there). It does not resize the dict after deleting the - * element. */ + * element was not there). */ static int zsetRemoveFromSkiplist(zset *zs, sds ele) { - dictEntry *de; - double score; - - de = dictUnlink(zs->dict, ele); - if (de != NULL) { - /* Get the score in order to delete from the skiplist later. */ - score = *(double *)dictGetVal(de); + void *entry; + if (!hashtablePop(zs->ht, ele, &entry)) return 0; + zskiplistNode *node = entry; - /* Delete from the hash table and later from the skiplist. - * Note that the order is important: deleting from the skiplist - * actually releases the SDS string representing the element, - * which is shared between the skiplist and the hash table, so - * we need to delete from the skiplist as the final step. */ - dictFreeUnlinkedEntry(zs->dict, de); + /* hashtable only contains pointers to skiplist nodes. Nothing to free. */ - /* Delete from skiplist. */ - int retval = zslDelete(zs->zsl, score, ele, NULL); - serverAssert(retval); + /* Delete from skiplist. 
*/ + zslDelete(zs->zsl, node); - return 1; - } - - return 0; + return 1; } /* Delete the element 'ele' from the sorted set, returning 1 if the element @@ -1623,7 +1597,7 @@ int zsetDel(robj *zobj, sds ele) { * the one with the lowest score. Otherwise if 'reverse' is non-zero * the rank is computed considering as element with rank 0 the one with * the highest score. */ -long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { +static long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { unsigned long llen; unsigned long rank; @@ -1656,25 +1630,19 @@ long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { } } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; - zskiplist *zsl = zs->zsl; - dictEntry *de; - double score; - de = dictFind(zs->dict, ele); - if (de != NULL) { - zskiplistNode *n = zsetGetSLNodeByEntry(de); - score = n->score; - rank = zslGetRankByNode(zsl, n); - /* Existing elements always have a rank. */ - serverAssert(rank != 0); - if (output_score) *output_score = score; - if (reverse) - return llen - rank; - else - return rank - 1; - } else { - return -1; - } + void *entry; + if (!hashtableFind(zs->ht, ele, &entry)) return -1; + zskiplistNode *node = entry; + + rank = zslGetRankCountingNodesAfter(zs->zsl, node); + /* Existing elements always have a rank. 
*/ + serverAssert(rank != 0); + if (output_score) *output_score = node->score; + if (reverse) + return llen - rank; + else + return rank - 1; } else { serverPanic("Unknown sorted set encoding"); } @@ -1704,7 +1672,7 @@ robj *zsetDup(robj *o) { zobj = createZsetObject(); zs = o->ptr; new_zs = zobj->ptr; - dictExpand(new_zs->dict, dictSize(zs->dict)); + hashtableExpand(new_zs->ht, hashtableSize(zs->ht)); zskiplist *zsl = zs->zsl; zskiplistNode *ln; sds ele; @@ -1721,7 +1689,7 @@ robj *zsetDup(robj *o) { ele = ln->ele; sds new_ele = sdsdup(ele); zskiplistNode *znode = zslInsert(new_zs->zsl, ln->score, new_ele); - dictAdd(new_zs->dict, new_ele, &znode->score); + hashtableAdd(new_zs->ht, znode); ln = ln->backward; } } else { @@ -1748,14 +1716,15 @@ void zsetReplyFromListpackEntry(client *c, listpackEntry *e) { * 'key' and 'val' will be set to hold the element. * The memory in `key` is not to be freed or modified by the caller. * 'score' can be NULL in which case it's not extracted. */ -void zsetTypeRandomElement(robj *zsetobj, unsigned long zsetsize, listpackEntry *key, double *score) { +static void zsetTypeRandomElement(robj *zsetobj, unsigned long zsetsize, listpackEntry *key, double *score) { if (zsetobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zsetobj->ptr; - dictEntry *de = dictGetFairRandomKey(zs->dict); - sds s = dictGetKey(de); - key->sval = (unsigned char *)s; - key->slen = sdslen(s); - if (score) *score = *(double *)dictGetVal(de); + void *entry; + hashtableFairRandomEntry(zs->ht, &entry); + zskiplistNode *node = entry; + key->sval = (unsigned char *)node->ele; + key->slen = sdslen(node->ele); + if (score) *score = node->score; } else if (zsetobj->encoding == OBJ_ENCODING_LISTPACK) { listpackEntry val; lpRandomPair(zsetobj->ptr, zsetsize, key, &val); @@ -1776,7 +1745,7 @@ void zsetTypeRandomElement(robj *zsetobj, unsigned long zsetsize, listpackEntry *----------------------------------------------------------------------------*/ /* This generic command 
implements both ZADD and ZINCRBY. */ -void zaddGenericCommand(client *c, int flags) { +static void zaddGenericCommand(client *c, int flags) { static char *nanerr = "resulting score is not a number (NaN)"; robj *key = c->argv[1]; robj *zobj; @@ -2012,19 +1981,17 @@ void zremrangeGenericCommand(client *c, zrange_type rangetype) { } } else if (zobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zobj->ptr; - dictPauseAutoResize(zs->dict); + hashtablePauseAutoShrink(zs->ht); switch (rangetype) { case ZRANGE_AUTO: - case ZRANGE_RANK: deleted = zslDeleteRangeByRank(zs->zsl, start + 1, end + 1, zs->dict); break; - case ZRANGE_SCORE: deleted = zslDeleteRangeByScore(zs->zsl, &range, zs->dict); break; - case ZRANGE_LEX: deleted = zslDeleteRangeByLex(zs->zsl, &lexrange, zs->dict); break; + case ZRANGE_RANK: deleted = zslDeleteRangeByRank(zs->zsl, start + 1, end + 1, zs->ht); break; + case ZRANGE_SCORE: deleted = zslDeleteRangeByScore(zs->zsl, &range, zs->ht); break; + case ZRANGE_LEX: deleted = zslDeleteRangeByLex(zs->zsl, &lexrange, zs->ht); break; } - dictResumeAutoResize(zs->dict); - if (dictSize(zs->dict) == 0) { + hashtableResumeAutoShrink(zs->ht); + if (hashtableSize(zs->ht) == 0) { dbDelete(c->db, key); keyremoved = 1; - } else { - dictShrinkIfNeeded(zs->dict); } } else { serverPanic("Unknown sorted set encoding"); @@ -2116,7 +2083,7 @@ typedef struct { typedef union _iterset iterset; typedef union _iterzset iterzset; -void zuiInitIterator(zsetopsrc *op) { +static void zuiInitIterator(zsetopsrc *op) { if (op->subject == NULL) return; if (op->type == OBJ_SET) { @@ -2155,7 +2122,7 @@ void zuiInitIterator(zsetopsrc *op) { } } -void zuiClearIterator(zsetopsrc *op) { +static void zuiClearIterator(zsetopsrc *op) { if (op->subject == NULL) return; if (op->type == OBJ_SET) { @@ -2183,7 +2150,7 @@ void zuiClearIterator(zsetopsrc *op) { } } -void zuiDiscardDirtyValue(zsetopval *val) { +static void zuiDiscardDirtyValue(zsetopval *val) { if (val->flags & OPVAL_DIRTY_SDS) { 
sdsfree(val->ele); val->ele = NULL; @@ -2191,7 +2158,7 @@ void zuiDiscardDirtyValue(zsetopval *val) { } } -unsigned long zuiLength(zsetopsrc *op) { +static unsigned long zuiLength(zsetopsrc *op) { if (op->subject == NULL) return 0; if (op->type == OBJ_SET) { @@ -2213,7 +2180,7 @@ unsigned long zuiLength(zsetopsrc *op) { /* Check if the current value is valid. If so, store it in the passed structure * and move to the next element. If not valid, this means we have reached the * end of the structure and can abort. */ -int zuiNext(zsetopsrc *op, zsetopval *val) { +static int zuiNext(zsetopsrc *op, zsetopval *val) { if (op->subject == NULL) return 0; zuiDiscardDirtyValue(val); @@ -2272,23 +2239,7 @@ int zuiNext(zsetopsrc *op, zsetopval *val) { return 1; } -int zuiLongLongFromValue(zsetopval *val) { - if (!(val->flags & OPVAL_DIRTY_LL)) { - val->flags |= OPVAL_DIRTY_LL; - - if (val->ele != NULL) { - if (string2ll(val->ele, sdslen(val->ele), &val->ell)) val->flags |= OPVAL_VALID_LL; - } else if (val->estr != NULL) { - if (string2ll((char *)val->estr, val->elen, &val->ell)) val->flags |= OPVAL_VALID_LL; - } else { - /* The long long was already set, flag as valid. */ - val->flags |= OPVAL_VALID_LL; - } - } - return val->flags & OPVAL_VALID_LL; -} - -sds zuiSdsFromValue(zsetopval *val) { +static sds zuiSdsFromValue(zsetopval *val) { if (val->ele == NULL) { if (val->estr != NULL) { val->ele = sdsnewlen((char *)val->estr, val->elen); @@ -2302,7 +2253,7 @@ sds zuiSdsFromValue(zsetopval *val) { /* This is different from zuiSdsFromValue since returns a new SDS string * which is up to the caller to free. */ -sds zuiNewSdsFromValue(zsetopval *val) { +static sds zuiNewSdsFromValue(zsetopval *val) { if (val->flags & OPVAL_DIRTY_SDS) { /* We have already one to return! 
*/ sds ele = val->ele; @@ -2318,22 +2269,9 @@ sds zuiNewSdsFromValue(zsetopval *val) { } } -int zuiBufferFromValue(zsetopval *val) { - if (val->estr == NULL) { - if (val->ele != NULL) { - val->elen = sdslen(val->ele); - val->estr = (unsigned char *)val->ele; - } else { - val->elen = ll2string((char *)val->_buf, sizeof(val->_buf), val->ell); - val->estr = val->_buf; - } - } - return 1; -} - /* Find value pointed to by val in the source pointer to by op. When found, * return 1 and store its score in target. Return 0 otherwise. */ -int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { +static int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { if (op->subject == NULL) return 0; if (op->type == OBJ_SET) { @@ -2357,9 +2295,10 @@ int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { } } else if (op->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = op->subject->ptr; - dictEntry *de; - if ((de = dictFind(zs->dict, val->ele)) != NULL) { - *score = *(double *)dictGetVal(de); + void *entry; + if (hashtableFind(zs->ht, val->ele, &entry)) { + zskiplistNode *node = entry; + *score = node->score; return 1; } else { return 0; @@ -2372,7 +2311,7 @@ int zuiFind(zsetopsrc *op, zsetopval *val, double *score) { } } -int zuiCompareByCardinality(const void *s1, const void *s2) { +static int zuiCompareByCardinality(const void *s1, const void *s2) { unsigned long first = zuiLength((zsetopsrc *)s1); unsigned long second = zuiLength((zsetopsrc *)s2); if (first > second) return 1; @@ -2406,20 +2345,19 @@ inline static void zunionInterAggregate(double *target, double val, int aggregat } } -static size_t zsetDictGetMaxElementLength(dict *d, size_t *totallen) { - dictIterator *di; - dictEntry *de; +static size_t zsetHashtableGetMaxElementLength(hashtable *ht, size_t *totallen) { size_t maxelelen = 0; - di = dictGetIterator(d); - - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); - if (sdslen(ele) > maxelelen) maxelelen = sdslen(ele); - if (totallen) 
(*totallen) += sdslen(ele); + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; + size_t elelen = sdslen(node->ele); + if (elelen > maxelelen) maxelelen = elelen; + if (totallen) (*totallen) += elelen; } - - dictReleaseIterator(di); + hashtableResetIterator(&iter); return maxelelen; } @@ -2469,7 +2407,7 @@ static void zdiffAlgorithm1(zsetopsrc *src, long setnum, zset *dstzset, size_t * if (!exists) { tmp = zuiNewSdsFromValue(&zval); znode = zslInsert(dstzset->zsl, zval.score, tmp); - dictAdd(dstzset->dict, tmp, &znode->score); + hashtableAdd(dstzset->ht, znode); if (sdslen(tmp) > *maxelelen) *maxelelen = sdslen(tmp); (*totelelen) += sdslen(tmp); } @@ -2500,6 +2438,7 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t * zskiplistNode *znode; sds tmp; + hashtablePauseAutoShrink(dstzset->ht); for (j = 0; j < setnum; j++) { if (zuiLength(&src[j]) == 0) continue; @@ -2509,15 +2448,13 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t * if (j == 0) { tmp = zuiNewSdsFromValue(&zval); znode = zslInsert(dstzset->zsl, zval.score, tmp); - dictAdd(dstzset->dict, tmp, &znode->score); + hashtableAdd(dstzset->ht, znode); cardinality++; } else { - dictPauseAutoResize(dstzset->dict); tmp = zuiSdsFromValue(&zval); if (zsetRemoveFromSkiplist(dstzset, tmp)) { cardinality--; } - dictResumeAutoResize(dstzset->dict); } /* Exit if result set is empty as any additional removal @@ -2530,16 +2467,14 @@ static void zdiffAlgorithm2(zsetopsrc *src, long setnum, zset *dstzset, size_t * } /* Resize dict if needed after removing multiple elements */ - dictShrinkIfNeeded(dstzset->dict); + hashtableResumeAutoShrink(dstzset->ht); /* Using this algorithm, we can't calculate the max element as we go, * we have to iterate through all elements to find the max one after. 
*/ - *maxelelen = zsetDictGetMaxElementLength(dstzset->dict, totelelen); + *maxelelen = zsetHashtableGetMaxElementLength(dstzset->ht, totelelen); } static int zsetChooseDiffAlgorithm(zsetopsrc *src, long setnum) { - int j; - /* Select what DIFF algorithm to use. * * Algorithm 1 is O(N*M + K*log(K)) where N is the size of the @@ -2554,7 +2489,7 @@ static int zsetChooseDiffAlgorithm(zsetopsrc *src, long setnum) { long long algo_one_work = 0; long long algo_two_work = 0; - for (j = 0; j < setnum; j++) { + for (int j = 0; j < setnum; j++) { /* If any other set is equal to the first set, there is nothing to be * done, since we would remove all elements anyway. */ if (j > 0 && src[0].subject == src[j].subject) { @@ -2597,7 +2532,7 @@ static void zdiff(zsetopsrc *src, long setnum, zset *dstzset, size_t *maxelelen, * 'cardinality_only' is currently only applicable when 'op' is SET_OP_INTER. * Work for SINTERCARD, only return the cardinality with minimum processing and memory overheads. */ -void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, int op, int cardinality_only) { +static void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, int op, int cardinality_only) { int i, j; long setnum; int aggregate = REDIS_AGGR_SUM; @@ -2607,7 +2542,6 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in size_t maxelelen = 0, totelelen = 0; robj *dstobj = NULL; zset *dstzset = NULL; - zskiplistNode *znode; int withscores = 0; unsigned long cardinality = 0; long limit = 0; /* Stop searching after reaching the limit. 0 means unlimited. 
*/ @@ -2762,8 +2696,8 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in } } else if (j == setnum) { tmp = zuiNewSdsFromValue(&zval); - znode = zslInsert(dstzset->zsl, score, tmp); - dictAdd(dstzset->dict, tmp, &znode->score); + zskiplistNode *znode = zslInsert(dstzset->zsl, score, tmp); + hashtableAdd(dstzset->ht, znode); totelelen += sdslen(tmp); if (sdslen(tmp) > maxelelen) maxelelen = sdslen(tmp); } @@ -2771,64 +2705,58 @@ void zunionInterDiffGenericCommand(client *c, robj *dstkey, int numkeysIndex, in zuiClearIterator(&src[0]); } } else if (op == SET_OP_UNION) { - dictIterator *di; - dictEntry *de, *existing; - double score; - + /* Step 1: Create the hash table first by iterating one sorted set after + * the other. We wait to create the skiplist until scores/ordering are + * finalized. */ if (setnum) { /* Our union is at least as large as the largest set. * Resize the dictionary ASAP to avoid useless rehashing. */ - dictExpand(dstzset->dict, zuiLength(&src[setnum - 1])); + hashtableExpand(dstzset->ht, zuiLength(&src[setnum - 1])); } - - /* Step 1: Create a dictionary of elements -> aggregated-scores - * by iterating one sorted set after the other. */ for (i = 0; i < setnum; i++) { if (zuiLength(&src[i]) == 0) continue; zuiInitIterator(&src[i]); while (zuiNext(&src[i], &zval)) { /* Initialize value */ - score = src[i].weight * zval.score; + double score = src[i].weight * zval.score; if (isnan(score)) score = 0; /* Search for this element in the accumulating dictionary. */ - de = dictAddRaw(dstzset->dict, zuiSdsFromValue(&zval), &existing); + sds sdsval = zuiSdsFromValue(&zval); + hashtablePosition position; /* If we don't have it, we need to create a new entry. 
*/ - if (!existing) { - tmp = zuiNewSdsFromValue(&zval); + void *existing; + if (hashtableFindPositionForInsert(dstzset->ht, sdsval, &position, &existing)) { + zskiplistNode *new_node = zslCreateNode(zslRandomLevel(), score, zuiNewSdsFromValue(&zval)); + hashtableInsertAtPosition(dstzset->ht, new_node, &position); /* Remember the longest single element encountered, * to understand if it's possible to convert to listpack * at the end. */ - totelelen += sdslen(tmp); - if (sdslen(tmp) > maxelelen) maxelelen = sdslen(tmp); - /* Update the element with its initial score. */ - dictSetKey(dstzset->dict, de, tmp); - dictSetDoubleVal(de, score); + totelelen += sdslen(new_node->ele); + if (sdslen(new_node->ele) > maxelelen) { + maxelelen = sdslen(new_node->ele); + } } else { /* Update the score with the score of the new instance - * of the element found in the current sorted set. - * - * Here we access directly the dictEntry double - * value inside the union as it is a big speedup - * compared to using the getDouble/setDouble API. */ - double *existing_score_ptr = dictGetDoubleValPtr(existing); - zunionInterAggregate(existing_score_ptr, score, aggregate); + * of the element found in the current sorted set. */ + zskiplistNode *node = existing; + zunionInterAggregate(&node->score, score, aggregate); } } zuiClearIterator(&src[i]); } - /* Step 2: convert the dictionary into the final sorted set. 
*/ - di = dictGetIterator(dstzset->dict); + /* Step 2: Create the skiplist using final score ordering */ + hashtableIterator iter; + hashtableInitIterator(&iter, dstzset->ht); - while ((de = dictNext(di)) != NULL) { - sds ele = dictGetKey(de); - score = dictGetDoubleVal(de); - znode = zslInsert(dstzset->zsl, score, ele); - dictSetVal(dstzset->dict, de, &znode->score); + void *next; + while (hashtableNext(&iter, &next)) { + zskiplistNode *node = next; + zslInsertNode(dstzset->zsl, node); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else if (op == SET_OP_DIFF) { zdiff(src, setnum, dstzset, &maxelelen, &totelelen); } else { @@ -3381,7 +3309,7 @@ void zcountCommand(client *c) { /* Use rank of first element, if any, to determine preliminary count */ if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); + rank = zslGetRank(zsl, zn); count = (zsl->length - (rank - 1)); /* Find last element in range */ @@ -3389,7 +3317,7 @@ void zcountCommand(client *c) { /* Use rank of last element, if any, to determine the actual count */ if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); + rank = zslGetRank(zsl, zn); count -= (zsl->length - rank); } } @@ -3457,7 +3385,7 @@ void zlexcountCommand(client *c) { /* Use rank of first element, if any, to determine preliminary count */ if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); + rank = zslGetRank(zsl, zn); count = (zsl->length - (rank - 1)); /* Find last element in range */ @@ -3465,7 +3393,7 @@ void zlexcountCommand(client *c) { /* Use rank of last element, if any, to determine the actual count */ if (zn != NULL) { - rank = zslGetRank(zsl, zn->score, zn->ele); + rank = zslGetRank(zsl, zn); count -= (zsl->length - rank); } } @@ -4170,11 +4098,12 @@ void zrandmemberWithCountCommand(client *c, long l, int withscores) { if (zsetobj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = zsetobj->ptr; while (count--) { - dictEntry *de = dictGetFairRandomKey(zs->dict); - sds key = 
dictGetKey(de); + void *entry; + serverAssert(hashtableFairRandomEntry(zs->ht, &entry)); + zskiplistNode *node = entry; if (withscores && c->resp > 2) addReplyArrayLen(c, 2); - addReplyBulkCBuffer(c, key, sdslen(key)); - if (withscores) addReplyDouble(c, *(double *)dictGetVal(de)); + addReplyBulkCBuffer(c, node->ele, sdslen(node->ele)); + if (withscores) addReplyDouble(c, node->score); if (c->flag.close_asap) break; } } else if (zsetobj->encoding == OBJ_ENCODING_LISTPACK) { From 980ab2a98f78acf9eacbf75ca98342e7f366d503 Mon Sep 17 00:00:00 2001 From: Rueian Date: Wed, 8 Jan 2025 10:05:20 -0800 Subject: [PATCH 05/31] Skip logreqres on tests for the HELLO command (#1528) Skip logreqres on tests for the HELLO command Signed-off-by: Rueian Signed-off-by: proost --- tests/unit/protocol.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/protocol.tcl b/tests/unit/protocol.tcl index f0e64368cc..0d05f6dee5 100644 --- a/tests/unit/protocol.tcl +++ b/tests/unit/protocol.tcl @@ -232,7 +232,7 @@ start_server {tags {"protocol network"}} { } -start_server {tags {"protocol hello"}} { +start_server {tags {"protocol hello logreqres:skip"}} { test {HELLO without protover} { set reply [r HELLO 3] assert_equal [dict get $reply proto] 3 From 8d5f1519f15d8f2954f5fcf935189d84b1e29ae3 Mon Sep 17 00:00:00 2001 From: Nikhil Manglore Date: Wed, 8 Jan 2025 12:03:06 -0800 Subject: [PATCH 06/31] valkey-cli auto-exit from subscribed mode (#1432) Resolves issue with valkey-cli not auto exiting from subscribed mode on reaching zero pub/sub subscription (previously filed on Redis) https://github.com/redis/redis/issues/12592 --------- Signed-off-by: Nikhil Manglore Signed-off-by: proost --- src/valkey-cli.c | 45 +++++- tests/integration/valkey-cli.tcl | 226 +++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 4 deletions(-) diff --git a/src/valkey-cli.c b/src/valkey-cli.c index 0a4f1affa2..d2fa537036 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ 
-218,6 +218,8 @@ static struct config { int shutdown; int monitor_mode; int pubsub_mode; + int pubsub_unsharded_count; /* channels and patterns */ + int pubsub_sharded_count; /* shard channels */ int blocking_state_aborted; /* used to abort monitor_mode and pubsub_mode. */ int latency_mode; int latency_dist_mode; @@ -2229,6 +2231,28 @@ static int cliReadReply(int output_raw_strings) { return REDIS_OK; } +/* Helper method to handle pubsub subscription/unsubscription. */ +static void handlePubSubMode(redisReply *reply) { + char *cmd = reply->element[0]->str; + int count = reply->element[2]->integer; + + /* Update counts based on the command type */ + if (strcmp(cmd, "subscribe") == 0 || strcmp(cmd, "psubscribe") == 0 || strcmp(cmd, "unsubscribe") == 0 || strcmp(cmd, "punsubscribe") == 0) { + config.pubsub_unsharded_count = count; + } else if (strcmp(cmd, "ssubscribe") == 0 || strcmp(cmd, "sunsubscribe") == 0) { + config.pubsub_sharded_count = count; + } + + /* Update pubsub mode based on the current counts */ + if (config.pubsub_unsharded_count + config.pubsub_sharded_count == 0 && config.pubsub_mode) { + config.pubsub_mode = 0; + cliRefreshPrompt(); + } else if (config.pubsub_unsharded_count + config.pubsub_sharded_count > 0 && !config.pubsub_mode) { + config.pubsub_mode = 1; + cliRefreshPrompt(); + } +} + /* Simultaneously wait for pubsub messages from the server and input on stdin. 
*/ static void cliWaitForMessagesOrStdin(void) { int show_info = config.output != OUTPUT_RAW && (isatty(STDOUT_FILENO) || getenv("FAKETTY")); @@ -2246,7 +2270,13 @@ static void cliWaitForMessagesOrStdin(void) { sds out = cliFormatReply(reply, config.output, 0); fwrite(out, sdslen(out), 1, stdout); fflush(stdout); + + if (isPubsubPush(reply)) { + handlePubSubMode(reply); + } + sdsfree(out); + freeReplyObject(reply); } } while (reply); @@ -2397,13 +2427,11 @@ static int cliSendCommand(int argc, char **argv, long repeat) { fflush(stdout); if (config.pubsub_mode || num_expected_pubsub_push > 0) { if (isPubsubPush(config.last_reply)) { + handlePubSubMode(config.last_reply); + if (num_expected_pubsub_push > 0 && !strcasecmp(config.last_reply->element[0]->str, command)) { /* This pushed message confirms the * [p|s][un]subscribe command. */ - if (is_subscribe && !config.pubsub_mode) { - config.pubsub_mode = 1; - cliRefreshPrompt(); - } if (--num_expected_pubsub_push > 0) { continue; /* We need more of these. */ } @@ -3117,6 +3145,13 @@ void cliSetPreferences(char **argv, int argc, int interactive) { else { printf("%sunknown valkey-cli preference '%s'\n", interactive ? "" : ".valkeyclirc: ", argv[1]); } + } else if (!strcasecmp(argv[0], ":get") && argc >= 2) { + if (!strcasecmp(argv[1], "pubsub")) { + printf("%d\n", config.pubsub_mode); + } else { + printf("%sunknown valkey-cli get option '%s'\n", interactive ? "" : ".valkeyclirc: ", argv[1]); + } + fflush(stdout); } else { printf("%sunknown valkey-cli internal command '%s'\n", interactive ? 
"" : ".valkeyclirc: ", argv[0]); } @@ -9495,6 +9530,8 @@ int main(int argc, char **argv) { config.shutdown = 0; config.monitor_mode = 0; config.pubsub_mode = 0; + config.pubsub_unsharded_count = 0; + config.pubsub_sharded_count = 0; config.blocking_state_aborted = 0; config.latency_mode = 0; config.latency_dist_mode = 0; diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index 0c15af74f9..a56818b8c2 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -608,6 +608,232 @@ if {!$::tls} { ;# fake_redis_node doesn't support TLS assert_equal "a\n1\nb\n2\nc\n3" [exec {*}$cmdline ZRANGE new_zset 0 -1 WITHSCORES] } + test "valkey-cli pubsub mode with single standard channel subscription" { + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SUBSCRIBE ch1" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "UNSUBSCRIBE ch1" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with multiple standard channel subscriptions" { + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SUBSCRIBE ch1 ch2 ch3" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "UNSUBSCRIBE" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with single shard channel subscription" { + set fd [open_cli] + + write_cli $fd ":get 
pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SSUBSCRIBE schannel1" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "SUNSUBSCRIBE schannel1" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with multiple shard channel subscriptions" { + + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SSUBSCRIBE schannel1 schannel2 schannel3" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "SUNSUBSCRIBE" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with single pattern channel subscription" { + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "PSUBSCRIBE pattern1*" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "PUNSUBSCRIBE pattern1*" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with multiple pattern channel subscriptions" { + set fd [open_cli] + + write_cli $fd "PSUBSCRIBE pattern1* pattern2* pattern3*" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + 
assert_equal "1" $pubsub_status + + write_cli $fd "PUNSUBSCRIBE" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode when subscribing to the same channel" { + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SUBSCRIBE ch1 ch1" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "UNSUBSCRIBE" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + + test "valkey-cli pubsub mode with multiple subscription types" { + set fd [open_cli] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + write_cli $fd "SUBSCRIBE ch1 ch2 ch3" + set response [read_cli $fd] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "PSUBSCRIBE pattern*" + set response [read_cli $fd] + set lines [split $response "\n"] + assert_equal "psubscribe" [lindex $lines 0] + assert_equal "pattern*" [lindex $lines 1] + assert_equal "4" [lindex $lines 2] + + write_cli $fd "SSUBSCRIBE schannel" + set response [read_cli $fd] + set lines [split $response "\n"] + assert_equal "ssubscribe" [lindex $lines 0] + assert_equal "schannel" [lindex $lines 1] + assert_equal "1" [lindex $lines 2] + + write_cli $fd "PUNSUBSCRIBE pattern*" + set response [read_cli $fd] + set lines [split $response "\n"] + assert_equal "punsubscribe" [lindex $lines 0] + assert_equal "pattern*" [lindex $lines 1] + assert_equal "3" [lindex $lines 2] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + 
assert_equal "1" $pubsub_status + + write_cli $fd "SUNSUBSCRIBE schannel" + set response [read_cli $fd] + set lines [split $response "\n"] + assert_equal "sunsubscribe" [lindex $lines 0] + assert_equal "schannel" [lindex $lines 1] + assert_equal "0" [lindex $lines 2] + + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "1" $pubsub_status + + write_cli $fd "UNSUBSCRIBE" + set response [read_cli $fd] + + # Verify pubsub mode is no longer active + write_cli $fd ":get pubsub" + set pubsub_status [string trim [read_cli $fd]] + assert_equal "0" $pubsub_status + + close_cli $fd + } + test "Valid Connection Scheme: redis://" { set cmdline [valkeycliuri "redis://" [srv host] [srv port]] assert_equal {PONG} [exec {*}$cmdline PING] From 1dd2cba27e079934b43d81dce512103593b730cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20Sz=C3=A9pe?= Date: Wed, 8 Jan 2025 22:39:45 +0100 Subject: [PATCH 07/31] Improve Typos configuration (#1456) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - remove old ignores - fix a "new" typo 🎁 Signed-off-by: Viktor Szépe Signed-off-by: proost --- .cmake-format.yaml | 2 +- .config/typos.toml | 21 ++++++++------------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/.cmake-format.yaml b/.cmake-format.yaml index 98ab11753a..60b1b46f1e 100644 --- a/.cmake-format.yaml +++ b/.cmake-format.yaml @@ -53,7 +53,7 @@ format: _help_require_valid_layout: - By default, if cmake-format cannot successfully fit - everything into the desired linewidth it will apply the - - last, most agressive attempt that it made. If this flag is + - last, most aggressive attempt that it made. 
If this flag is - True, however, cmake-format will print error, exit with non- - zero status code, and write-out nothing require_valid_layout: false diff --git a/.config/typos.toml b/.config/typos.toml index 1dc44ea0e9..a8ecab921c 100644 --- a/.config/typos.toml +++ b/.config/typos.toml @@ -2,16 +2,16 @@ [files] extend-exclude = [ + ".git/", "deps/", # crc16_slottable is primarily pre-generated random strings. "src/crc16_slottable.h", ] +ignore-hidden = false [default.extend-words] -advices = "advices" exat = "exat" optin = "optin" -ro = "ro" smove = "smove" [type.c] @@ -20,7 +20,7 @@ extend-ignore-re = [ "D4C4DAA4", # sha1.c "Georg Nees", "\\[l\\]ist", # eval.c - "LKE", # test_rax.c + '"LKE"', # test_rax.c ] [type.tcl] @@ -28,26 +28,23 @@ extend-ignore-re = [ "DUMPed", ] -[type.sv.extend-identifiers] -# sv = .h -module_gil_acquring = "module_gil_acquring" - [type.c.extend-identifiers] -ang = "ang" +advices = "advices" clen = "clen" fle = "fle" -module_gil_acquring = "module_gil_acquring" nd = "nd" ot = "ot" [type.tcl.extend-identifiers] -fo = "fo" oll = "oll" stressers = "stressers" -[type.sv.extend-words] +[type.sv.extend-identifiers] # sv = .h fo = "fo" + +[type.sv.extend-words] +# sv = .h seeked = "seeked" [type.c.extend-words] @@ -58,7 +55,6 @@ limite = "limite" pn = "pn" seeked = "seeked" tre = "tre" -ws = "ws" [type.systemd.extend-words] # systemd = .conf @@ -66,5 +62,4 @@ ake = "ake" [type.tcl.extend-words] fo = "fo" -lst = "lst" tre = "tre" From 820d33a38677c808d2e3e2941fc389b7383a2ded Mon Sep 17 00:00:00 2001 From: Nadav Gigi <95503908+NadavGigi@users.noreply.github.com> Date: Thu, 9 Jan 2025 00:18:55 +0200 Subject: [PATCH 08/31] Accelerate hash table iterator with prefetching (#1501) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces improvements to the hashtable iterator, implementing prefetching technique described in the blog post [Unlock One Million RPS - Part 
2](https://valkey.io/blog/unlock-one-million-rps-part2/). The changes lay the groundwork for further enhancements in use cases involving iterators. Future PRs will build upon this foundation to improve performance and functionality in various iterator-dependent operations. In the pursuit of maximizing iterator performance, I conducted a comprehensive series of experiments. My tests encompassed a wide range of approaches, including processing multiple bucket indices in parallel, prefetching the next bucket upon completion of the current one, and several other timing and quantity variations. Surprisingly, after rigorous testing and performance analysis, the simplest implementation presented in this PR consistently outperformed all other more complex strategies. ## Implementation Each time we start iterating over a bucket, we prefetch data for future iterations: - We prefetch the entries of the next bucket (if it exists). - We prefetch the structure (but not the entries) of the bucket after the next. This prefetching is done when we pick up a new bucket, increasing the chance that the data will be in cache by the time we need it. ## Performance The data below was taken by running the keys command on a 64-core Graviton 3 Amazon EC2 instance with 50 mil keys in size of 100 bytes each. The results regarding the duration of the “keys *” command were taken from the “info all” command. ``` +--------------------+------------------+-----------------------------+ | prefetching | Time (seconds) | Keys Processed per Second | +--------------------+------------------+-----------------------------+ | No | 11.112279 | 4,499,529 | | Yes | 3.141916 | 15,913,862 | +--------------------+------------------+-----------------------------+ Improvement: Comparing the iterator without prefetching to the one with prefetching, we can see a speed improvement of 11.112279 / 3.141916 ≈ 3.54 times faster. ``` ### Save command improvement #### Setup: - 64-core Graviton 3 Amazon EC2 instance.
- 50 mil keys in size of 100 bytes each. - Running valkey server over RAM file system. - crc checksum and compression off. #### Results ``` +--------------------+------------------+-----------------------------+ | prefetching | Time (seconds) | Keys Processed per Second | +--------------------+------------------+-----------------------------+ | No | 28 | 1,785,700 | | Yes | 19.6 | 2,550,000 | +--------------------+------------------+-----------------------------+ Improvement: - Reduced SAVE time by 30% (8.4 seconds faster) - Increased key processing rate by 42.8% (764,300 more keys/second) ``` Signed-off-by: NadavGigi Signed-off-by: proost --- src/hashtable.c | 92 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/src/hashtable.c b/src/hashtable.c index 11ba360800..3f1eff19c1 100644 --- a/src/hashtable.c +++ b/src/hashtable.c @@ -498,7 +498,7 @@ size_t nextCursor(size_t v, size_t mask) { } /* Returns the next bucket in a bucket chain, or NULL if there's no next. */ -static bucket *bucketNext(bucket *b) { +static bucket *getChildBucket(bucket *b) { return b->chained ? b->entries[ENTRIES_PER_BUCKET - 1] : NULL; } @@ -548,12 +548,12 @@ static void rehashStep(hashtable *ht) { rehashBucket(ht, b); if (b->chained) { /* Rehash and free child buckets. 
*/ - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); b->chained = 0; b = next; while (b != NULL) { rehashBucket(ht, b); - next = bucketNext(b); + next = getChildBucket(b); zfree(b); if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); ht->child_buckets[0]--; @@ -708,7 +708,7 @@ static bucket *findBucket(hashtable *ht, uint64_t hash, const void *key, int *po } } } - b = bucketNext(b); + b = getChildBucket(b); } while (b != NULL); } return NULL; @@ -753,7 +753,7 @@ static void bucketConvertToUnchained(bucket *b) { * This function needs the penultimate 'before_last' bucket in the chain, to be * able to update it when the last bucket is freed. */ static void pruneLastBucket(hashtable *ht, bucket *before_last, bucket *last, int table_index) { - assert(before_last->chained && bucketNext(before_last) == last); + assert(before_last->chained && getChildBucket(before_last) == last); assert(!last->chained); assert(last->presence == 0 || __builtin_popcount(last->presence) == 1); bucketConvertToUnchained(before_last); @@ -775,10 +775,10 @@ static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int tabl assert(b->chained && !isPositionFilled(b, pos_in_bucket)); /* Find the last bucket */ bucket *before_last = b; - bucket *last = bucketNext(b); + bucket *last = getChildBucket(b); while (last->chained) { before_last = last; - last = bucketNext(last); + last = getChildBucket(last); } /* Unless the last bucket is empty, find an entry in the last bucket and * move it to the hole in b. */ @@ -800,10 +800,10 @@ static void fillBucketHole(hashtable *ht, bucket *b, int pos_in_bucket, int tabl static void compactBucketChain(hashtable *ht, size_t bucket_index, int table_index) { bucket *b = &ht->tables[table_index][bucket_index]; while (b->chained) { - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next->chained && next->presence == 0) { /* Empty bucket in the middle of the chain. Remove it from the chain. 
*/ - bucket *next_next = bucketNext(next); + bucket *next_next = getChildBucket(next); b->entries[ENTRIES_PER_BUCKET - 1] = next_next; zfree(next); if (ht->type->trackMemUsage) ht->type->trackMemUsage(ht, -sizeof(bucket)); @@ -846,7 +846,7 @@ static bucket *findBucketForInsert(hashtable *ht, uint64_t hash, int *pos_in_buc bucketConvertToChained(ht, b); ht->child_buckets[table]++; } - b = bucketNext(b); + b = getChildBucket(b); } /* Find a free slot in the bucket. There must be at least one. */ int pos; @@ -934,6 +934,51 @@ static inline incrementalFind *incrementalFindFromOpaque(hashtableIncrementalFin return (incrementalFind *)(void *)state; } +/* Prefetches all filled entries in the given bucket to optimize future memory access. */ +static void prefetchBucketEntries(bucket *b) { + for (int pos = 0; pos < numBucketPositions(b); pos++) { + if (isPositionFilled(b, pos)) { + valkey_prefetch(b->entries[pos]); + } + } +} + +/* Returns the child bucket if chained, otherwise the next bucket in the table. returns NULL if neither exists. */ +static bucket *getNextBucket(bucket *current_bucket, size_t bucket_index, hashtable *ht, int table_index) { + bucket *next_bucket = NULL; + if (current_bucket->chained) { + next_bucket = getChildBucket(current_bucket); + } else { + size_t table_size = numBuckets(ht->bucket_exp[table_index]); + size_t next_index = bucket_index + 1; + if (next_index < table_size) { + next_bucket = &ht->tables[table_index][next_index]; + } + } + return next_bucket; +} + +/* This function prefetches data that will be needed in subsequent iterations: + * - The entries of the next bucket + * - The next of the next bucket + * It attempts to bring this data closer to the L1 cache to reduce future memory access latency. + * + * Cache state before this function is called(due to last call for this function): + * 1. The current bucket and its entries are likely already in cache. + * 2. The next bucket is in cache. 
+ */ +static void prefetchNextBucketEntries(iter *iter, bucket *current_bucket) { + size_t next_index = iter->index + 1; + bucket *next_bucket = getNextBucket(current_bucket, next_index, iter->hashtable, iter->table); + if (next_bucket) { + prefetchBucketEntries(next_bucket); + bucket *next_next_bucket = getNextBucket(next_bucket, next_index + 1, iter->hashtable, iter->table); + if (next_next_bucket) { + valkey_prefetch(next_next_bucket); + } + } +} + /* --- API functions --- */ /* Allocates and initializes a new hashtable specified by the given type. */ @@ -979,7 +1024,7 @@ void hashtableEmpty(hashtable *ht, void(callback)(hashtable *)) { } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); /* Free allocated bucket. */ if (b != &ht->tables[table_index][idx]) { @@ -1375,7 +1420,7 @@ int hashtableReplaceReallocatedEntry(hashtable *ht, const void *old_entry, void return 1; } } - b = bucketNext(b); + b = getChildBucket(b); } while (b != NULL); } return 0; @@ -1538,8 +1583,8 @@ int hashtableIncrementalFindStep(hashtableIncrementalFindState *state) { bucket_idx = data->hash & mask; } data->bucket = &ht->tables[data->table][bucket_idx]; - } else if (bucketNext(data->bucket) != NULL) { - data->bucket = bucketNext(data->bucket); + } else if (getChildBucket(data->bucket) != NULL) { + data->bucket = getChildBucket(data->bucket); } else if (data->table == 0 && ht->rehash_idx >= 0) { data->table = 1; size_t mask = expToMask(ht->bucket_exp[1]); @@ -1656,7 +1701,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ -1693,7 +1738,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ 
-1723,7 +1768,7 @@ size_t hashtableScanDefrag(hashtable *ht, size_t cursor, hashtableScanFunction f } } } - bucket *next = bucketNext(b); + bucket *next = getChildBucket(b); if (next != NULL && defragfn != NULL) { next = bucketDefrag(b, next, defragfn); } @@ -1859,7 +1904,7 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { iter->pos_in_bucket++; if (iter->bucket->chained && iter->pos_in_bucket >= ENTRIES_PER_BUCKET - 1) { iter->pos_in_bucket = 0; - iter->bucket = bucketNext(iter->bucket); + iter->bucket = getChildBucket(iter->bucket); } else if (iter->pos_in_bucket >= ENTRIES_PER_BUCKET) { /* Bucket index done. */ if (iter->safe) { @@ -1890,6 +1935,9 @@ int hashtableNext(hashtableIterator *iterator, void **elemptr) { } } bucket *b = iter->bucket; + if (iter->pos_in_bucket == 0) { + prefetchNextBucketEntries(iter, b); + } if (!isPositionFilled(b, iter->pos_in_bucket)) { /* No entry here. */ continue; @@ -1988,7 +2036,7 @@ hashtableStats *hashtableGetStatsHt(hashtable *ht, int table_index, int full) { unsigned long chainlen = 0; while (b->chained) { chainlen++; - b = bucketNext(b); + b = getChildBucket(b); } if (chainlen > stats->max_chain_len) { stats->max_chain_len = chainlen; @@ -2083,7 +2131,7 @@ void hashtableDump(hashtable *ht) { printf("(empty)\n"); } } - b = bucketNext(b); + b = getChildBucket(b); level++; } while (b != NULL); } @@ -2117,7 +2165,7 @@ void hashtableHistogram(hashtable *ht) { continue; } printf("%X", __builtin_popcount(b->presence)); - buckets[idx] = bucketNext(b); + buckets[idx] = getChildBucket(b); if (buckets[idx] == NULL) chains_left--; } printf("\n"); @@ -2138,7 +2186,7 @@ int hashtableLongestBucketChain(hashtable *ht) { if (++chainlen > maxlen) { maxlen = chainlen; } - b = bucketNext(b); + b = getChildBucket(b); } } } From 9323ecf610ea07d2414ee870855ab880a2f1bae0 Mon Sep 17 00:00:00 2001 From: Karthick Ariyaratnam Date: Wed, 8 Jan 2025 22:52:45 -0500 Subject: [PATCH 09/31] Remove legacy SERVER_TEST compiler flag from 
cmake. (#1530) This PR is to cleanup the `SERVER_TEST` compiler flag from cmake compile definitions, as it is no longer required in the new unit test framework, see #428. Signed-off-by: Karthick Ariyaratnam Signed-off-by: proost --- src/unit/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt index 7d80c533cf..49960e1f4b 100644 --- a/src/unit/CMakeLists.txt +++ b/src/unit/CMakeLists.txt @@ -7,7 +7,6 @@ get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS) # Build unit tests only message(STATUS "Building unit tests") -list(APPEND COMPILE_DEFINITIONS "SERVER_TEST=1") if (USE_TLS) if (BUILD_TLS_MODULE) # TLS as a module From ad9601d1910246a70f1e9f04a7695a480ffa8a3f Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 9 Jan 2025 12:21:31 +0800 Subject: [PATCH 10/31] Fix new cli subscribed mode test in cluster mode (#1533) We need to add a hash tag in cluster mode. Fixes #1531. Signed-off-by: Binbin Signed-off-by: proost --- tests/integration/valkey-cli.tcl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index a56818b8c2..095f1e77bc 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -681,14 +681,13 @@ if {!$::tls} { ;# fake_redis_node doesn't support TLS } test "valkey-cli pubsub mode with multiple shard channel subscriptions" { - set fd [open_cli] write_cli $fd ":get pubsub" set pubsub_status [string trim [read_cli $fd]] assert_equal "0" $pubsub_status - write_cli $fd "SSUBSCRIBE schannel1 schannel2 schannel3" + write_cli $fd "SSUBSCRIBE {schannel}1 {schannel}2 {schannel}3" set response [read_cli $fd] write_cli $fd ":get pubsub" From 00d97a3108813142168bf34dad98c20b91ef59f2 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 8 Jan 2025 22:35:48 -0800 Subject: [PATCH 11/31] Free the passed in lua context instead of the global (#1536) The fix that Redis gave us for the CVE-2024-46981 
was freeing lctx.lua, and I didn't merge it correctly. We made some changes so that we are able to async free the lua context, so we need to free the passed in context. This was applied correctly on the two released versions (8.0 and 7.2) just not on unstable. Signed-off-by: Madelyn Olson Signed-off-by: proost --- src/eval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/eval.c b/src/eval.c index 9aa185d77b..62780447a9 100644 --- a/src/eval.c +++ b/src/eval.c @@ -285,7 +285,7 @@ void scriptingInit(int setup) { void freeLuaScriptsSync(dict *lua_scripts, list *lua_scripts_lru_list, lua_State *lua) { dictRelease(lua_scripts); listRelease(lua_scripts_lru_list); - lua_gc(lctx.lua, LUA_GCCOLLECT, 0); + lua_gc(lua, LUA_GCCOLLECT, 0); lua_close(lua); #if !defined(USE_LIBC) From 6a58098fd3eaffcd52057f5bd3d71541eb337637 Mon Sep 17 00:00:00 2001 From: Harkrishn Patro Date: Thu, 9 Jan 2025 17:19:36 -0800 Subject: [PATCH 12/31] Update upload artifacts to v4 (#1539) Fixes #1538 Signed-off-by: Harkrishn Patro Signed-off-by: proost --- .github/workflows/external.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/external.yml b/.github/workflows/external.yml index cfcab995d5..eeacf80609 100644 --- a/.github/workflows/external.yml +++ b/.github/workflows/external.yml @@ -34,7 +34,7 @@ jobs: --tags -slow - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-standalone-log path: external-server.log @@ -62,7 +62,7 @@ jobs: --tags -slow - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-cluster-log path: external-server.log @@ -86,7 +86,7 @@ jobs: --tags 
"-slow -needs:debug" - name: Archive server log if: ${{ failure() }} - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3 + uses: actions/upload-artifact@6f51ac03b9356f520e9adb1b1b7802705f340c2b # v4.5.0 with: name: test-external-nodebug-log path: external-server.log From 426bfed86348c6e4e499d4ed28b110b08a8f2830 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 10 Jan 2025 10:19:04 +0800 Subject: [PATCH 13/31] Fix crash when freeing newly created node when nodeIp2String fail (#1535) In #1441, we found a assert, and decided remove this assert and instead just free the newly created node and close the link, since if we cannot get the IP from the link it probably means the connection was closed. ``` === VALKEY BUG REPORT START: Cut & paste starting from here === 17847:M 19 Dec 2024 00:15:58.021 # === ASSERTION FAILED === 17847:M 19 Dec 2024 00:15:58.021 # ==> cluster_legacy.c:3252 'nodeIp2String(node->ip, link, hdr->myip) == C_OK' is not true ------ STACK TRACE ------ 17847 valkey-server * src/valkey-server 127.0.0.1:27131 [cluster](clusterProcessPacket+0x1304) [0x4e5634] src/valkey-server 127.0.0.1:27131 [cluster](clusterReadHandler+0x11e) [0x4e59de] /__w/valkey/valkey/src/valkey-tls.so(+0x2f1e) [0x7f083983ff1e] src/valkey-server 127.0.0.1:27131 [cluster](aeMain+0x8a) [0x41afea] src/valkey-server 127.0.0.1:27131 [cluster](main+0x4d7) [0x40f547] /lib64/libc.so.6(+0x40c8) [0x7f083985a0c8] /lib64/libc.so.6(__libc_start_main+0x8b) [0x7f083985a18b] src/valkey-server 127.0.0.1:27131 [cluster](_start+0x25) [0x410ef5] ``` But it also introduces another assert. The reason is that this new node is not added to the cluster nodes dict. 
``` 17128:M 08 Jan 2025 10:51:44.061 # === ASSERTION FAILED === 17128:M 08 Jan 2025 10:51:44.061 # ==> cluster_legacy.c:1693 'dictDelete(server.cluster->nodes, nodename) == DICT_OK' is not true ------ STACK TRACE ------ 17128 valkey-server * src/valkey-server 127.0.0.1:28627 [cluster][0x4ebdc4] src/valkey-server 127.0.0.1:28627 [cluster][0x4e81d2] src/valkey-server 127.0.0.1:28627 [cluster](clusterReadHandler+0x268)[0x4e8618] /__w/valkey/valkey/src/valkey-tls.so(+0xb278)[0x7f109480b278] src/valkey-server 127.0.0.1:28627 [cluster](aeMain+0x89)[0x592b09] src/valkey-server 127.0.0.1:28627 [cluster](main+0x4b3)[0x453e23] /lib64/libc.so.6(__libc_start_main+0xe5)[0x7f10958bf7e5] src/valkey-server 127.0.0.1:28627 [cluster](_start+0x2e)[0x454a5e] ``` This closes #1527. Signed-off-by: Binbin Signed-off-by: proost --- src/cluster_legacy.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0777d6d8c6..bf5d314908 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3245,6 +3245,17 @@ int clusterProcessPacket(clusterLink *link) { if (type == CLUSTERMSG_TYPE_MEET) { if (!sender) { if (!link->node) { + char ip[NET_IP_STR_LEN] = {0}; + if (nodeIp2String(ip, link, hdr->myip) != C_OK) { + /* Unable to retrieve the node's IP address from the connection. Without a + * valid IP, the node becomes unusable in the cluster. This failure might be + * due to the connection being closed. */ + serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " + "because the connection has an error"); + freeClusterLink(link); + return 0; + } + /* Add this node if it is new for us and the msg type is MEET. 
* In this stage we don't try to add the node with the right * flags, replicaof pointer, and so forth, as this details will be @@ -3253,14 +3264,7 @@ int clusterProcessPacket(clusterLink *link) { * we want to send extensions right away in the return PONG in order * to reduce the amount of time needed to stabilize the shard ID. */ clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - if (nodeIp2String(node->ip, link, hdr->myip) != C_OK) { - /* We cannot get the IP info from the link, it probably means the connection is closed. */ - serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " - "because the connection has an error"); - freeClusterLink(link); - freeClusterNode(node); - return 0; - } + memcpy(node->ip, ip, sizeof(ip)); getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); node->cport = ntohs(hdr->cport); if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { From 0ae56e7af65baf4c450e594b695f091703d02702 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 11 Jan 2025 10:32:58 +0800 Subject: [PATCH 14/31] Fix module LatencyAddSample still work when latency-monitor-threshold is 0 (#1541) When latency-monitor-threshold is set to 0, it means the latency monitor is disabled, and in VM_LatencyAddSample, we wrote the condition incorrectly, causing us to record latency when latency was turned off. This bug was introduced in the very first day, see e3b1d6d, it was merged in 2019. Signed-off-by: Binbin Signed-off-by: proost --- src/module.c | 2 +- tests/modules/basics.c | 21 +++++++++++++++++++++ tests/unit/moduleapi/basics.tcl | 17 +++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/module.c b/src/module.c index 58555839f2..76c362b758 100644 --- a/src/module.c +++ b/src/module.c @@ -7680,7 +7680,7 @@ void VM__Assert(const char *estr, const char *file, int line) { * command. The call is skipped if the latency is smaller than the configured * latency-monitor-threshold. 
*/ void VM_LatencyAddSample(const char *event, mstime_t latency) { - if (latency >= server.latency_monitor_threshold) latencyAddSample(event, latency); + latencyAddSampleIfNeeded(event, latency); } /* -------------------------------------------------------------------------- diff --git a/tests/modules/basics.c b/tests/modules/basics.c index 36f88becbe..c5b5565686 100644 --- a/tests/modules/basics.c +++ b/tests/modules/basics.c @@ -669,6 +669,24 @@ int TestNotifications(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) return ValkeyModule_ReplyWithSimpleString(ctx, "ERR"); } +/* test.latency latency_ms */ +int TestLatency(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + if (argc != 2) { + ValkeyModule_WrongArity(ctx); + return VALKEYMODULE_OK; + } + + long long latency_ms; + if (ValkeyModule_StringToLongLong(argv[1], &latency_ms) != VALKEYMODULE_OK) { + ValkeyModule_ReplyWithError(ctx, "Invalid integer value"); + return VALKEYMODULE_OK; + } + + ValkeyModule_LatencyAddSample("test", latency_ms); + ValkeyModule_ReplyWithSimpleString(ctx, "OK"); + return VALKEYMODULE_OK; +} + /* TEST.CTXFLAGS -- Test GetContextFlags. 
*/ int TestCtxFlags(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { VALKEYMODULE_NOT_USED(argc); @@ -1048,5 +1066,8 @@ int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int arg TestNotifications,"write deny-oom",1,1,1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; + if (ValkeyModule_CreateCommand(ctx, "test.latency", TestLatency, "readonly", 0, 0, 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + return VALKEYMODULE_OK; } diff --git a/tests/unit/moduleapi/basics.tcl b/tests/unit/moduleapi/basics.tcl index 733e8e1962..0696e30050 100644 --- a/tests/unit/moduleapi/basics.tcl +++ b/tests/unit/moduleapi/basics.tcl @@ -39,6 +39,23 @@ start_server {tags {"modules"}} { verify_log_message 0 "*Module name is busy*" 0 } + test "test latency" { + r config set latency-monitor-threshold 0 + r latency reset + r test.latency 0 + r test.latency 1 + assert_equal {} [r latency latest] + assert_equal {} [r latency history test] + + r config set latency-monitor-threshold 1 + r test.latency 0 + assert_equal 0 [llength [r latency history test]] + r test.latency 1 + assert_match {*test * 1 1*} [r latency latest] + r test.latency 2 + assert_match {*test * 2 2*} [r latency latest] + } + test "Unload the module - basics" { assert_equal {OK} [r module unload test] } From c84d1ed0cdcd11079d85d7001434e43634df7855 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 11 Jan 2025 10:43:18 +0800 Subject: [PATCH 15/31] Do election in order based on failed primary rank to avoid voting conflicts (#1018) When multiple primary nodes fail simultaneously, the cluster can not recover within the default effective time (data_age limit). The main reason is that the vote is without ranking among multiple replica nodes, which case too many epoch conflicts. Therefore, we introduced into ranking based on the failed primary shard-id. Introduced a new failed_primary_rank var, this var means the rank of this myself instance in the context of all failed primary list. 
This var will be used in failover and we will do the failover election packets in order based on the rank, this can effectively avoid the voting conflicts. If a single primary is down, the behavior is the same as before. If multiple primaries are down, their replica election initiation time will be delayed by 500ms according to the ranking. Signed-off-by: Binbin Signed-off-by: proost --- src/cluster_legacy.c | 61 ++++++++++++++++++++++++++++++-- src/cluster_legacy.h | 15 ++++---- tests/unit/cluster/failover2.tcl | 35 ++++++++++++++++-- 3 files changed, 100 insertions(+), 11 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bf5d314908..01b92f71c1 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1103,6 +1103,7 @@ void clusterInit(void) { server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; server.cluster->failover_auth_rank = 0; + server.cluster->failover_failed_primary_rank = 0; server.cluster->failover_auth_epoch = 0; server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE; server.cluster->lastVoteEpoch = 0; @@ -4514,6 +4515,45 @@ int clusterGetReplicaRank(void) { return rank; } +/* This function returns the "rank" of this instance's primary, in the context + * of all failed primary list. The primary node will be ignored if failed time + * exceeds cluster-node-timeout * cluster-replica-validity-factor. + * + * If multiple primary nodes go down at the same time, there is a certain + * probability that their replicas will initiate the elections at the same time, + * and lead to insufficient votes. + * + * The failed primary rank is used to add a delay to start an election in order + * to avoid simultaneous elections of replicas. 
*/ +int clusterGetFailedPrimaryRank(void) { + serverAssert(nodeIsReplica(myself)); + serverAssert(myself->replicaof); + + int rank = 0; + mstime_t now = mstime(); + dictIterator *di; + dictEntry *de; + + di = dictGetSafeIterator(server.cluster->nodes); + while ((de = dictNext(di)) != NULL) { + clusterNode *node = dictGetVal(de); + + /* Skip nodes that do not need to participate in the rank. */ + if (!nodeFailed(node) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue; + + /* If cluster-replica-validity-factor is enabled, skip the invalid nodes. */ + if (server.cluster_replica_validity_factor) { + if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor)) + continue; + } + + if (memcmp(node->shard_id, myself->shard_id, CLUSTER_NAMELEN) < 0) rank++; + } + dictReleaseIterator(di); + + return rank; +} + /* This function is called by clusterHandleReplicaFailover() in order to * let the replica log why it is not able to failover. Sometimes there are * not the conditions, but since the failover function is called again and @@ -4695,6 +4735,11 @@ void clusterHandleReplicaFailover(void) { * Specifically 1 second * rank. This way replicas that have a probably * less updated replication offset, are penalized. */ server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000; + /* We add another delay that is proportional to the failed primary rank. + * Specifically 0.5 second * rank. This way those failed primaries will be + * elected in rank to avoid the vote conflicts. */ + server.cluster->failover_failed_primary_rank = clusterGetFailedPrimaryRank(); + server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * 500; /* However if this is a manual failover, no delay is needed. 
*/ if (server.cluster->mf_end) { server.cluster->failover_auth_time = now; @@ -4705,9 +4750,9 @@ void clusterHandleReplicaFailover(void) { } serverLog(LL_NOTICE, "Start of election delayed for %lld milliseconds " - "(rank #%d, offset %lld).", + "(rank #%d, primary rank #%d, offset %lld).", server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank, - replicationGetReplicaOffset()); + server.cluster->failover_failed_primary_rank, replicationGetReplicaOffset()); /* Now that we have a scheduled election, broadcast our offset * to all the other replicas so that they'll updated their offsets * if our offset is better. */ @@ -4723,6 +4768,9 @@ void clusterHandleReplicaFailover(void) { * replicas for the same primary since we computed our election delay. * Update the delay if our rank changed. * + * It is also possible that we received the message that telling a + * shard is up. Update the delay if our failed_primary_rank changed. + * * Not performed if this is a manual failover. */ if (server.cluster->failover_auth_sent == 0 && server.cluster->mf_end == 0) { int newrank = clusterGetReplicaRank(); @@ -4733,6 +4781,15 @@ void clusterHandleReplicaFailover(void) { serverLog(LL_NOTICE, "Replica rank updated to #%d, added %lld milliseconds of delay.", newrank, added_delay); } + + int new_failed_primary_rank = clusterGetFailedPrimaryRank(); + if (new_failed_primary_rank != server.cluster->failover_failed_primary_rank) { + long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * 500; + server.cluster->failover_auth_time += added_delay; + server.cluster->failover_failed_primary_rank = new_failed_primary_rank; + serverLog(LL_NOTICE, "Failed primary rank updated to #%d, added %lld milliseconds of delay.", + new_failed_primary_rank, added_delay); + } } /* Return ASAP if we can't still start the election. 
*/ diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index ac14bd583c..226842c5dc 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -382,13 +382,14 @@ struct clusterState { clusterNode *importing_slots_from[CLUSTER_SLOTS]; clusterNode *slots[CLUSTER_SLOTS]; /* The following fields are used to take the replica state on elections. */ - mstime_t failover_auth_time; /* Time of previous or next election. */ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - int failover_auth_rank; /* This replica rank for current auth request. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - int cant_failover_reason; /* Why a replica is currently not able to - failover. See the CANT_FAILOVER_* macros. */ + mstime_t failover_auth_time; /* Time of previous or next election. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + int failover_auth_rank; /* This replica rank for current auth request. */ + int failover_failed_primary_rank; /* The rank of this instance in the context of all failed primary list. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + int cant_failover_reason; /* Why a replica is currently not able to + * failover. See the CANT_FAILOVER_* macros. */ /* Manual failover state in common. */ mstime_t mf_end; /* Manual failover time limit (ms unixtime). It is zero if there is no MF in progress. 
*/ diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 9262049e4e..2272a150ee 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -62,10 +62,8 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval verify_no_log_message -3 "*Failover attempt expired*" 0 verify_no_log_message -6 "*Failover attempt expired*" 0 } - } ;# start_cluster - start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} { test "Primaries will not time out then they are elected in the same epoch" { # Since we have the delay time, so these node may not initiate the @@ -102,3 +100,36 @@ start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval resume_process [srv -2 pid] } } ;# start_cluster + +run_solo {cluster} { + start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + test "Multiple primary nodes are down, rank them based on the failed primary" { + # Killing these primary nodes. + for {set j 0} {$j < 15} {incr j} { + pause_process [srv -$j pid] + } + + # Make sure that a node starts failover. + wait_for_condition 1000 100 { + [s -40 role] == "master" + } else { + fail "No failover detected" + } + + # Wait for the cluster state to become ok. + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue + wait_for_condition 1000 100 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } + + # Resuming these primary nodes, speed up the shutdown. 
+ for {set j 0} {$j < 15} {incr j} { + resume_process [srv -$j pid] + } + } } ;# start_cluster +} ;# run_solo From 88abb0d7208a0d0c8e17805901d20f188ddcade6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 11 Jan 2025 11:02:05 +0800 Subject: [PATCH 16/31] Mark the node as FAIL when the node is marked as NOADDR and broadcast the FAIL (#1191) Imagine we have a cluster, for example a three-shard cluster, if shard 1 does a CLUSTER RESET HARD, it will change the node name, and then other nodes will mark it as NOADDR since the node name received by PONG has changed. In the eyes of other nodes, there is one working primary node left but with no address, and in this case, the address report in MOVED will be invalid and will confuse the clients. And at the same time, the replica will not failover since its primary is not in the FAIL state. And the cluster looks OK to everyone. This leaves a cluster that appears OK, but with no coverage for shard 1, obviously we should do something like CLUSTER FORGET to remove the node and fix the cluster before using it. But the point here is, we can mark the NOADDR node as FAIL to advance the cluster state. If a node is NOADDR, it means it does not have a valid address, so we won't reconnect it, we won't send PING, we won't gossip it, it seems reasonable to mark it as FAIL. 
Signed-off-by: Binbin Signed-off-by: proost --- src/cluster_legacy.c | 21 +++++++++++++--- tests/unit/cluster/noaddr.tcl | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 tests/unit/cluster/noaddr.tcl diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 01b92f71c1..9251667584 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -666,7 +666,8 @@ int clusterLoadConfig(char *filename) { } else if (!strcasecmp(s, "handshake")) { n->flags |= CLUSTER_NODE_HANDSHAKE; } else if (!strcasecmp(s, "noaddr")) { - n->flags |= CLUSTER_NODE_NOADDR; + n->flags |= (CLUSTER_NODE_NOADDR | CLUSTER_NODE_FAIL); + n->fail_time = mstime(); } else if (!strcasecmp(s, "nofailover")) { n->flags |= CLUSTER_NODE_NOFAILOVER; } else if (!strcasecmp(s, "noflags")) { @@ -3349,7 +3350,9 @@ int clusterProcessPacket(clusterLink *link) { } else if (memcmp(link->node->name, hdr->sender, CLUSTER_NAMELEN) != 0) { /* If the reply has a non matching node ID we * disconnect this node and set it as not having an associated - * address. */ + * address. This can happen if the node did CLUSTER RESET and changed + * its node ID. In this case, the old node ID will not come back. */ + clusterNode *noaddr_node = link->node; serverLog(LL_NOTICE, "PONG contains mismatching sender ID. About node %.40s (%s) in shard %.40s added %d ms ago, " "having flags %d", @@ -3361,7 +3364,19 @@ int clusterProcessPacket(clusterLink *link) { link->node->tls_port = 0; link->node->cport = 0; freeClusterLink(link); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); + /* We will also mark the node as fail because we have disconnected from it, + * and will not reconnect, and obviously we will not gossip NOADDR nodes. + * Marking it as FAIL can help us advance the state, such as the cluster + * state becomes FAIL or the replica can do the failover. 
Otherwise, the + * NOADDR node will provide an invalid address in redirection and confuse + * the clients, and the replica will never initiate a failover since the + * node is not actually in FAIL state. */ + if (!nodeFailed(noaddr_node)) { + noaddr_node->flags |= CLUSTER_NODE_FAIL; + noaddr_node->fail_time = now; + clusterSendFail(noaddr_node->name); + } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE); return 0; } } diff --git a/tests/unit/cluster/noaddr.tcl b/tests/unit/cluster/noaddr.tcl new file mode 100644 index 0000000000..fb4e501809 --- /dev/null +++ b/tests/unit/cluster/noaddr.tcl @@ -0,0 +1,47 @@ +start_cluster 3 3 {tags {external:skip cluster} overrides {cluster-replica-no-failover yes}} { + test "NOADDR nodes will be marked as FAIL" { + set primary0_id [R 0 CLUSTER MYID] + + # R 0 is a primary, after doing a CLUSTER RESET, the node name will be modified, + # and other nodes will set it to NOADDR and FAIL. + R 0 cluster reset hard + wait_for_log_messages -1 {"*PONG contains mismatching sender ID*"} 0 1000 10 + wait_for_log_messages -2 {"*PONG contains mismatching sender ID*"} 0 1000 10 + wait_for_condition 1000 10 { + [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] noaddr] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 1 $primary0_id] fail] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] noaddr] eq 1 && + [cluster_has_flag [cluster_get_node_by_id 2 $primary0_id] fail] eq 1 + } else { + fail "The node is not marked with the correct flag" + } + + # Also we will set the node to FAIL, so the cluster will eventually be down. + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} && + [CI 3 cluster_state] eq {fail} && + [CI 4 cluster_state] eq {fail} && + [CI 5 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + + # key_977613 belong to slot 0 and belong to R 0. + # Make sure we get a CLUSTER DOWN instead of an invalid MOVED. 
+ assert_error {CLUSTERDOWN*} {R 1 set key_977613 bar} + + # Let the replica 3 do the failover. + R 3 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} && + [CI 4 cluster_state] eq {ok} && + [CI 5 cluster_state] eq {ok} + } else { + fail "Cluster doesn't stabilize" + } + } +} ;# start_cluster From e8e6f684d5d35727eedd0df5536a0a232ff81a19 Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 11 Jan 2025 11:03:10 +0800 Subject: [PATCH 17/31] Add latency stats around cluster config file operations (#1534) When the cluster changes, we need to persist the cluster configuration, and these file IO operations may cause latency. Signed-off-by: Binbin Signed-off-by: proost --- src/cluster_legacy.c | 30 ++++++++++++++++++++++++++++-- tests/unit/latency-monitor.tcl | 12 ++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 9251667584..5c4bb65aae 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -817,6 +817,7 @@ int clusterSaveConfig(int do_fsync) { ssize_t written_bytes; int fd = -1; int retval = C_ERR; + mstime_t latency; server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG; @@ -830,11 +831,15 @@ int clusterSaveConfig(int do_fsync) { /* Create a temp file with the new content. 
*/ tmpfilename = sdscatfmt(sdsempty(), "%s.tmp-%i-%I", server.cluster_configfile, (int)getpid(), mstime()); + latencyStartMonitor(latency); if ((fd = open(tmpfilename, O_WRONLY | O_CREAT, 0644)) == -1) { serverLog(LL_WARNING, "Could not open temp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-open", latency); + latencyStartMonitor(latency); while (offset < content_size) { written_bytes = write(fd, ci + offset, content_size - offset); if (written_bytes <= 0) { @@ -845,31 +850,52 @@ int clusterSaveConfig(int do_fsync) { } offset += written_bytes; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-write", latency); if (do_fsync) { + latencyStartMonitor(latency); server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG; if (valkey_fsync(fd) == -1) { serverLog(LL_WARNING, "Could not sync tmp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-fsync", latency); } + latencyStartMonitor(latency); if (rename(tmpfilename, server.cluster_configfile) == -1) { serverLog(LL_WARNING, "Could not rename tmp cluster config file: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-rename", latency); if (do_fsync) { + latencyStartMonitor(latency); if (fsyncFileDir(server.cluster_configfile) == -1) { serverLog(LL_WARNING, "Could not sync cluster config file dir: %s", strerror(errno)); goto cleanup; } + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-dir-fsync", latency); } retval = C_OK; /* If we reached this point, everything is fine. 
*/ cleanup: - if (fd != -1) close(fd); - if (retval == C_ERR) unlink(tmpfilename); + if (fd != -1) { + latencyStartMonitor(latency); + close(fd); + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-close", latency); + } + if (retval == C_ERR) { + latencyStartMonitor(latency); + unlink(tmpfilename); + latencyEndMonitor(latency); + latencyAddSampleIfNeeded("cluster-config-unlink", latency); + } sdsfree(tmpfilename); sdsfree(ci); return retval; diff --git a/tests/unit/latency-monitor.tcl b/tests/unit/latency-monitor.tcl index 9048192a44..e4f45389d7 100644 --- a/tests/unit/latency-monitor.tcl +++ b/tests/unit/latency-monitor.tcl @@ -189,3 +189,15 @@ tags {"needs:debug"} { assert_match "*wrong number of arguments for 'latency|help' command" $e } } + +start_cluster 1 1 {tags {"latency-monitor cluster external:skip needs:latency"} overrides {latency-monitor-threshold 1}} { + test "Cluster config file latency" { + # This test just a sanity test so that we can make sure the code path is cover. + # We don't assert anything since we can't be sure whether it will be counted. + R 0 cluster saveconfig + R 1 cluster saveconfig + R 1 cluster failover force + R 0 latency latest + R 1 latency latest + } +} From af563bb45a7c2fd21a2e03510e90f00b5c066d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sun, 12 Jan 2025 01:02:39 +0100 Subject: [PATCH 18/31] Skip CLI tests with reply schema validation (#1545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commands used in valkey-cli tests are not important the reply schema validation. Skip them to avoid the problem if tests hanging. This has failed lately in the daily job: ``` [TIMEOUT]: clients state report follows. 
sock55fedcc19be0 => (IN PROGRESS) valkey-cli pubsub mode with single standard channel subscription Killing still running Valkey server 33357 ``` These test cases use a special valkey-cli command, `:get pubsub`, which is an internal command to valkey-cli rather than a Valkey server command. This command hangs when compiled with logreqres enabled. The easy solution is to skip the tests in this setup. The test cases were introduced in #1432. Signed-off-by: Viktor Söderqvist Signed-off-by: proost --- tests/integration/valkey-cli.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index 095f1e77bc..d4adf9c071 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -6,7 +6,7 @@ if {$::singledb} { set ::dbnum 9 } -start_server {tags {"cli"}} { +start_server {tags {"cli logreqres:skip"}} { proc open_cli {{opts ""} {infile ""}} { if { $opts == "" } { set opts "-n $::dbnum" From 0c04b84b6628697a590414a175f937bdbd0e0925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Mon, 13 Jan 2025 03:14:09 +0100 Subject: [PATCH 19/31] Test coverage for ECHO for reply schema validation (#1549) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After #1545 disabled some tests for reply schema validation, we now have another issue that ECHO is not covered. ``` WARNING! The following commands were not hit at all: echo ERROR! at least one command was not hit by the tests ``` This patch adds a test case for ECHO in the unit/other test suite. I haven't checked if there are more commands that aren't covered. 
Signed-off-by: Viktor Söderqvist Signed-off-by: proost --- .github/workflows/daily.yml | 2 +- tests/unit/other.tcl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index e1d577b51b..c0e7e4b446 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -1220,7 +1220,7 @@ jobs: if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || - (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'reply-schema') steps: - name: prep diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index bb08c67471..225287aa3a 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -30,6 +30,10 @@ start_server {tags {"other"}} { } } + test {Coverage: ECHO} { + assert_equal bang [r ECHO bang] + } + test {SAVE - make sure there are all the types as values} { # Wait for a background saving in progress to terminate waitForBgsave r From 9b42d9cf9772be64011d48cfa65fd1d06495215e Mon Sep 17 00:00:00 2001 From: Rain Valentine Date: Mon, 13 Jan 2025 02:17:16 -0800 Subject: [PATCH 20/31] Replace dict with new hashtable: hash datatype (#1502) This PR replaces dict with the new hashtable data structure in the HASH datatype. There is a new struct for hashtable items which contains a pointer to value sds string and the embedded key sds string. These values were previously stored in dictEntry. This structure is kept opaque so we can easily add small value embedding or other optimizations in the future. 
closes #1095 --------- Signed-off-by: Rain Valentine Signed-off-by: proost --- src/aof.c | 6 +- src/db.c | 68 +++------- src/debug.c | 12 +- src/defrag.c | 63 +++++---- src/lazyfree.c | 6 +- src/module.c | 35 +---- src/object.c | 54 ++++---- src/rdb.c | 62 ++++----- src/server.c | 18 +++ src/server.h | 20 ++- src/t_hash.c | 339 +++++++++++++++++++++++++++++-------------------- 11 files changed, 350 insertions(+), 333 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5dc12db61e..024cdb2771 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1922,7 +1922,7 @@ int rewriteSortedSetObject(rio *r, robj *key, robj *o) { /* Write either the key or the value of the currently selected item of a hash. * The 'hi' argument passes a valid hash iterator. * The 'what' filed specifies if to write a key or a value and can be - * either OBJ_HASH_KEY or OBJ_HASH_VALUE. + * either OBJ_HASH_FIELD or OBJ_HASH_VALUE. * * The function returns 0 on error, non-zero on success. */ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { @@ -1936,7 +1936,7 @@ static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) { return rioWriteBulkString(r, (char *)vstr, vlen); else return rioWriteBulkLongLong(r, vll); - } else if (hi->encoding == OBJ_ENCODING_HT) { + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { sds value = hashTypeCurrentFromHashTable(hi, what); return rioWriteBulkString(r, value, sdslen(value)); } @@ -1963,7 +1963,7 @@ int rewriteHashObject(rio *r, robj *key, robj *o) { } } - if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_KEY) || !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) { + if (!rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_FIELD) || !rioWriteHashIteratorCursor(r, &hi, OBJ_HASH_VALUE)) { hashTypeResetIterator(&hi); return 0; } diff --git a/src/db.c b/src/db.c index 40315cddaf..d3a20a8e29 100644 --- a/src/db.c +++ b/src/db.c @@ -986,39 +986,6 @@ void keysScanCallback(void *privdata, void *entry) { /* This callback is used by 
scanGenericCommand in order to collect elements * returned by the dictionary iterator into a list. */ -void dictScanCallback(void *privdata, const dictEntry *de) { - scanData *data = (scanData *)privdata; - list *keys = data->keys; - robj *o = data->o; - sds val = NULL; - sds key = NULL; - data->sampled++; - - /* This callback is only used for scanning elements within a key (hash - * fields, set elements, etc.) so o must be set here. */ - serverAssert(o != NULL); - - /* Filter element if it does not match the pattern. */ - sds keysds = dictGetKey(de); - if (data->pattern) { - if (!stringmatchlen(data->pattern, sdslen(data->pattern), keysds, sdslen(keysds), 0)) { - return; - } - } - - if (o->type == OBJ_HASH) { - key = keysds; - if (!data->only_keys) { - val = dictGetVal(de); - } - } else { - serverPanic("Type not handled in dict SCAN callback."); - } - - listAddNodeTail(keys, key); - if (val) listAddNodeTail(keys, val); -} - void hashtableScanCallback(void *privdata, void *entry) { scanData *data = (scanData *)privdata; sds val = NULL; @@ -1032,14 +999,20 @@ void hashtableScanCallback(void *privdata, void *entry) { * fields, set elements, etc.) so o must be set here. */ serverAssert(o != NULL); - /* get key */ + /* get key, value */ if (o->type == OBJ_SET) { key = (sds)entry; } else if (o->type == OBJ_ZSET) { zskiplistNode *node = (zskiplistNode *)entry; key = node->ele; + /* zset data is copied after filtering by key */ + } else if (o->type == OBJ_HASH) { + key = hashTypeEntryGetField(entry); + if (!data->only_keys) { + val = hashTypeEntryGetValue(entry); + } } else { - serverPanic("Type not handled in hashset SCAN callback."); + serverPanic("Type not handled in hashtable SCAN callback."); } /* Filter element if it does not match the pattern. */ @@ -1049,9 +1022,9 @@ void hashtableScanCallback(void *privdata, void *entry) { } } - if (o->type == OBJ_SET) { - /* no value, key used by reference */ - } else if (o->type == OBJ_ZSET) { + /* zset data must be copied. 
Do this after filtering to avoid unneeded + * allocations. */ + if (o->type == OBJ_ZSET) { /* zset data is copied */ zskiplistNode *node = (zskiplistNode *)entry; key = sdsdup(node->ele); @@ -1060,8 +1033,6 @@ void hashtableScanCallback(void *privdata, void *entry) { int len = ld2string(buf, sizeof(buf), node->score, LD_STR_AUTO); val = sdsnewlen(buf, len); } - } else { - serverPanic("Type not handled in hashset SCAN callback."); } listAddNodeTail(keys, key); @@ -1200,20 +1171,19 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * cursor to zero to signal the end of the iteration. */ /* Handle the case of kvstore, dict or hashtable. */ - dict *dict_table = NULL; - hashtable *hashtable_table = NULL; + hashtable *ht = NULL; int shallow_copied_list_items = 0; if (o == NULL) { shallow_copied_list_items = 1; } else if (o->type == OBJ_SET && o->encoding == OBJ_ENCODING_HASHTABLE) { - hashtable_table = o->ptr; + ht = o->ptr; shallow_copied_list_items = 1; - } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HT) { - dict_table = o->ptr; + } else if (o->type == OBJ_HASH && o->encoding == OBJ_ENCODING_HASHTABLE) { + ht = o->ptr; shallow_copied_list_items = 1; } else if (o->type == OBJ_ZSET && o->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = o->ptr; - hashtable_table = zs->ht; + ht = zs->ht; /* scanning ZSET allocates temporary strings even though it's a dict */ shallow_copied_list_items = 0; } @@ -1227,7 +1197,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { } /* For main hash table scan or scannable data structure. 
*/ - if (!o || dict_table || hashtable_table) { + if (!o || ht) { /* We set the max number of iterations to ten times the specified * COUNT, so if the hash table is in a pathological state (very * sparsely populated) we avoid to block too much time at the cost @@ -1267,10 +1237,8 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * If cursor is empty, we should try exploring next non-empty slot. */ if (o == NULL) { cursor = kvstoreScan(c->db->keys, cursor, onlydidx, keysScanCallback, NULL, &data); - } else if (dict_table) { - cursor = dictScan(dict_table, cursor, dictScanCallback, &data); } else { - cursor = hashtableScan(hashtable_table, cursor, hashtableScanCallback, &data); + cursor = hashtableScan(ht, cursor, hashtableScanCallback, &data); } } while (cursor && maxiterations-- && data.sampled < count); } else if (o->type == OBJ_SET) { diff --git a/src/debug.c b/src/debug.c index c80ff5af39..915e0c264d 100644 --- a/src/debug.c +++ b/src/debug.c @@ -231,7 +231,7 @@ void xorObjectDigest(serverDb *db, robj *keyobj, unsigned char *digest, robj *o) sds sdsele; memset(eledigest, 0, 20); - sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); + sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_FIELD); mixDigest(eledigest, sdsele, sdslen(sdsele)); sdsfree(sdsele); sdsele = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); @@ -923,23 +923,17 @@ void debugCommand(client *c) { robj *o = objectCommandLookupOrReply(c, c->argv[2], shared.nokeyerr); if (o == NULL) return; - /* Get the dict reference from the object, if possible. */ - dict *d = NULL; + /* Get the hashtable reference from the object, if possible. 
*/ hashtable *ht = NULL; switch (o->encoding) { case OBJ_ENCODING_SKIPLIST: { zset *zs = o->ptr; ht = zs->ht; } break; - case OBJ_ENCODING_HT: d = o->ptr; break; case OBJ_ENCODING_HASHTABLE: ht = o->ptr; break; } - if (d != NULL) { - char buf[4096]; - dictGetStats(buf, sizeof(buf), d, full); - addReplyVerbatim(c, buf, strlen(buf), "txt"); - } else if (ht != NULL) { + if (ht != NULL) { char buf[4096]; hashtableGetStats(buf, sizeof(buf), ht, full); addReplyVerbatim(c, buf, strlen(buf), "txt"); diff --git a/src/defrag.c b/src/defrag.c index 103730ee14..fb98da96c7 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -373,13 +373,6 @@ void activeDefragSdsHashtableCallback(void *privdata, void *entry_ref) { if (new_sds != NULL) *sds_ref = new_sds; } -void activeDefragSdsHashtable(hashtable *ht) { - unsigned long cursor = 0; - do { - cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); - } while (cursor != 0); -} - /* Defrag a list of ptr, sds or robj string values */ static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; @@ -481,26 +474,25 @@ static void scanHashtableCallbackCountScanned(void *privdata, void *elemref) { server.stat_active_defrag_scanned++; } -/* Used as dict scan callback when all the work is done in the dictDefragFunctions. 
*/ -static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { - UNUSED(privdata); - UNUSED(de); - server.stat_active_defrag_scanned++; -} - static void scanLaterSet(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HASHTABLE) return; hashtable *ht = ob->ptr; *cursor = hashtableScanDefrag(ht, *cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } +/* Hashtable scan callback for hash datatype */ +static void activeDefragHashTypeEntry(void *privdata, void *element_ref) { + UNUSED(privdata); + hashTypeEntry **entry_ref = (hashTypeEntry **)element_ref; + + hashTypeEntry *new_entry = hashTypeEntryDefrag(*entry_ref, activeDefragAlloc, activeDefragSds); + if (new_entry) *entry_ref = new_entry; +} + static void scanLaterHash(robj *ob, unsigned long *cursor) { - if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; - dict *d = ob->ptr; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragKey = (dictDefragAllocFunction *)activeDefragSds, - .defragVal = (dictDefragAllocFunction *)activeDefragSds}; - *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); + if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HASHTABLE) return; + hashtable *ht = ob->ptr; + *cursor = hashtableScanDefrag(ht, *cursor, activeDefragHashTypeEntry, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); } static void defragQuicklist(robj *ob) { @@ -538,15 +530,19 @@ static void defragZsetSkiplist(robj *ob) { } static void defragHash(robj *ob) { - dict *d, *newd; - serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); - d = ob->ptr; - if (dictSize(d) > server.active_defrag_max_scan_fields) + serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HASHTABLE); + hashtable *ht = ob->ptr; + if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); - else - activeDefragSdsDict(d, 
DEFRAG_SDS_DICT_VAL_IS_SDS); - /* defrag the dict struct and tables */ - if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; + } else { + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragHashTypeEntry, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); + } + /* defrag the hashtable struct and tables */ + hashtable *new_hashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (new_hashtable) ob->ptr = new_hashtable; } static void defragSet(robj *ob) { @@ -555,11 +551,14 @@ static void defragSet(robj *ob) { if (hashtableSize(ht) > server.active_defrag_max_scan_fields) { defragLater(ob); } else { - activeDefragSdsHashtable(ht); + unsigned long cursor = 0; + do { + cursor = hashtableScanDefrag(ht, cursor, activeDefragSdsHashtableCallback, NULL, activeDefragAlloc, HASHTABLE_SCAN_EMIT_REF); + } while (cursor != 0); } /* defrag the hashtable struct and tables */ - hashtable *newHashtable = hashtableDefragTables(ht, activeDefragAlloc); - if (newHashtable) ob->ptr = newHashtable; + hashtable *new_hashtable = hashtableDefragTables(ht, activeDefragAlloc); + if (new_hashtable) ob->ptr = new_hashtable; } /* Defrag callback for radix tree iterator, called for each node, @@ -776,7 +775,7 @@ static void defragKey(defragKeysCtx *ctx, robj **elemref) { } else if (ob->type == OBJ_HASH) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; - } else if (ob->encoding == OBJ_ENCODING_HT) { + } else if (ob->encoding == OBJ_ENCODING_HASHTABLE) { defragHash(ob); } else { serverPanic("Unknown hash encoding"); diff --git a/src/lazyfree.c b/src/lazyfree.c index c22d3da964..3b061ccd84 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -123,9 +123,9 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { } else if (obj->type == OBJ_ZSET && obj->encoding == OBJ_ENCODING_SKIPLIST) { zset *zs = obj->ptr; return zs->zsl->length; - } else if (obj->type == OBJ_HASH 
&& obj->encoding == OBJ_ENCODING_HT) { - dict *ht = obj->ptr; - return dictSize(ht); + } else if (obj->type == OBJ_HASH && obj->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = obj->ptr; + return hashtableSize(ht); } else if (obj->type == OBJ_STREAM) { size_t effort = 0; stream *s = obj->ptr; diff --git a/src/module.c b/src/module.c index 76c362b758..fa60335837 100644 --- a/src/module.c +++ b/src/module.c @@ -11090,25 +11090,6 @@ typedef struct { ValkeyModuleScanKeyCB fn; } ScanKeyCBData; -static void moduleScanKeyDictCallback(void *privdata, const dictEntry *de) { - ScanKeyCBData *data = privdata; - sds key = dictGetKey(de); - robj *o = data->key->value; - robj *field = createStringObject(key, sdslen(key)); - robj *value = NULL; - - if (o->type == OBJ_HASH) { - sds val = dictGetVal(de); - value = createStringObject(val, sdslen(val)); - } else { - serverPanic("unexpected object type"); - } - - data->fn(data->key, field, value, data->user_data); - decrRefCount(field); - if (value) decrRefCount(value); -} - static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { ScanKeyCBData *data = privdata; robj *o = data->key->value; @@ -11122,6 +11103,10 @@ static void moduleScanKeyHashtableCallback(void *privdata, void *entry) { zskiplistNode *node = (zskiplistNode *)entry; key = node->ele; value = createStringObjectFromLongDouble(node->score, 0); + } else if (o->type == OBJ_HASH) { + key = hashTypeEntryGetField(entry); + sds val = hashTypeEntryGetValue(entry); + value = createStringObject(val, sdslen(val)); } else { serverPanic("unexpected object type"); } @@ -11185,13 +11170,12 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul errno = EINVAL; return 0; } - dict *d = NULL; hashtable *ht = NULL; robj *o = key->value; if (o->type == OBJ_SET) { if (o->encoding == OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_HASH) { - if (o->encoding == OBJ_ENCODING_HT) d = o->ptr; + if (o->encoding == 
OBJ_ENCODING_HASHTABLE) ht = o->ptr; } else if (o->type == OBJ_ZSET) { if (o->encoding == OBJ_ENCODING_SKIPLIST) ht = ((zset *)o->ptr)->ht; } else { @@ -11203,14 +11187,7 @@ int VM_ScanKey(ValkeyModuleKey *key, ValkeyModuleScanCursor *cursor, ValkeyModul return 0; } int ret = 1; - if (d) { - ScanKeyCBData data = {key, privdata, fn}; - cursor->cursor = dictScan(d, cursor->cursor, moduleScanKeyDictCallback, &data); - if (cursor->cursor == 0) { - cursor->done = 1; - ret = 0; - } - } else if (ht) { + if (ht) { ScanKeyCBData data = {key, privdata, fn}; cursor->cursor = hashtableScan(ht, cursor->cursor, moduleScanKeyHashtableCallback, &data); if (cursor->cursor == 0) { diff --git a/src/object.c b/src/object.c index 86eefe43a3..b8200dd815 100644 --- a/src/object.c +++ b/src/object.c @@ -530,7 +530,7 @@ void freeZsetObject(robj *o) { void freeHashObject(robj *o) { switch (o->encoding) { - case OBJ_ENCODING_HT: dictRelease((dict *)o->ptr); break; + case OBJ_ENCODING_HASHTABLE: hashtableRelease((hashtable *)o->ptr); break; case OBJ_ENCODING_LISTPACK: lpFree(o->ptr); break; default: serverPanic("Unknown hash encoding type"); break; } @@ -675,25 +675,22 @@ void dismissZsetObject(robj *o, size_t size_hint) { /* See dismissObject() */ void dismissHashObject(robj *o, size_t size_hint) { - if (o->encoding == OBJ_ENCODING_HT) { - dict *d = o->ptr; - serverAssert(dictSize(d) != 0); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + serverAssert(hashtableSize(ht) != 0); /* We iterate all fields only when average field/value size is bigger than * a page size, and there's a high chance we'll actually dismiss something. */ - if (size_hint / dictSize(d) >= server.page_size) { - dictEntry *de; - dictIterator *di = dictGetIterator(d); - while ((de = dictNext(di)) != NULL) { - /* Only dismiss values memory since the field size - * usually is small. 
*/ - dismissSds(dictGetVal(de)); + if (size_hint / hashtableSize(ht) >= server.page_size) { + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + dismissHashTypeEntry(next); } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } - /* Dismiss hash table memory. */ - dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry *)); - dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry *)); + dismissHashtable(ht); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char *)o->ptr)); } else { @@ -1106,7 +1103,6 @@ char *strEncoding(int encoding) { switch (encoding) { case OBJ_ENCODING_RAW: return "raw"; case OBJ_ENCODING_INT: return "int"; - case OBJ_ENCODING_HT: return "hashtable"; case OBJ_ENCODING_HASHTABLE: return "hashtable"; case OBJ_ENCODING_QUICKLIST: return "quicklist"; case OBJ_ENCODING_LISTPACK: return "listpack"; @@ -1127,10 +1123,6 @@ char *strEncoding(int encoding) { * are checked and averaged to estimate the total size. */ #define OBJ_COMPUTE_SIZE_DEF_SAMPLES 5 /* Default sample size. 
*/ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { - sds ele, ele2; - dict *d; - dictIterator *di; - struct dictEntry *de; size_t asize = 0, elesize = 0, samples = 0; if (o->type == OBJ_STRING) { @@ -1202,19 +1194,19 @@ size_t objectComputeSize(robj *key, robj *o, size_t sample_size, int dbid) { } else if (o->type == OBJ_HASH) { if (o->encoding == OBJ_ENCODING_LISTPACK) { asize = sizeof(*o) + zmalloc_size(o->ptr); - } else if (o->encoding == OBJ_ENCODING_HT) { - d = o->ptr; - di = dictGetIterator(d); - asize = sizeof(*o) + sizeof(dict) + (sizeof(struct dictEntry *) * dictBuckets(d)); - while ((de = dictNext(di)) != NULL && samples < sample_size) { - ele = dictGetKey(de); - ele2 = dictGetVal(de); - elesize += sdsAllocSize(ele) + sdsAllocSize(ele2); - elesize += dictEntryMemUsage(de); + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + + asize = sizeof(*o) + hashtableMemUsage(ht); + while (hashtableNext(&iter, &next) && samples < sample_size) { + elesize += hashTypeEntryAllocSize(next); samples++; } - dictReleaseIterator(di); - if (samples) asize += (double)elesize / samples * dictSize(d); + hashtableResetIterator(&iter); + if (samples) asize += (double)elesize / samples * hashtableSize(ht); } else { serverPanic("Unknown hash encoding"); } diff --git a/src/rdb.c b/src/rdb.c index 6a2ec78d71..0bb5d7d45d 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -710,7 +710,7 @@ int rdbSaveObjectType(rio *rdb, robj *o) { case OBJ_HASH: if (o->encoding == OBJ_ENCODING_LISTPACK) return rdbSaveType(rdb, RDB_TYPE_HASH_LISTPACK); - else if (o->encoding == OBJ_ENCODING_HT) + else if (o->encoding == OBJ_ENCODING_HASHTABLE) return rdbSaveType(rdb, RDB_TYPE_HASH); else serverPanic("Unknown hash encoding"); @@ -950,32 +950,33 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { if ((n = rdbSaveRawString(rdb, o->ptr, l)) == -1) return -1; nwritten += 
n; - } else if (o->encoding == OBJ_ENCODING_HT) { - dictIterator *di = dictGetIterator(o->ptr); - dictEntry *de; + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; - if ((n = rdbSaveLen(rdb, dictSize((dict *)o->ptr))) == -1) { - dictReleaseIterator(di); + if ((n = rdbSaveLen(rdb, hashtableSize(ht))) == -1) { return -1; } nwritten += n; - while ((de = dictNext(di)) != NULL) { - sds field = dictGetKey(de); - sds value = dictGetVal(de); + hashtableIterator iter; + hashtableInitIterator(&iter, ht); + void *next; + while (hashtableNext(&iter, &next)) { + sds field = hashTypeEntryGetField(next); + sds value = hashTypeEntryGetValue(next); if ((n = rdbSaveRawString(rdb, (unsigned char *)field, sdslen(field))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iter); return -1; } nwritten += n; if ((n = rdbSaveRawString(rdb, (unsigned char *)value, sdslen(value))) == -1) { - dictReleaseIterator(di); + hashtableResetIterator(&iter); return -1; } nwritten += n; } - dictReleaseIterator(di); + hashtableResetIterator(&iter); } else { serverPanic("Unknown hash encoding"); } @@ -2063,7 +2064,6 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } } else if (rdbtype == RDB_TYPE_HASH) { uint64_t len; - int ret; sds field, value; dict *dupSearchDict = NULL; @@ -2075,10 +2075,10 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Too many entries? Use a hash table right from the start. */ if (len > server.hash_max_listpack_entries) - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); else if (deep_integrity_validation) { /* In this mode, we need to guarantee that the server won't crash - * later when the ziplist is converted to a dict. + * later when the ziplist is converted to a hashtable. * Create a set (dict with no values) to for a dup search. * We can dismiss it as soon as we convert the ziplist to a hash. 
*/ dupSearchDict = dictCreate(&hashDictType); @@ -2117,13 +2117,13 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { /* Convert to hash table if size threshold is exceeded */ if (sdslen(field) > server.hash_max_listpack_value || sdslen(value) > server.hash_max_listpack_value || !lpSafeToAdd(o->ptr, sdslen(field) + sdslen(value))) { - hashTypeConvert(o, OBJ_ENCODING_HT); - ret = dictAdd((dict *)o->ptr, field, value); - if (ret == DICT_ERR) { + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); + hashTypeEntry *entry = hashTypeCreateEntry(field, value); + sdsfree(field); + if (!hashtableAdd((hashtable *)o->ptr, entry)) { rdbReportCorruptRDB("Duplicate hash fields detected"); if (dupSearchDict) dictRelease(dupSearchDict); - sdsfree(value); - sdsfree(field); + freeHashTypeEntry(entry); decrRefCount(o); return NULL; } @@ -2145,16 +2145,16 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { dupSearchDict = NULL; } - if (o->encoding == OBJ_ENCODING_HT && len > DICT_HT_INITIAL_SIZE) { - if (dictTryExpand(o->ptr, len) != DICT_OK) { - rdbReportCorruptRDB("OOM in dictTryExpand %llu", (unsigned long long)len); + if (o->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableTryExpand(o->ptr, len)) { + rdbReportCorruptRDB("OOM in hashtableTryExpand %llu", (unsigned long long)len); decrRefCount(o); return NULL; } } /* Load remaining fields and values into the hash table */ - while (o->encoding == OBJ_ENCODING_HT && len > 0) { + while (o->encoding == OBJ_ENCODING_HASHTABLE && len > 0) { len--; /* Load encoded strings */ if ((field = rdbGenericLoadStringObject(rdb, RDB_LOAD_SDS, NULL)) == NULL) { @@ -2168,11 +2168,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } /* Add pair to hash table */ - ret = dictAdd((dict *)o->ptr, field, value); - if (ret == DICT_ERR) { + hashTypeEntry *entry = hashTypeCreateEntry(field, value); + sdsfree(field); + if (!hashtableAdd((hashtable *)o->ptr, entry)) { 
rdbReportCorruptRDB("Duplicate hash fields detected"); - sdsfree(value); - sdsfree(field); + freeHashTypeEntry(entry); decrRefCount(o); return NULL; } @@ -2317,7 +2317,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { o->encoding = OBJ_ENCODING_LISTPACK; if (hashTypeLength(o) > server.hash_max_listpack_entries || maxlen > server.hash_max_listpack_value) { - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); } } break; @@ -2445,7 +2445,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { } if (hashTypeLength(o) > server.hash_max_listpack_entries) - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); else o->ptr = lpShrinkToFit(o->ptr); break; @@ -2466,7 +2466,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) { goto emptykey; } - if (hashTypeLength(o) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HT); + if (hashTypeLength(o) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); break; default: /* totally unreachable */ diff --git a/src/server.c b/src/server.c index 59766cccda..66101ea839 100644 --- a/src/server.c +++ b/src/server.c @@ -624,6 +624,24 @@ hashtableType subcommandSetType = {.entryGetKey = hashtableSubcommandGetKey, .keyCompare = hashtableStringKeyCaseCompare, .instant_rehashing = 1}; +/* Hash type hash table (note that small hashes are represented with listpacks) */ +const void *hashHashtableTypeGetKey(const void *entry) { + const hashTypeEntry *hash_entry = entry; + return (const void *)hashTypeEntryGetField(hash_entry); +} + +void hashHashtableTypeDestructor(void *entry) { + hashTypeEntry *hash_entry = entry; + freeHashTypeEntry(hash_entry); +} + +hashtableType hashHashtableType = { + .hashFunction = dictSdsHash, + .entryGetKey = hashHashtableTypeGetKey, + .keyCompare = hashtableSdsKeyCompare, + .entryDestructor = hashHashtableTypeDestructor, +}; + /* Hash 
type hash table (note that small hashes are represented with listpacks) */ dictType hashDictType = { dictSdsHash, /* hash function */ diff --git a/src/server.h b/src/server.h index 2293f3baf1..6d87b46aed 100644 --- a/src/server.h +++ b/src/server.h @@ -712,7 +712,7 @@ typedef struct ValkeyModuleType moduleType; * is set to one of this fields for this object. */ #define OBJ_ENCODING_RAW 0 /* Raw representation */ #define OBJ_ENCODING_INT 1 /* Encoded as integer */ -#define OBJ_ENCODING_HT 2 /* Encoded as hash table */ +#define OBJ_ENCODING_HASHTABLE 2 /* Encoded as a hashtable */ #define OBJ_ENCODING_ZIPMAP 3 /* No longer used: old hash encoding. */ #define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */ #define OBJ_ENCODING_ZIPLIST 5 /* No longer used: old list/hash/zset encoding. */ @@ -722,7 +722,6 @@ typedef struct ValkeyModuleType moduleType; #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */ #define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */ -#define OBJ_ENCODING_HASHTABLE 12 /* Encoded as a hashtable */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1 << LRU_BITS) - 1) /* Max value of obj->lru */ @@ -2521,13 +2520,13 @@ typedef struct { unsigned char *fptr, *vptr; - dictIterator di; - dictEntry *de; + hashtableIterator iter; + void *next; } hashTypeIterator; #include "stream.h" /* Stream data type header file. 
*/ -#define OBJ_HASH_KEY 1 +#define OBJ_HASH_FIELD 1 #define OBJ_HASH_VALUE 2 /*----------------------------------------------------------------------------- @@ -2545,6 +2544,7 @@ extern hashtableType kvstoreKeysHashtableType; extern hashtableType kvstoreExpiresHashtableType; extern double R_Zero, R_PosInf, R_NegInf, R_Nan; extern dictType hashDictType; +extern hashtableType hashHashtableType; extern dictType stringSetDictType; extern dictType externalStringType; extern dictType sdsHashDictType; @@ -3234,6 +3234,15 @@ robj *setTypeDup(robj *o); #define HASH_SET_TAKE_VALUE (1 << 1) #define HASH_SET_COPY 0 +typedef struct hashTypeEntry hashTypeEntry; +hashTypeEntry *hashTypeCreateEntry(sds field, sds value); +sds hashTypeEntryGetField(const hashTypeEntry *entry); +sds hashTypeEntryGetValue(const hashTypeEntry *entry); +size_t hashTypeEntryAllocSize(hashTypeEntry *entry); +hashTypeEntry *hashTypeEntryDefrag(hashTypeEntry *entry, void *(*defragfn)(void *), sds (*sdsdefragfn)(sds)); +void dismissHashTypeEntry(hashTypeEntry *entry); +void freeHashTypeEntry(hashTypeEntry *entry); + void hashTypeConvert(robj *o, int enc); void hashTypeTryConversion(robj *subject, robj **argv, int start, int end); int hashTypeExists(robj *o, sds key); @@ -3248,7 +3257,6 @@ void hashTypeCurrentFromListpack(hashTypeIterator *hi, unsigned int *vlen, long long *vll); sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what); -void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll); sds hashTypeCurrentObjectNewSds(hashTypeIterator *hi, int what); robj *hashTypeLookupWriteOrCreate(client *c, robj *key); robj *hashTypeGetValueObject(robj *o, sds field); diff --git a/src/t_hash.c b/src/t_hash.c index 1aa37968b7..b6e6457bb6 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -30,6 +30,80 @@ #include "server.h" #include +/*----------------------------------------------------------------------------- + * Hash Entry API + 
*----------------------------------------------------------------------------*/ + +struct hashTypeEntry { + sds value; + unsigned char field_offset; + unsigned char field_data[]; +}; + +/* takes ownership of value, does not take ownership of field */ +hashTypeEntry *hashTypeCreateEntry(sds field, sds value) { + size_t field_size = sdscopytobuffer(NULL, 0, field, NULL); + + size_t total_size = sizeof(hashTypeEntry) + field_size; + hashTypeEntry *entry = zmalloc(total_size); + + entry->value = value; + sdscopytobuffer(entry->field_data, field_size, field, &entry->field_offset); + return entry; +} + +sds hashTypeEntryGetField(const hashTypeEntry *entry) { + const unsigned char *field = entry->field_data + entry->field_offset; + return (sds)field; +} + +sds hashTypeEntryGetValue(const hashTypeEntry *entry) { + return entry->value; +} + +/* frees previous value, takes ownership of new value */ +static void hashTypeEntryReplaceValue(hashTypeEntry *entry, sds value) { + sdsfree(entry->value); + entry->value = value; +} + +/* Returns allocation size of hashTypeEntry and data owned by hashTypeEntry, + * even if not embedded in the same allocation. */ +size_t hashTypeEntryAllocSize(hashTypeEntry *entry) { + size_t size = zmalloc_usable_size(entry); + size += sdsAllocSize(entry->value); + return size; +} + +/* Defragments a hashtable entry (field-value pair) if needed, using the + * provided defrag functions. The defrag functions return NULL if the allocation + * was not moved, otherwise they return a pointer to the new memory location. + * A separate sds defrag function is needed because of the unique memory layout + * of sds strings. + * If the location of the hashTypeEntry changed we return the new location, + * otherwise we return NULL. 
*/ +hashTypeEntry *hashTypeEntryDefrag(hashTypeEntry *entry, void *(*defragfn)(void *), sds (*sdsdefragfn)(sds)) { + hashTypeEntry *new_entry = defragfn(entry); + if (new_entry) entry = new_entry; + + sds new_value = sdsdefragfn(entry->value); + if (new_value) entry->value = new_value; + + return new_entry; +} + +/* Used for releasing memory to OS to avoid unnecessary CoW. Called when we've + * forked and memory won't be used again. See zmadvise_dontneed() */ +void dismissHashTypeEntry(hashTypeEntry *entry) { + /* Only dismiss values memory since the field size usually is small. */ + dismissSds(entry->value); +} + +void freeHashTypeEntry(hashTypeEntry *entry) { + sdsfree(entry->value); + zfree(entry); +} + /*----------------------------------------------------------------------------- * Hash type API *----------------------------------------------------------------------------*/ @@ -48,8 +122,8 @@ void hashTypeTryConversion(robj *o, robj **argv, int start, int end) { * might over allocate memory if there are duplicates. */ size_t new_fields = (end - start + 1) / 2; if (new_fields > server.hash_max_listpack_entries) { - hashTypeConvert(o, OBJ_ENCODING_HT); - dictExpand(o->ptr, new_fields); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); + hashtableExpand(o->ptr, new_fields); return; } @@ -57,12 +131,12 @@ void hashTypeTryConversion(robj *o, robj **argv, int start, int end) { if (!sdsEncodedObject(argv[i])) continue; size_t len = sdslen(argv[i]->ptr); if (len > server.hash_max_listpack_value) { - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); return; } sum += len; } - if (!lpSafeToAdd(o->ptr, sum)) hashTypeConvert(o, OBJ_ENCODING_HT); + if (!lpSafeToAdd(o->ptr, sum)) hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); } /* Get the value from a listpack encoded hash, identified by field. 
@@ -95,13 +169,10 @@ int hashTypeGetFromListpack(robj *o, sds field, unsigned char **vstr, unsigned i * Returns NULL when the field cannot be found, otherwise the SDS value * is returned. */ sds hashTypeGetFromHashTable(robj *o, sds field) { - dictEntry *de; - - serverAssert(o->encoding == OBJ_ENCODING_HT); - - de = dictFind(o->ptr, field); - if (de == NULL) return NULL; - return dictGetVal(de); + serverAssert(o->encoding == OBJ_ENCODING_HASHTABLE); + void *found_element; + if (!hashtableFind(o->ptr, field, &found_element)) return NULL; + return hashTypeEntryGetValue(found_element); } /* Higher level function of hashTypeGet*() that returns the hash value @@ -117,9 +188,9 @@ int hashTypeGetValue(robj *o, sds field, unsigned char **vstr, unsigned int *vle if (o->encoding == OBJ_ENCODING_LISTPACK) { *vstr = NULL; if (hashTypeGetFromListpack(o, field, vstr, vlen, vll) == 0) return C_OK; - } else if (o->encoding == OBJ_ENCODING_HT) { - sds value; - if ((value = hashTypeGetFromHashTable(o, field)) != NULL) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + sds value = hashTypeGetFromHashTable(o, field); + if (value != NULL) { *vstr = (unsigned char *)value; *vlen = sdslen(value); return C_OK; @@ -173,7 +244,7 @@ int hashTypeExists(robj *o, sds field) { /* Add a new field, overwrite the old with the new value if it already exists. * Return 0 on insert and 1 on update. * - * By default, the key and value SDS strings are copied if needed, so the + * By default, the field and value SDS strings are copied if needed, so the * caller retains ownership of the strings passed. However this behavior * can be effected by passing appropriate flags (possibly bitwise OR-ed): * @@ -199,7 +270,7 @@ int hashTypeSet(robj *o, sds field, sds value, int flags) { * hashTypeTryConversion, so this check will be a NOP. 
*/ if (o->encoding == OBJ_ENCODING_LISTPACK) { if (sdslen(field) > server.hash_max_listpack_value || sdslen(value) > server.hash_max_listpack_value) - hashTypeConvert(o, OBJ_ENCODING_HT); + hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); } if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -228,10 +299,10 @@ int hashTypeSet(robj *o, sds field, sds value, int flags) { o->ptr = zl; /* Check if the listpack needs to be converted to a hash table */ - if (hashTypeLength(o) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HT); - } else if (o->encoding == OBJ_ENCODING_HT) { - dict *ht = o->ptr; - dictEntry *de, *existing; + if (hashTypeLength(o) > server.hash_max_listpack_entries) hashTypeConvert(o, OBJ_ENCODING_HASHTABLE); + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + sds v; if (flags & HASH_SET_TAKE_VALUE) { v = value; @@ -239,17 +310,16 @@ int hashTypeSet(robj *o, sds field, sds value, int flags) { } else { v = sdsdup(value); } - de = dictAddRaw(ht, field, &existing); - if (de) { - dictSetVal(ht, de, v); - if (flags & HASH_SET_TAKE_FIELD) { - field = NULL; - } else { - dictSetKey(ht, de, sdsdup(field)); - } + + hashtablePosition position; + void *existing; + if (hashtableFindPositionForInsert(ht, field, &position, &existing)) { + /* does not exist yet */ + hashTypeEntry *entry = hashTypeCreateEntry(field, v); + hashtableInsertAtPosition(ht, entry, &position); } else { - sdsfree(dictGetVal(existing)); - dictSetVal(ht, existing, v); + /* exists: replace value */ + hashTypeEntryReplaceValue(existing, v); update = 1; } } else { @@ -276,17 +346,15 @@ int hashTypeDelete(robj *o, sds field) { if (fptr != NULL) { fptr = lpFind(zl, fptr, (unsigned char *)field, sdslen(field), 1); if (fptr != NULL) { - /* Delete both of the key and the value. */ + /* Delete both field and value. 
*/ zl = lpDeleteRangeWithEntry(zl, &fptr, 2); o->ptr = zl; deleted = 1; } } - } else if (o->encoding == OBJ_ENCODING_HT) { - if (dictDelete((dict *)o->ptr, field) == C_OK) { - deleted = 1; - } - + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = o->ptr; + deleted = hashtableDelete(ht, field); } else { serverPanic("Unknown hash encoding"); } @@ -295,16 +363,15 @@ int hashTypeDelete(robj *o, sds field) { /* Return the number of elements in a hash. */ unsigned long hashTypeLength(const robj *o) { - unsigned long length = ULONG_MAX; - - if (o->encoding == OBJ_ENCODING_LISTPACK) { - length = lpLength(o->ptr) / 2; - } else if (o->encoding == OBJ_ENCODING_HT) { - length = dictSize((const dict *)o->ptr); - } else { + switch (o->encoding) { + case OBJ_ENCODING_LISTPACK: + return lpLength(o->ptr) / 2; + case OBJ_ENCODING_HASHTABLE: + return hashtableSize((const hashtable *)o->ptr); + default: serverPanic("Unknown hash encoding"); + return ULONG_MAX; } - return length; } void hashTypeInitIterator(robj *subject, hashTypeIterator *hi) { @@ -314,15 +381,15 @@ void hashTypeInitIterator(robj *subject, hashTypeIterator *hi) { if (hi->encoding == OBJ_ENCODING_LISTPACK) { hi->fptr = NULL; hi->vptr = NULL; - } else if (hi->encoding == OBJ_ENCODING_HT) { - dictInitIterator(&hi->di, subject->ptr); + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { + hashtableInitIterator(&hi->iter, subject->ptr); } else { serverPanic("Unknown hash encoding"); } } void hashTypeResetIterator(hashTypeIterator *hi) { - if (hi->encoding == OBJ_ENCODING_HT) dictResetIterator(&hi->di); + if (hi->encoding == OBJ_ENCODING_HASHTABLE) hashtableResetIterator(&hi->iter); } /* Move to the next entry in the hash. 
Return C_OK when the next entry @@ -354,8 +421,8 @@ int hashTypeNext(hashTypeIterator *hi) { /* fptr, vptr now point to the first or next pair */ hi->fptr = fptr; hi->vptr = vptr; - } else if (hi->encoding == OBJ_ENCODING_HT) { - if ((hi->de = dictNext(&hi->di)) == NULL) return C_ERR; + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { + if (!hashtableNext(&hi->iter, &hi->next)) return C_ERR; } else { serverPanic("Unknown hash encoding"); } @@ -371,7 +438,7 @@ void hashTypeCurrentFromListpack(hashTypeIterator *hi, long long *vll) { serverAssert(hi->encoding == OBJ_ENCODING_LISTPACK); - if (what & OBJ_HASH_KEY) { + if (what & OBJ_HASH_FIELD) { *vstr = lpGetValue(hi->fptr, vlen, vll); } else { *vstr = lpGetValue(hi->vptr, vlen, vll); @@ -382,12 +449,12 @@ void hashTypeCurrentFromListpack(hashTypeIterator *hi, * encoded as a hash table. Prototype is similar to * `hashTypeGetFromHashTable`. */ sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what) { - serverAssert(hi->encoding == OBJ_ENCODING_HT); + serverAssert(hi->encoding == OBJ_ENCODING_HASHTABLE); - if (what & OBJ_HASH_KEY) { - return dictGetKey(hi->de); + if (what & OBJ_HASH_FIELD) { + return hashTypeEntryGetField(hi->next); } else { - return dictGetVal(hi->de); + return hashTypeEntryGetValue(hi->next); } } @@ -401,11 +468,11 @@ sds hashTypeCurrentFromHashTable(hashTypeIterator *hi, int what) { * If *vll is populated *vstr is set to NULL, so the caller * can always check the function return by checking the return value * type checking if vstr == NULL. 
*/ -void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll) { +static void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, unsigned int *vlen, long long *vll) { if (hi->encoding == OBJ_ENCODING_LISTPACK) { *vstr = NULL; hashTypeCurrentFromListpack(hi, what, vstr, vlen, vll); - } else if (hi->encoding == OBJ_ENCODING_HT) { + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { sds ele = hashTypeCurrentFromHashTable(hi, what); *vstr = (unsigned char *)ele; *vlen = sdslen(ele); @@ -414,7 +481,7 @@ void hashTypeCurrentObject(hashTypeIterator *hi, int what, unsigned char **vstr, } } -/* Return the key or value at the current iterator position as a new +/* Return the field or value at the current iterator position as a new * SDS string. */ sds hashTypeCurrentObjectNewSds(hashTypeIterator *hi, int what) { unsigned char *vstr; @@ -444,26 +511,22 @@ void hashTypeConvertListpack(robj *o, int enc) { if (enc == OBJ_ENCODING_LISTPACK) { /* Nothing to do... 
*/ - } else if (enc == OBJ_ENCODING_HT) { + } else if (enc == OBJ_ENCODING_HASHTABLE) { hashTypeIterator hi; - dict *dict; - int ret; - hashTypeInitIterator(o, &hi); - dict = dictCreate(&hashDictType); + hashtable *ht = hashtableCreate(&hashHashtableType); - /* Presize the dict to avoid rehashing */ - dictExpand(dict, hashTypeLength(o)); + /* Presize the hashtable to avoid rehashing */ + hashtableExpand(ht, hashTypeLength(o)); + hashTypeInitIterator(o, &hi); while (hashTypeNext(&hi) != C_ERR) { - sds key, value; - - key = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); - value = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); - ret = dictAdd(dict, key, value); - if (ret != DICT_OK) { - sdsfree(key); - sdsfree(value); /* Needed for gcc ASAN */ + sds field = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_FIELD); + sds value = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); + hashTypeEntry *entry = hashTypeCreateEntry(field, value); + sdsfree(field); + if (!hashtableAdd(ht, entry)) { + freeHashTypeEntry(entry); hashTypeResetIterator(&hi); /* Needed for gcc ASAN */ serverLogHexDump(LL_WARNING, "listpack with dup elements dump", o->ptr, lpBytes(o->ptr)); serverPanic("Listpack corruption detected"); @@ -471,8 +534,8 @@ void hashTypeConvertListpack(robj *o, int enc) { } hashTypeResetIterator(&hi); zfree(o->ptr); - o->encoding = OBJ_ENCODING_HT; - o->ptr = dict; + o->encoding = OBJ_ENCODING_HASHTABLE; + o->ptr = ht; } else { serverPanic("Unknown hash encoding"); } @@ -481,7 +544,7 @@ void hashTypeConvertListpack(robj *o, int enc) { void hashTypeConvert(robj *o, int enc) { if (o->encoding == OBJ_ENCODING_LISTPACK) { hashTypeConvertListpack(o, enc); - } else if (o->encoding == OBJ_ENCODING_HT) { + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { serverPanic("Not implemented"); } else { serverPanic("Unknown hash encoding"); @@ -506,27 +569,24 @@ robj *hashTypeDup(robj *o) { memcpy(new_zl, zl, sz); hobj = createObject(OBJ_HASH, new_zl); hobj->encoding = 
OBJ_ENCODING_LISTPACK; - } else if (o->encoding == OBJ_ENCODING_HT) { - dict *d = dictCreate(&hashDictType); - dictExpand(d, dictSize((const dict *)o->ptr)); + } else if (o->encoding == OBJ_ENCODING_HASHTABLE) { + hashtable *ht = hashtableCreate(&hashHashtableType); + hashtableExpand(ht, hashtableSize((const hashtable *)o->ptr)); hashTypeInitIterator(o, &hi); while (hashTypeNext(&hi) != C_ERR) { - sds field, value; - sds newfield, newvalue; /* Extract a field-value pair from an original hash object.*/ - field = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_KEY); - value = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_VALUE); - newfield = sdsdup(field); - newvalue = sdsdup(value); + sds field = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_FIELD); + sds value = hashTypeCurrentFromHashTable(&hi, OBJ_HASH_VALUE); /* Add a field-value pair to a new hash object. */ - dictAdd(d, newfield, newvalue); + hashTypeEntry *entry = hashTypeCreateEntry(field, sdsdup(value)); + hashtableAdd(ht, entry); } hashTypeResetIterator(&hi); - hobj = createObject(OBJ_HASH, d); - hobj->encoding = OBJ_ENCODING_HT; + hobj = createObject(OBJ_HASH, ht); + hobj->encoding = OBJ_ENCODING_HASHTABLE; } else { serverPanic("Unknown hash encoding"); } @@ -547,22 +607,23 @@ void hashReplyFromListpackEntry(client *c, listpackEntry *e) { } /* Return random element from a non empty hash. - * 'key' and 'val' will be set to hold the element. + * 'field' and 'val' will be set to hold the element. * The memory in them is not to be freed or modified by the caller. * 'val' can be NULL in which case it's not extracted. 
*/ -void hashTypeRandomElement(robj *hashobj, unsigned long hashsize, listpackEntry *key, listpackEntry *val) { - if (hashobj->encoding == OBJ_ENCODING_HT) { - dictEntry *de = dictGetFairRandomKey(hashobj->ptr); - sds s = dictGetKey(de); - key->sval = (unsigned char *)s; - key->slen = sdslen(s); +static void hashTypeRandomElement(robj *hashobj, unsigned long hashsize, listpackEntry *field, listpackEntry *val) { + if (hashobj->encoding == OBJ_ENCODING_HASHTABLE) { + void *entry; + hashtableFairRandomEntry(hashobj->ptr, &entry); + sds sds_field = hashTypeEntryGetField(entry); + field->sval = (unsigned char *)sds_field; + field->slen = sdslen(sds_field); if (val) { - sds s = dictGetVal(de); - val->sval = (unsigned char *)s; - val->slen = sdslen(s); + sds sds_val = hashTypeEntryGetValue(entry); + val->sval = (unsigned char *)sds_val; + val->slen = sdslen(sds_val); } } else if (hashobj->encoding == OBJ_ENCODING_LISTPACK) { - lpRandomPair(hashobj->ptr, hashsize, key, val); + lpRandomPair(hashobj->ptr, hashsize, field, val); } else { serverPanic("Unknown hash encoding"); } @@ -799,7 +860,7 @@ static void addHashIteratorCursorToReply(writePreparedClient *wpc, hashTypeItera addWritePreparedReplyBulkCBuffer(wpc, vstr, vlen); else addWritePreparedReplyBulkLongLong(wpc, vll); - } else if (hi->encoding == OBJ_ENCODING_HT) { + } else if (hi->encoding == OBJ_ENCODING_HASHTABLE) { sds value = hashTypeCurrentFromHashTable(hi, what); addWritePreparedReplyBulkCBuffer(wpc, value, sdslen(value)); } else { @@ -812,15 +873,15 @@ void genericHgetallCommand(client *c, int flags) { hashTypeIterator hi; int length, count = 0; - robj *emptyResp = (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) ? shared.emptymap[c->resp] : shared.emptyarray; + robj *emptyResp = (flags & OBJ_HASH_FIELD && flags & OBJ_HASH_VALUE) ? 
shared.emptymap[c->resp] : shared.emptyarray; if ((o = lookupKeyReadOrReply(c, c->argv[1], emptyResp)) == NULL || checkType(c, o, OBJ_HASH)) return; writePreparedClient *wpc = prepareClientForFutureWrites(c); if (!wpc) return; - /* We return a map if the user requested keys and values, like in the + /* We return a map if the user requested fields and values, like in the * HGETALL case. Otherwise to use a flat array makes more sense. */ length = hashTypeLength(o); - if (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) { + if (flags & OBJ_HASH_FIELD && flags & OBJ_HASH_VALUE) { addWritePreparedReplyMapLen(wpc, length); } else { addWritePreparedReplyArrayLen(wpc, length); @@ -828,8 +889,8 @@ void genericHgetallCommand(client *c, int flags) { hashTypeInitIterator(o, &hi); while (hashTypeNext(&hi) != C_ERR) { - if (flags & OBJ_HASH_KEY) { - addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_KEY); + if (flags & OBJ_HASH_FIELD) { + addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_FIELD); count++; } if (flags & OBJ_HASH_VALUE) { @@ -841,12 +902,12 @@ void genericHgetallCommand(client *c, int flags) { hashTypeResetIterator(&hi); /* Make sure we returned the right number of elements. 
*/ - if (flags & OBJ_HASH_KEY && flags & OBJ_HASH_VALUE) count /= 2; + if (flags & OBJ_HASH_FIELD && flags & OBJ_HASH_VALUE) count /= 2; serverAssert(count == length); } void hkeysCommand(client *c) { - genericHgetallCommand(c, OBJ_HASH_KEY); + genericHgetallCommand(c, OBJ_HASH_FIELD); } void hvalsCommand(client *c) { @@ -854,7 +915,7 @@ void hvalsCommand(client *c) { } void hgetallCommand(client *c) { - genericHgetallCommand(c, OBJ_HASH_KEY | OBJ_HASH_VALUE); + genericHgetallCommand(c, OBJ_HASH_FIELD | OBJ_HASH_VALUE); } void hexistsCommand(client *c) { @@ -873,14 +934,14 @@ void hscanCommand(client *c) { scanGenericCommand(c, o, cursor); } -static void hrandfieldReplyWithListpack(writePreparedClient *wpc, unsigned int count, listpackEntry *keys, listpackEntry *vals) { +static void hrandfieldReplyWithListpack(writePreparedClient *wpc, unsigned int count, listpackEntry *fields, listpackEntry *vals) { client *c = (client *)wpc; for (unsigned long i = 0; i < count; i++) { if (vals && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); - if (keys[i].sval) - addWritePreparedReplyBulkCBuffer(wpc, keys[i].sval, keys[i].slen); + if (fields[i].sval) + addWritePreparedReplyBulkCBuffer(wpc, fields[i].sval, fields[i].slen); else - addWritePreparedReplyBulkLongLong(wpc, keys[i].lval); + addWritePreparedReplyBulkLongLong(wpc, fields[i].lval); if (vals) { if (vals[i].sval) addWritePreparedReplyBulkCBuffer(wpc, vals[i].sval, vals[i].slen); @@ -933,32 +994,32 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { addWritePreparedReplyArrayLen(wpc, count * 2); else addWritePreparedReplyArrayLen(wpc, count); - if (hash->encoding == OBJ_ENCODING_HT) { - sds key, value; + if (hash->encoding == OBJ_ENCODING_HASHTABLE) { while (count--) { - dictEntry *de = dictGetFairRandomKey(hash->ptr); - key = dictGetKey(de); - value = dictGetVal(de); + void *entry; + hashtableFairRandomEntry(hash->ptr, &entry); + sds field = hashTypeEntryGetField(entry); + sds value = 
hashTypeEntryGetValue(entry); if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); - addWritePreparedReplyBulkCBuffer(wpc, key, sdslen(key)); + addWritePreparedReplyBulkCBuffer(wpc, field, sdslen(field)); if (withvalues) addWritePreparedReplyBulkCBuffer(wpc, value, sdslen(value)); if (c->flag.close_asap) break; } } else if (hash->encoding == OBJ_ENCODING_LISTPACK) { - listpackEntry *keys, *vals = NULL; + listpackEntry *fields, *vals = NULL; unsigned long limit, sample_count; limit = count > HRANDFIELD_RANDOM_SAMPLE_LIMIT ? HRANDFIELD_RANDOM_SAMPLE_LIMIT : count; - keys = zmalloc(sizeof(listpackEntry) * limit); + fields = zmalloc(sizeof(listpackEntry) * limit); if (withvalues) vals = zmalloc(sizeof(listpackEntry) * limit); while (count) { sample_count = count > limit ? limit : count; count -= sample_count; - lpRandomPairs(hash->ptr, sample_count, keys, vals); - hrandfieldReplyWithListpack(wpc, sample_count, keys, vals); + lpRandomPairs(hash->ptr, sample_count, fields, vals); + hrandfieldReplyWithListpack(wpc, sample_count, fields, vals); if (c->flag.close_asap) break; } - zfree(keys); + zfree(fields); zfree(vals); } return; @@ -979,7 +1040,7 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { hashTypeInitIterator(hash, &hi); while (hashTypeNext(&hi) != C_ERR) { if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); - addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_KEY); + addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_FIELD); if (withvalues) addHashIteratorCursorToReply(wpc, &hi, OBJ_HASH_VALUE); } hashTypeResetIterator(&hi); @@ -995,12 +1056,12 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { * And it is inefficient to repeatedly pick one random element from a * listpack in CASE 4. So we use this instead. 
*/ if (hash->encoding == OBJ_ENCODING_LISTPACK) { - listpackEntry *keys, *vals = NULL; - keys = zmalloc(sizeof(listpackEntry) * count); + listpackEntry *fields, *vals = NULL; + fields = zmalloc(sizeof(listpackEntry) * count); if (withvalues) vals = zmalloc(sizeof(listpackEntry) * count); - serverAssert(lpRandomPairsUnique(hash->ptr, count, keys, vals) == count); - hrandfieldReplyWithListpack(wpc, count, keys, vals); - zfree(keys); + serverAssert(lpRandomPairsUnique(hash->ptr, count, fields, vals) == count); + hrandfieldReplyWithListpack(wpc, count, fields, vals); + zfree(fields); zfree(vals); return; } @@ -1024,11 +1085,11 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { /* Add all the elements into the temporary dictionary. */ while ((hashTypeNext(&hi)) != C_ERR) { int ret = DICT_ERR; - sds key, value = NULL; + sds field, value = NULL; - key = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_KEY); + field = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_FIELD); if (withvalues) value = hashTypeCurrentObjectNewSds(&hi, OBJ_HASH_VALUE); - ret = dictAdd(d, key, value); + ret = dictAdd(d, field, value); serverAssert(ret == DICT_OK); } @@ -1051,10 +1112,10 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { dictEntry *de; di = dictGetIterator(d); while ((de = dictNext(di)) != NULL) { - sds key = dictGetKey(de); + sds field = dictGetKey(de); sds value = dictGetVal(de); if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); - addWritePreparedReplyBulkSds(wpc, key); + addWritePreparedReplyBulkSds(wpc, field); if (withvalues) addWritePreparedReplyBulkSds(wpc, value); } @@ -1069,25 +1130,25 @@ void hrandfieldWithCountCommand(client *c, long l, int withvalues) { else { /* Hashtable encoding (generic implementation) */ unsigned long added = 0; - listpackEntry key, value; + listpackEntry field, value; dict *d = dictCreate(&hashDictType); dictExpand(d, count); while (added < count) { - hashTypeRandomElement(hash, size, &key, 
withvalues ? &value : NULL); + hashTypeRandomElement(hash, size, &field, withvalues ? &value : NULL); /* Try to add the object to the dictionary. If it already exists * free it, otherwise increment the number of objects we have * in the result dictionary. */ - sds skey = hashSdsFromListpackEntry(&key); - if (dictAdd(d, skey, NULL) != DICT_OK) { - sdsfree(skey); + sds sfield = hashSdsFromListpackEntry(&field); + if (dictAdd(d, sfield, NULL) != DICT_OK) { + sdsfree(sfield); continue; } added++; /* We can reply right away, so that we don't need to store the value in the dict. */ if (withvalues && c->resp > 2) addWritePreparedReplyArrayLen(wpc, 2); - hashReplyFromListpackEntry(c, &key); + hashReplyFromListpackEntry(c, &field); if (withvalues) hashReplyFromListpackEntry(c, &value); } From 06506cd0ac934ec5ea3d99ab7e1b6f3f870ff7aa Mon Sep 17 00:00:00 2001 From: secwall Date: Tue, 14 Jan 2025 05:05:04 +0100 Subject: [PATCH 21/31] Escape unix socket group in unit tests (#1554) In some cases unix groups could have whitespace and/or `\` in them. One example is my workstation. It's a MacOS in an Active Directory domain. So my user has group `LD\Domain Users`. Running `make test` on `unstable` and `8.0` branches fails with: I'm not sure if we need to fix this in 8.0. But it seems that it should be fixed in unstable. 
Signed-off-by: secwall Signed-off-by: proost --- tests/unit/other.tcl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 225287aa3a..15c0d38136 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -568,7 +568,8 @@ if {$::verbose} { set tempFileId [open $tempFileName w] set group [dict get [file attributes $tempFileName] -group] if {$group != ""} { - start_server [list tags {"repl external:skip"} overrides [list unixsocketgroup $group unixsocketperm 744]] { + set escaped_group "\"[string map {"\\" "\\\\"} $group]\"" + start_server [list tags {"repl external:skip"} overrides [list unixsocketgroup $escaped_group unixsocketperm 744]] { test {test unixsocket options are set correctly} { set socketpath [lindex [r config get unixsocket] 1] set attributes [file attributes $socketpath] From c33a7b1b36978ef3c6bac00370f531cc5cc00179 Mon Sep 17 00:00:00 2001 From: Amit Nagler <58042354+naglera@users.noreply.github.com> Date: Tue, 14 Jan 2025 10:49:46 +0200 Subject: [PATCH 22/31] Fix valgrind test (#1555) Introduced at https://github.com/valkey-io/valkey/pull/1165/files Signed-off-by: naglera Signed-off-by: proost --- tests/integration/dual-channel-replication.tcl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 4ca70651a1..ec6783c1c9 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -511,8 +511,10 @@ start_server {tags {"dual-channel-replication external:skip"}} { $primary config set repl-diskless-sync-delay 0 if {$::valgrind} { $primary config set repl-timeout 100 + $replica config set repl-timeout 100 } else { - $primary config set repl-timeout 10 + $primary config set repl-timeout 10 + $replica config set repl-timeout 10 } $primary config set rdb-key-save-delay 200 populate 10000 primary 10000 @@ -523,11 
+525,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - if {$::valgrind} { - $primary config set repl-timeout 100 - } else { - $primary config set repl-timeout 10 - } + # Pause replica after primary fork $replica debug pause-after-fork 1 From cf8af095c8d4b4871f8bac6e2bdcc5feb79598dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Tue, 14 Jan 2025 10:38:12 +0100 Subject: [PATCH 23/31] Introduce const_sds for const-content sds (#1553) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `sds` is a typedef of `char *`. `const sds` means `char * const`, i.e. a const-pointer to non-const content. More often, you would want `const char *`, i.e. a pointer to const-content. Until now, it's not possible to express that. This PR adds `const_sds` which is a pointer to const-content sds. To get a const-pointer to const-content sds, you can use `const const_sds`. In this PR, some uses of `const sds` are replaced by `const_sds`. We can use it more later. Fixes #1542 --------- Signed-off-by: Viktor Söderqvist Signed-off-by: proost --- src/sds.c | 16 ++++++++-------- src/sds.h | 29 ++++++++++++++++++----------- src/server.c | 2 +- src/unit/test_sds.c | 2 +- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/sds.c b/src/sds.c index 97be74ea47..d956f834e5 100644 --- a/src/sds.c +++ b/src/sds.c @@ -187,19 +187,19 @@ sds sdsnew(const char *init) { } /* Duplicate an sds string. */ -sds sdsdup(const sds s) { +sds sdsdup(const_sds s) { return sdsnewlen(s, sdslen(s)); } /* * This method returns the minimum amount of bytes required to store the sds (header + data + NULL terminator). 
*/ -static inline size_t sdsminlen(const sds s) { +static inline size_t sdsminlen(const_sds s) { return sdslen(s) + sdsHdrSize(s[-1]) + 1; } /* This method copies the sds `s` into `buf` which is the target character buffer. */ -size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, const sds s, uint8_t *hdr_size) { +size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, const_sds s, uint8_t *hdr_size) { size_t required_keylen = sdsminlen(s); if (buf == NULL) { return required_keylen; @@ -432,7 +432,7 @@ sds sdsResize(sds s, size_t size, int would_regrow) { * 3) The free buffer at the end if any. * 4) The implicit null term. */ -size_t sdsAllocSize(sds s) { +size_t sdsAllocSize(const_sds s) { char type = s[-1] & SDS_TYPE_MASK; /* SDS_TYPE_5 header doesn't contain the size of the allocation */ if (type == SDS_TYPE_5) { @@ -444,7 +444,7 @@ size_t sdsAllocSize(sds s) { /* Return the pointer of the actual SDS allocation (normally SDS strings * are referenced by the start of the string buffer). */ -void *sdsAllocPtr(sds s) { +void *sdsAllocPtr(const_sds s) { return (void *)(s - sdsHdrSize(s[-1])); } @@ -559,7 +559,7 @@ sds sdscat(sds s, const char *t) { * * After the call, the modified sds string is no longer valid and all the * references must be substituted with the new pointer returned by the call. */ -sds sdscatsds(sds s, const sds t) { +sds sdscatsds(sds s, const_sds t) { return sdscatlen(s, t, sdslen(t)); } @@ -870,7 +870,7 @@ void sdstoupper(sds s) { * If two strings share exactly the same prefix, but one of the two has * additional characters, the longer string is considered to be greater than * the smaller one. */ -int sdscmp(const sds s1, const sds s2) { +int sdscmp(const_sds s1, const_sds s2) { size_t l1, l2, minlen; int cmp; @@ -996,7 +996,7 @@ sds sdscatrepr(sds s, const char *p, size_t len) { * that is compatible with sdssplitargs(). For this reason, also spaces will be * treated as needing an escape. 
*/ -int sdsneedsrepr(const sds s) { +int sdsneedsrepr(const_sds s) { size_t len = sdslen(s); const char *p = s; diff --git a/src/sds.h b/src/sds.h index e1b8531955..d6db8d19c2 100644 --- a/src/sds.h +++ b/src/sds.h @@ -39,7 +39,14 @@ extern const char *SDS_NOINIT; #include #include +/* Constness: + * + * - 'const sds' means 'char * const'. It is a const-pointer to non-const content. + * - 'const_sds' means 'const char *'. It is a non-const pointer to const content. + * - 'const const_sds' means 'const char * const', const pointer and content. */ + typedef char *sds; +typedef const char *const_sds; /* Note: sdshdr5 is never used, we just access the flags byte directly. * However is here to document the layout of type 5 SDS strings. */ @@ -83,7 +90,7 @@ struct __attribute__((__packed__)) sdshdr64 { #define SDS_HDR(T, s) ((struct sdshdr##T *)((s) - (sizeof(struct sdshdr##T)))) #define SDS_TYPE_5_LEN(f) ((f) >> SDS_TYPE_BITS) -static inline size_t sdslen(const sds s) { +static inline size_t sdslen(const_sds s) { unsigned char flags = s[-1]; switch (flags & SDS_TYPE_MASK) { case SDS_TYPE_5: return SDS_TYPE_5_LEN(flags); @@ -95,7 +102,7 @@ static inline size_t sdslen(const sds s) { return 0; } -static inline size_t sdsavail(const sds s) { +static inline size_t sdsavail(const_sds s) { unsigned char flags = s[-1]; switch (flags & SDS_TYPE_MASK) { case SDS_TYPE_5: { @@ -151,7 +158,7 @@ static inline void sdsinclen(sds s, size_t inc) { } /* sdsalloc() = sdsavail() + sdslen() */ -static inline size_t sdsalloc(const sds s) { +static inline size_t sdsalloc(const_sds s) { unsigned char flags = s[-1]; switch (flags & SDS_TYPE_MASK) { case SDS_TYPE_5: return SDS_TYPE_5_LEN(flags); @@ -180,14 +187,14 @@ sds sdsnewlen(const void *init, size_t initlen); sds sdstrynewlen(const void *init, size_t initlen); sds sdsnew(const char *init); sds sdsempty(void); -sds sdsdup(const sds s); -size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size); +sds 
sdsdup(const_sds s); +size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, const_sds s, uint8_t *hdr_size); void sdsfree(sds s); void sdsfreeVoid(void *s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); -sds sdscatsds(sds s, const sds t); +sds sdscatsds(sds s, const_sds t); sds sdscpylen(sds s, const char *t, size_t len); sds sdscpy(sds s, const char *t); @@ -204,7 +211,7 @@ void sdssubstr(sds s, size_t start, size_t len); void sdsrange(sds s, ssize_t start, ssize_t end); void sdsupdatelen(sds s); void sdsclear(sds s); -int sdscmp(const sds s1, const sds s2); +int sdscmp(const_sds s1, const_sds s2); sds *sdssplitlen(const char *s, ssize_t len, const char *sep, int seplen, int *count); void sdsfreesplitres(sds *tokens, int count); void sdstolower(sds s); @@ -215,14 +222,14 @@ sds *sdssplitargs(const char *line, int *argc); sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen); sds sdsjoin(char **argv, int argc, char *sep); sds sdsjoinsds(sds *argv, int argc, const char *sep, size_t seplen); -int sdsneedsrepr(const sds s); +int sdsneedsrepr(const_sds s); /* Callback for sdstemplate. The function gets called by sdstemplate * every time a variable needs to be expanded. The variable name is * provided as variable, and the callback is expected to return a * substitution value. Returning a NULL indicates an error. 
*/ -typedef sds (*sdstemplate_callback_t)(const sds variable, void *arg); +typedef sds (*sdstemplate_callback_t)(const_sds variable, void *arg); sds sdstemplate(const char *template, sdstemplate_callback_t cb_func, void *cb_arg); /* Low level functions exposed to the user API */ @@ -231,8 +238,8 @@ sds sdsMakeRoomForNonGreedy(sds s, size_t addlen); void sdsIncrLen(sds s, ssize_t incr); sds sdsRemoveFreeSpace(sds s, int would_regrow); sds sdsResize(sds s, size_t size, int would_regrow); -size_t sdsAllocSize(sds s); -void *sdsAllocPtr(sds s); +size_t sdsAllocSize(const_sds s); +void *sdsAllocPtr(const_sds s); /* Export the allocator used by SDS to the program using SDS. * Sometimes the program SDS is linked to, may use a different set of diff --git a/src/server.c b/src/server.c index 66101ea839..5fb96c60b2 100644 --- a/src/server.c +++ b/src/server.c @@ -6698,7 +6698,7 @@ void serverOutOfMemoryHandler(size_t allocation_size) { /* Callback for sdstemplate on proc-title-template. See valkey.conf for * supported variables. 
*/ -static sds serverProcTitleGetVariable(const sds varname, void *arg) { +static sds serverProcTitleGetVariable(const_sds varname, void *arg) { if (!strcmp(varname, "title")) { return sdsnew(arg); } else if (!strcmp(varname, "listen-addr")) { diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c index 30f25e4f6f..7972963892 100644 --- a/src/unit/test_sds.c +++ b/src/unit/test_sds.c @@ -6,7 +6,7 @@ #include "../sds.h" #include "../sdsalloc.h" -static sds sdsTestTemplateCallback(sds varname, void *arg) { +static sds sdsTestTemplateCallback(const_sds varname, void *arg) { UNUSED(arg); static const char *_var1 = "variable1"; static const char *_var2 = "variable2"; From 32c57b5680615ce7979e6159a44d2d4b7197c06c Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Tue, 14 Jan 2025 19:01:00 +0800 Subject: [PATCH 24/31] add paused_actions for INFO Clients (#1519) Add `paused_actions` and `paused_timeout_milliseconds` for INFO Clients to inform users about if clients are paused. --------- Signed-off-by: zhaozhao.zz Signed-off-by: proost --- src/networking.c | 10 ++++++++++ src/server.c | 15 ++++++++++++++- src/server.h | 1 + tests/unit/pause.tcl | 19 +++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/src/networking.c b/src/networking.c index 339cd304d4..48e397e6f4 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4520,6 +4520,16 @@ void flushReplicasOutputBuffers(void) { } } +mstime_t getPausedActionTimeout(uint32_t action) { + mstime_t timeout = 0; + for (int i = 0; i < NUM_PAUSE_PURPOSES; i++) { + pause_event *p = &(server.client_pause_per_purpose[i]); + if (p->paused_actions & action && (p->end - server.mstime) > timeout) + timeout = p->end - server.mstime; + } + return timeout; +} + /* Compute current paused actions and its end time, aggregated for * all pause purposes. 
*/ void updatePausedActions(void) { diff --git a/src/server.c b/src/server.c index 5fb96c60b2..20d6b03517 100644 --- a/src/server.c +++ b/src/server.c @@ -5669,6 +5669,17 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { unsigned long blocking_keys, blocking_keys_on_nokey, watched_keys; getExpansiveClientsInfo(&maxin, &maxout); totalNumberOfStatefulKeys(&blocking_keys, &blocking_keys_on_nokey, &watched_keys); + + char *paused_actions = "none"; + long long paused_timeout = 0; + if (server.paused_actions & PAUSE_ACTION_CLIENT_ALL) { + paused_actions = "all"; + paused_timeout = getPausedActionTimeout(PAUSE_ACTION_CLIENT_ALL); + } else if (server.paused_actions & PAUSE_ACTION_CLIENT_WRITE) { + paused_actions = "write"; + paused_timeout = getPausedActionTimeout(PAUSE_ACTION_CLIENT_WRITE); + } + if (sections++) info = sdscat(info, "\r\n"); info = sdscatprintf( info, @@ -5685,7 +5696,9 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "clients_in_timeout_table:%llu\r\n", (unsigned long long)raxSize(server.clients_timeout_table), "total_watched_keys:%lu\r\n", watched_keys, "total_blocking_keys:%lu\r\n", blocking_keys, - "total_blocking_keys_on_nokey:%lu\r\n", blocking_keys_on_nokey)); + "total_blocking_keys_on_nokey:%lu\r\n", blocking_keys_on_nokey, + "paused_actions:%s\r\n", paused_actions, + "paused_timeout_milliseconds:%lld\r\n", paused_timeout)); } /* Memory */ diff --git a/src/server.h b/src/server.h index 6d87b46aed..20fc802bef 100644 --- a/src/server.h +++ b/src/server.h @@ -2716,6 +2716,7 @@ void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions); void unpauseActions(pause_purpose purpose); uint32_t isPausedActions(uint32_t action_bitmask); uint32_t isPausedActionsWithUpdate(uint32_t action_bitmask); +mstime_t getPausedActionTimeout(uint32_t action); void updatePausedActions(void); void unblockPostponedClients(void); void processEventsWhileBlocked(void); diff --git 
a/tests/unit/pause.tcl b/tests/unit/pause.tcl index b18a32d48f..9697c3b44e 100644 --- a/tests/unit/pause.tcl +++ b/tests/unit/pause.tcl @@ -1,4 +1,23 @@ start_server {tags {"pause network"}} { + test "Test check paused_actions in info stats" { + assert_equal [s paused_actions] "none" + assert_equal [s paused_timeout_milliseconds] 0 + + r client PAUSE 10000 WRITE + assert_equal [s paused_actions] "write" + after 1000 + set timeout [s paused_timeout_milliseconds] + assert {$timeout > 0 && $timeout < 9000} + r client unpause + + r multi + r client PAUSE 1000 ALL + r info clients + assert_match "*paused_actions:all*" [r exec] + + r client unpause + } + test "Test read commands are not blocked by client pause" { r client PAUSE 100000 WRITE set rd [valkey_deferring_client] From 040dbe41a7fd866ba5c1782f7c1d805ee1a12f71 Mon Sep 17 00:00:00 2001 From: Sarthak Aggarwal Date: Wed, 15 Jan 2025 11:44:13 -0800 Subject: [PATCH 25/31] Adding Missing filters to CLIENT LIST and Dedup Parsing (#1401) Adds filter options to CLIENT LIST: * USER Return clients authenticated by . * ADDR Return clients connected from the specified address. * LADDR Return clients connected to the specified local address. * SKIPME (YES|NO) Exclude the current client from the list (default: no). * MAXAGE Only list connections older than the specified age. Modifies the ID filter to CLIENT KILL to allow multiple IDs * ID [...] Kill connections by client ids. This makes CLIENT LIST and CLIENT KILL accept the same options. For backward compatibility, the default value for SKIPME is NO for CLIENT LIST and YES for CLIENT KILL. The MAXAGE comes from CLIENT KILL, where it *keeps* clients with the given max age and kills the older ones. This logic becomes weird for CLIENT LIST, but is kept for similary with CLIENT KILL, for the use case of first testing manually using CLIENT LIST, and then running CLIENT KILL with the same filters. The `ID client-id [client-id ...]` no longer needs to be the last filter. 
The parsing logic determines if an argument is an ID or not based on whether it can be parsed as an integer or not. Partly addresses: #668 --------- Signed-off-by: Sarthak Aggarwal Signed-off-by: proost --- src/commands.def | 55 +- src/commands/client-caching.json | 2 +- src/commands/client-capa.json | 2 +- src/commands/client-getname.json | 2 +- src/commands/client-getredir.json | 2 +- src/commands/client-help.json | 2 +- src/commands/client-id.json | 2 +- src/commands/client-import-source.json | 2 +- src/commands/client-info.json | 2 +- src/commands/client-kill.json | 7 +- src/commands/client-list.json | 55 +- src/commands/client-no-evict.json | 2 +- src/commands/client-no-touch.json | 2 +- src/commands/client-pause.json | 2 +- src/commands/client-reply.json | 2 +- src/commands/client-setname.json | 2 +- src/commands/client-tracking.json | 2 +- src/commands/client-trackinginfo.json | 2 +- src/commands/client-unblock.json | 2 +- src/commands/client-unpause.json | 2 +- src/commands/client.json | 1 + src/networking.c | 1109 +++++++++++++----------- src/server.h | 19 + tests/unit/introspection.tcl | 122 ++- 24 files changed, 867 insertions(+), 535 deletions(-) diff --git a/src/commands.def b/src/commands.def index c5d766e3f8..cd919a80e1 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1289,6 +1289,7 @@ commandHistory CLIENT_KILL_History[] = { {"6.2.0","`LADDR` option."}, {"8.0.0","`MAXAGE` option."}, {"8.0.0","Replaced `master` `TYPE` with `primary`. 
`master` still supported for backward compatibility."}, +{"8.1.0","`ID` option accepts multiple IDs."}, }; #endif @@ -1320,7 +1321,7 @@ struct COMMAND_ARG CLIENT_KILL_filter_new_format_skipme_Subargs[] = { /* CLIENT KILL filter new_format argument table */ struct COMMAND_ARG CLIENT_KILL_filter_new_format_Subargs[] = { -{MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"2.8.12",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"2.8.12",CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,0,NULL)}, {MAKE_ARG("client-type",ARG_TYPE_ONEOF,-1,"TYPE",NULL,"2.8.12",CMD_ARG_OPTIONAL,6,NULL),.subargs=CLIENT_KILL_filter_new_format_client_type_Subargs}, {MAKE_ARG("username",ARG_TYPE_STRING,-1,"USER",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("addr",ARG_TYPE_STRING,-1,"ADDR",NULL,NULL,CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, @@ -1352,6 +1353,7 @@ commandHistory CLIENT_LIST_History[] = { {"7.0.0","Added `resp`, `multi-mem`, `rbs` and `rbp` fields."}, {"7.0.3","Added `ssub` field."}, {"8.0.0","Replaced `master` `TYPE` with `primary`. 
`master` still supported for backward compatibility."}, +{"8.1.0","Added filters USER, ADDR, LADDR, SKIPME, and MAXAGE"}, }; #endif @@ -1375,10 +1377,21 @@ struct COMMAND_ARG CLIENT_LIST_client_type_Subargs[] = { {MAKE_ARG("pubsub",ARG_TYPE_PURE_TOKEN,-1,"PUBSUB",NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/* CLIENT LIST skipme argument table */ +struct COMMAND_ARG CLIENT_LIST_skipme_Subargs[] = { +{MAKE_ARG("yes",ARG_TYPE_PURE_TOKEN,-1,"YES",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("no",ARG_TYPE_PURE_TOKEN,-1,"NO",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + /* CLIENT LIST argument table */ struct COMMAND_ARG CLIENT_LIST_Args[] = { {MAKE_ARG("client-type",ARG_TYPE_ONEOF,-1,"TYPE",NULL,"5.0.0",CMD_ARG_OPTIONAL,4,NULL),.subargs=CLIENT_LIST_client_type_Subargs}, {MAKE_ARG("client-id",ARG_TYPE_INTEGER,-1,"ID",NULL,"6.2.0",CMD_ARG_OPTIONAL|CMD_ARG_MULTIPLE,0,NULL)}, +{MAKE_ARG("username",ARG_TYPE_STRING,-1,"USER",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("addr",ARG_TYPE_STRING,-1,"ADDR",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, +{MAKE_ARG("laddr",ARG_TYPE_STRING,-1,"LADDR",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL),.display_text="ip:port"}, +{MAKE_ARG("skipme",ARG_TYPE_ONEOF,-1,"SKIPME",NULL,"8.1.0",CMD_ARG_OPTIONAL,2,NULL),.subargs=CLIENT_LIST_skipme_Subargs}, +{MAKE_ARG("maxage",ARG_TYPE_INTEGER,-1,"MAXAGE",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /********** CLIENT NO_EVICT ********************/ @@ -1652,26 +1665,26 @@ struct COMMAND_ARG CLIENT_UNBLOCK_Args[] = { /* CLIENT command table */ struct COMMAND_STRUCT CLIENT_Subcommands[] = { -{MAKE_CMD("caching","Instructs the server whether to track the keys in the next request.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CACHING_History,0,CLIENT_CACHING_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_CACHING_Keyspecs,0,NULL,1),.args=CLIENT_CACHING_Args}, -{MAKE_CMD("capa","A client claims its 
capability.","O(1)","8.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CAPA_History,0,CLIENT_CAPA_Tips,0,clientCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_CAPA_Keyspecs,0,NULL,1),.args=CLIENT_CAPA_Args}, -{MAKE_CMD("getname","Returns the name of the connection.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETNAME_History,0,CLIENT_GETNAME_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETNAME_Keyspecs,0,NULL,0)}, -{MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, -{MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, -{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, -{MAKE_CMD("info","Returns information about the 
connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, -{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,7,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, -{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,7,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, -{MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, -{MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, -{MAKE_CMD("pause","Suspends commands 
processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, -{MAKE_CMD("reply","Instructs the server whether to reply to commands.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_REPLY_History,0,CLIENT_REPLY_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_REPLY_Keyspecs,0,NULL,1),.args=CLIENT_REPLY_Args}, +{MAKE_CMD("caching","Instructs the server whether to track the keys in the next request.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CACHING_History,0,CLIENT_CACHING_Tips,0,clientCachingCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_CACHING_Keyspecs,0,NULL,1),.args=CLIENT_CACHING_Args}, +{MAKE_CMD("capa","A client claims its capability.","O(1)","8.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_CAPA_History,0,CLIENT_CAPA_Tips,0,clientCapaCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_CAPA_Keyspecs,0,NULL,1),.args=CLIENT_CAPA_Args}, +{MAKE_CMD("getname","Returns the name of the connection.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETNAME_History,0,CLIENT_GETNAME_Tips,0,clientGetNameCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETNAME_Keyspecs,0,NULL,0)}, +{MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientGetredirCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, 
+{MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientHelpCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, +{MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientIDCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientImportSourceCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, +{MAKE_CMD("info","Returns information about the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientInfoCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, +{MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,8,CLIENT_KILL_Tips,0,clientKillCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, +{MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client 
connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,8,CLIENT_LIST_Tips,1,clientListCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,7),.args=CLIENT_LIST_Args}, +{MAKE_CMD("no-evict","Sets the client eviction mode of the connection.","O(1)","7.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_EVICT_History,0,CLIENT_NO_EVICT_Tips,0,clientNoEvictCommand,3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_NO_EVICT_Keyspecs,0,NULL,1),.args=CLIENT_NO_EVICT_Args}, +{MAKE_CMD("no-touch","Controls whether commands sent by the client affect the LRU/LFU of accessed keys.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_NO_TOUCH_History,0,CLIENT_NO_TOUCH_Tips,0,clientNoTouchCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_NO_TOUCH_Keyspecs,0,NULL,1),.args=CLIENT_NO_TOUCH_Args}, +{MAKE_CMD("pause","Suspends commands processing.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_PAUSE_History,1,CLIENT_PAUSE_Tips,0,clientPauseCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_PAUSE_Keyspecs,0,NULL,2),.args=CLIENT_PAUSE_Args}, +{MAKE_CMD("reply","Instructs the server whether to reply to commands.","O(1)","3.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_REPLY_History,0,CLIENT_REPLY_Tips,0,clientReplyCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_REPLY_Keyspecs,0,NULL,1),.args=CLIENT_REPLY_Args}, {MAKE_CMD("setinfo","Sets information specific to the client or 
connection.","O(1)","7.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETINFO_History,0,CLIENT_SETINFO_Tips,2,clientSetinfoCommand,4,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETINFO_Keyspecs,0,NULL,1),.args=CLIENT_SETINFO_Args}, -{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,2,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, -{MAKE_CMD("tracking","Controls server-assisted client-side caching for the connection.","O(1). Some options may introduce additional complexity.","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKING_History,0,CLIENT_TRACKING_Tips,0,clientCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKING_Keyspecs,0,NULL,7),.args=CLIENT_TRACKING_Args}, -{MAKE_CMD("trackinginfo","Returns information about server-assisted client-side caching for the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKINGINFO_History,0,CLIENT_TRACKINGINFO_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKINGINFO_Keyspecs,0,NULL,0)}, -{MAKE_CMD("unblock","Unblocks a client blocked by a blocking command from a different connection.","O(log N) where N is the number of client connections","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNBLOCK_History,0,CLIENT_UNBLOCK_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNBLOCK_Keyspecs,0,NULL,2),.args=CLIENT_UNBLOCK_Args}, -{MAKE_CMD("unpause","Resumes processing commands from paused clients.","O(N) Where N is the number of paused 
clients","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNPAUSE_History,0,CLIENT_UNPAUSE_Tips,0,clientCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNPAUSE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("setname","Sets the connection name.","O(1)","2.6.9",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_SETNAME_History,0,CLIENT_SETNAME_Tips,2,clientSetNameCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_SETNAME_Keyspecs,0,NULL,1),.args=CLIENT_SETNAME_Args}, +{MAKE_CMD("tracking","Controls server-assisted client-side caching for the connection.","O(1). Some options may introduce additional complexity.","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKING_History,0,CLIENT_TRACKING_Tips,0,clientTrackingCommand,-3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKING_Keyspecs,0,NULL,7),.args=CLIENT_TRACKING_Args}, +{MAKE_CMD("trackinginfo","Returns information about server-assisted client-side caching for the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_TRACKINGINFO_History,0,CLIENT_TRACKINGINFO_Tips,0,clientTrackingInfoCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_TRACKINGINFO_Keyspecs,0,NULL,0)}, +{MAKE_CMD("unblock","Unblocks a client blocked by a blocking command from a different connection.","O(log N) where N is the number of client connections","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNBLOCK_History,0,CLIENT_UNBLOCK_Tips,0,clientUnblockCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNBLOCK_Keyspecs,0,NULL,2),.args=CLIENT_UNBLOCK_Args}, +{MAKE_CMD("unpause","Resumes processing commands from paused clients.","O(N) Where N is the number of paused 
clients","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_UNPAUSE_History,0,CLIENT_UNPAUSE_Tips,0,clientUnpauseCommand,2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_UNPAUSE_Keyspecs,0,NULL,0)}, {0} }; @@ -10910,7 +10923,7 @@ struct COMMAND_STRUCT serverCommandTable[] = { {MAKE_CMD("readwrite","Enables read-write queries for a connection to a Valkey replica node.","O(1)","3.0.0",CMD_DOC_NONE,NULL,NULL,"cluster",COMMAND_GROUP_CLUSTER,READWRITE_History,0,READWRITE_Tips,0,readwriteCommand,1,CMD_FAST|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,READWRITE_Keyspecs,0,NULL,0)}, /* connection */ {MAKE_CMD("auth","Authenticates the connection.","O(N) where N is the number of passwords defined for the user","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,AUTH_History,1,AUTH_Tips,0,authCommand,-2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,AUTH_Keyspecs,0,NULL,2),.args=AUTH_Args}, -{MAKE_CMD("client","A container for client connection commands.","Depends on subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,NULL,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, +{MAKE_CMD("client","A container for client connection commands.","Depends on subcommand.","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_History,0,CLIENT_Tips,0,clientCommand,-2,CMD_SENTINEL,0,CLIENT_Keyspecs,0,NULL,0),.subcommands=CLIENT_Subcommands}, {MAKE_CMD("echo","Returns the given string.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,ECHO_History,0,ECHO_Tips,0,echoCommand,2,CMD_LOADING|CMD_STALE|CMD_FAST,ACL_CATEGORY_CONNECTION,ECHO_Keyspecs,0,NULL,1),.args=ECHO_Args}, {MAKE_CMD("hello","Handshakes with the 
server.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,HELLO_History,2,HELLO_Tips,0,helloCommand,-1,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_FAST|CMD_NO_AUTH|CMD_SENTINEL|CMD_ALLOW_BUSY,ACL_CATEGORY_CONNECTION,HELLO_Keyspecs,0,NULL,1),.args=HELLO_Args}, {MAKE_CMD("ping","Returns the server's liveliness response.","O(1)","1.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,PING_History,0,PING_Tips,2,pingCommand,-1,CMD_FAST|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,PING_Keyspecs,0,NULL,1),.args=PING_Args}, diff --git a/src/commands/client-caching.json b/src/commands/client-caching.json index 2a4ae891db..d661492f45 100644 --- a/src/commands/client-caching.json +++ b/src/commands/client-caching.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientCachingCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-capa.json b/src/commands/client-capa.json index 3c16cd44f9..0d0f577f94 100644 --- a/src/commands/client-capa.json +++ b/src/commands/client-capa.json @@ -6,7 +6,7 @@ "since": "8.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientCapaCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-getname.json b/src/commands/client-getname.json index 9e237af849..e13db064b7 100644 --- a/src/commands/client-getname.json +++ b/src/commands/client-getname.json @@ -6,7 +6,7 @@ "since": "2.6.9", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientGetNameCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-getredir.json b/src/commands/client-getredir.json index 6fdb002dc8..3df1df6b6f 100644 --- a/src/commands/client-getredir.json +++ b/src/commands/client-getredir.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientGetredirCommand", 
"command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-help.json b/src/commands/client-help.json index b49294c9ee..ae771d52ae 100644 --- a/src/commands/client-help.json +++ b/src/commands/client-help.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientHelpCommand", "command_flags": [ "LOADING", "STALE", diff --git a/src/commands/client-id.json b/src/commands/client-id.json index 7c2bf08200..f6131250dd 100644 --- a/src/commands/client-id.json +++ b/src/commands/client-id.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientIDCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-import-source.json b/src/commands/client-import-source.json index 113c07d70a..dd5ef65e77 100644 --- a/src/commands/client-import-source.json +++ b/src/commands/client-import-source.json @@ -6,7 +6,7 @@ "since": "8.1.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientImportSourceCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-info.json b/src/commands/client-info.json index f974da437b..afda2ca967 100644 --- a/src/commands/client-info.json +++ b/src/commands/client-info.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientInfoCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-kill.json b/src/commands/client-kill.json index 97fa932cd8..0ae3579534 100644 --- a/src/commands/client-kill.json +++ b/src/commands/client-kill.json @@ -6,7 +6,7 @@ "since": "2.4.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientKillCommand", "history": [ [ "2.8.12", @@ -35,6 +35,10 @@ [ "8.0.0", "Replaced `master` `TYPE` with `primary`. `master` still supported for backward compatibility." 
+ ], + [ + "8.1.0", + "`ID` option accepts multiple IDs." ] ], "command_flags": [ @@ -68,6 +72,7 @@ "name": "client-id", "type": "integer", "optional": true, + "multiple": true, "since": "2.8.12" }, { diff --git a/src/commands/client-list.json b/src/commands/client-list.json index d9c0054e60..05e4de2419 100644 --- a/src/commands/client-list.json +++ b/src/commands/client-list.json @@ -6,7 +6,7 @@ "since": "2.4.0", "arity": -2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientListCommand", "history": [ [ "2.8.12", @@ -35,6 +35,10 @@ [ "8.0.0", "Replaced `master` `TYPE` with `primary`. `master` still supported for backward compatibility." + ], + [ + "8.1.0", + "Added filters USER, ADDR, LADDR, SKIPME, and MAXAGE" ] ], "command_flags": [ @@ -91,6 +95,55 @@ "optional": true, "multiple": true, "since": "6.2.0" + }, + { + "token": "USER", + "name": "username", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "ADDR", + "name": "addr", + "display": "ip:port", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "LADDR", + "name": "laddr", + "display": "ip:port", + "type": "string", + "optional": true, + "since": "8.1.0" + }, + { + "token": "SKIPME", + "name": "skipme", + "type": "oneof", + "optional": true, + "since": "8.1.0", + "arguments": [ + { + "name": "yes", + "type": "pure-token", + "token": "YES" + }, + { + "name": "no", + "type": "pure-token", + "token": "NO" + } + ] + }, + { + "token": "MAXAGE", + "name": "maxage", + "type": "integer", + "optional": true, + "since": "8.1.0" } ] } diff --git a/src/commands/client-no-evict.json b/src/commands/client-no-evict.json index 9ed6718405..710f8a97f9 100644 --- a/src/commands/client-no-evict.json +++ b/src/commands/client-no-evict.json @@ -6,7 +6,7 @@ "since": "7.0.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientNoEvictCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git 
a/src/commands/client-no-touch.json b/src/commands/client-no-touch.json index 4cf7b72416..4196770a2e 100644 --- a/src/commands/client-no-touch.json +++ b/src/commands/client-no-touch.json @@ -6,7 +6,7 @@ "since": "7.2.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientNoTouchCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-pause.json b/src/commands/client-pause.json index b1dd7bc478..54faf796c2 100644 --- a/src/commands/client-pause.json +++ b/src/commands/client-pause.json @@ -6,7 +6,7 @@ "since": "3.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientPauseCommand", "history": [ [ "6.2.0", diff --git a/src/commands/client-reply.json b/src/commands/client-reply.json index 9406de85cf..8d2b713a69 100644 --- a/src/commands/client-reply.json +++ b/src/commands/client-reply.json @@ -6,7 +6,7 @@ "since": "3.2.0", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientReplyCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-setname.json b/src/commands/client-setname.json index b071bd18ff..f544dc6a0f 100644 --- a/src/commands/client-setname.json +++ b/src/commands/client-setname.json @@ -6,7 +6,7 @@ "since": "2.6.9", "arity": 3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientSetNameCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-tracking.json b/src/commands/client-tracking.json index 2c3768c2fb..1acf84fafc 100644 --- a/src/commands/client-tracking.json +++ b/src/commands/client-tracking.json @@ -6,7 +6,7 @@ "since": "6.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientTrackingCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-trackinginfo.json b/src/commands/client-trackinginfo.json index 270a3d5e6e..78ba8201d7 100644 --- 
a/src/commands/client-trackinginfo.json +++ b/src/commands/client-trackinginfo.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientTrackingInfoCommand", "command_flags": [ "NOSCRIPT", "LOADING", diff --git a/src/commands/client-unblock.json b/src/commands/client-unblock.json index d391ede9e9..2173173f40 100644 --- a/src/commands/client-unblock.json +++ b/src/commands/client-unblock.json @@ -6,7 +6,7 @@ "since": "5.0.0", "arity": -3, "container": "CLIENT", - "function": "clientCommand", + "function": "clientUnblockCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git a/src/commands/client-unpause.json b/src/commands/client-unpause.json index 6c55210d2a..bb78fb848b 100644 --- a/src/commands/client-unpause.json +++ b/src/commands/client-unpause.json @@ -6,7 +6,7 @@ "since": "6.2.0", "arity": 2, "container": "CLIENT", - "function": "clientCommand", + "function": "clientUnpauseCommand", "command_flags": [ "ADMIN", "NOSCRIPT", diff --git a/src/commands/client.json b/src/commands/client.json index b50996128e..116fb4d4a2 100644 --- a/src/commands/client.json +++ b/src/commands/client.json @@ -4,6 +4,7 @@ "complexity": "Depends on subcommand.", "group": "connection", "since": "2.4.0", + "function": "clientCommand", "arity": -2, "command_flags": [ "SENTINEL" diff --git a/src/networking.c b/src/networking.c index 48e397e6f4..0ff0fed4c0 100644 --- a/src/networking.c +++ b/src/networking.c @@ -31,6 +31,7 @@ #include "cluster.h" #include "cluster_slot_stats.h" #include "script.h" +#include "intset.h" #include "sds.h" #include "fpconv_dtoa.h" #include "fmtargs.h" @@ -43,10 +44,35 @@ #include #include +/* This struct is used to encapsulate filtering criteria for operations on clients + * such as identifying specific clients to kill or retrieve. Each field in the struct + * represents a filter that can be applied based on specific attributes of a client. 
*/ +typedef struct { + /* A set of client IDs to filter. If NULL, no ID filtering is applied. */ + intset *ids; + /* Maximum age (in seconds) of a client connection for filtering. + * Connections younger than this value will not match. + * A value of 0 means no age filtering. */ + long long max_age; + /* Address/port of the client. If NULL, no address filtering is applied. */ + char *addr; + /* Remote address/port of the client. If NULL, no address filtering is applied. */ + char *laddr; + /* Filtering clients by authentication user. If NULL, no user-based filtering is applied. */ + user *user; + /* Client type to filter. If set to -1, no type filtering is applied. */ + int type; + /* Boolean flag to determine if the current client (`me`) should be filtered. 1 means "skip me", 0 means otherwise. */ + int skipme; +} clientFilter; + static void setProtocolError(const char *errstr, client *c); static void pauseClientsByClient(mstime_t end, int isPauseClientAll); int postponeClientRead(client *c); char *getClientSockname(client *c); +static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter); +static int clientMatchesFilter(client *client, clientFilter client_filter); +static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data); int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ __thread sds thread_shared_qb = NULL; @@ -2451,6 +2477,7 @@ int handleClientsWithPendingWrites(void) { /* resetClient prepare the client to process the next command */ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; + serverCommandProc *prevParentCmd = c->cmd && c->cmd->parent ? c->cmd->parent->proc : NULL; freeClientArgv(c); freeClientOriginalArgv(c); @@ -2480,7 +2507,7 @@ void resetClient(client *c) { /* We do the same for the CACHING command as well. It also affects * the next command or transaction executed, in a way very similar * to ASKING. 
*/ - if (!c->flag.multi && prevcmd != clientCommand) c->flag.tracking_caching = 0; + if (!c->flag.multi && prevParentCmd != clientCommand) c->flag.tracking_caching = 0; /* Remove the CLIENT_REPLY_SKIP flag if any so that the reply * to the next command will be sent, but set the flag if the command @@ -3354,6 +3381,22 @@ sds getAllClientsInfoString(int type, int hide_user_data) { return o; } +static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data) { + listNode *ln; + listIter li; + client *client; + sds o = sdsempty(); + sdsclear(o); + listRewind(server.clients, &li); + while ((ln = listNext(&li)) != NULL) { + client = listNodeValue(ln); + if (!clientMatchesFilter(client, *client_filter)) continue; + o = catClientInfoString(o, client, hide_user_data); + o = sdscatlen(o, "\n", 1); + } + return o; +} + /* Check validity of an attribute that's gonna be shown in CLIENT LIST. */ int validateClientAttr(const char *val) { /* Check if the charset is ok. We need to do this otherwise @@ -3473,570 +3516,648 @@ void quitCommand(client *c) { c->flag.close_after_reply = 1; } -void clientCommand(client *c) { - listNode *ln; - listIter li; +static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter) { + while (index < c->argc) { + int moreargs = c->argc > index + 1; - if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr, "help")) { - const char *help[] = { - "CACHING (YES|NO)", - " Enable/disable tracking of the keys for next command in OPTIN/OPTOUT modes.", - "CAPA