Skip to content

Commit

Permalink
Configurable cluster blacklist TTL (#738)
Browse files Browse the repository at this point in the history
Allows cluster admins to configure the blacklist TTL as needed to allow
sufficient time for `CLUSTER FORGET` to be executed on every node in the
cluster.

Config name `cluster-blacklist-ttl`; unit seconds; default 60.

---------

Signed-off-by: Brennan Cathcart <[email protected]>
  • Loading branch information
BCathcart authored Jul 13, 2024
1 parent b4ac2c4 commit 34649bd
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 7 deletions.
9 changes: 3 additions & 6 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -1842,14 +1842,14 @@ void clusterHandleConfigEpochCollision(clusterNode *sender) {
*
* The nodes blacklist is just a way to ensure that a given node with a given
* Node ID is not re-added before some time elapsed (this time is specified
* in seconds in CLUSTER_BLACKLIST_TTL).
* in seconds by the configurable cluster-blacklist-ttl).
*
* This is useful when we want to remove a node from the cluster completely:
* when CLUSTER FORGET is called, it also puts the node into the blacklist so
* that even if we receive gossip messages from other nodes that still remember
* about the node we want to remove, we don't re-add it before some time.
*
* Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
* The default blacklist ttl is 1 minute which means
* that valkey-cli has 60 seconds to send CLUSTER FORGET messages to nodes
* in the cluster without dealing with the problem of other nodes re-adding
* back the node to nodes we already sent the FORGET command to.
Expand All @@ -1859,9 +1859,6 @@ void clusterHandleConfigEpochCollision(clusterNode *sender) {
* value.
* -------------------------------------------------------------------------- */

#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */


/* Before each addNode() or Exists() operation we always remove expired
* entries from the black list. This is an O(N) operation but it is not a
* problem since add / exists operations are called very infrequently and
Expand Down Expand Up @@ -1893,7 +1890,7 @@ void clusterBlacklistAddNode(clusterNode *node) {
id = sdsdup(id);
}
de = dictFind(server.cluster->nodes_black_list, id);
dictSetUnsignedIntegerVal(de, time(NULL) + CLUSTER_BLACKLIST_TTL);
dictSetUnsignedIntegerVal(de, time(NULL) + server.cluster_blacklist_ttl);
sdsfree(id);
}

Expand Down
1 change: 1 addition & 0 deletions src/config.c
Original file line number Diff line number Diff line change
Expand Up @@ -3211,6 +3211,7 @@ standardConfig static_configs[] = {
createULongConfig("active-defrag-max-scan-fields", NULL, MODIFIABLE_CONFIG, 1, LONG_MAX, server.active_defrag_max_scan_fields, 1000, INTEGER_CONFIG, NULL, NULL), /* Default: keys with more than 1000 fields will be processed separately */
createULongConfig("slowlog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.slowlog_max_len, 128, INTEGER_CONFIG, NULL, NULL),
createULongConfig("acllog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.acllog_max_len, 128, INTEGER_CONFIG, NULL, NULL),
createULongConfig("cluster-blacklist-ttl", NULL, MODIFIABLE_CONFIG, 0, ULONG_MAX, server.cluster_blacklist_ttl, 60, INTEGER_CONFIG, NULL, NULL),

/* Long Long configs */
createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL), /* milliseconds */
Expand Down
2 changes: 2 additions & 0 deletions src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -2079,6 +2079,8 @@ struct valkeyServer {
unsigned long long cluster_link_msg_queue_limit_bytes; /* Memory usage limit on individual link msg queue */
int cluster_drop_packet_filter; /* Debug config that allows tactically
* dropping packets of a specific type */
unsigned long cluster_blacklist_ttl; /* Duration in seconds that a node is denied re-entry into
* the cluster after it is forgotten with CLUSTER FORGET. */
/* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */
uint32_t debug_cluster_close_link_on_packet_drop : 1;
sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */
Expand Down
12 changes: 11 additions & 1 deletion valkey.conf
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,16 @@ aof-timestamp-enabled no
#
# cluster-preferred-endpoint-type ip

# The cluster blacklist is used when removing a node from the cluster completely.
# When CLUSTER FORGET is called for a node, that node is put into the blacklist for
# some time so that when gossip messages are received from other nodes that still
# remember it, it is not re-added. This gives time for CLUSTER FORGET to be sent to
# every node in the cluster. The blacklist TTL is 60 seconds by default, which should
# be sufficient for most clusters, but you may consider increasing this if you see
# nodes getting re-added while using CLUSTER FORGET.
#
# cluster-blacklist-ttl 60

# In order to setup your cluster make sure to read the documentation
# available at https://valkey.io web site.

Expand Down Expand Up @@ -2321,4 +2331,4 @@ jemalloc-bg-thread yes
# this is only exposed via the info command for clients to use, but in the future we
# may also use this when making decisions for replication.
#
# availability-zone "zone-name"
# availability-zone "zone-name"

0 comments on commit 34649bd

Please sign in to comment.