From e60990e5798a4a9a943aa444dba24855d59674c0 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 10 Jan 2025 10:19:04 +0800 Subject: [PATCH] Fix crash when freeing newly created node when nodeIp2String fail (#1535) In #1441, we found a assert, and decided remove this assert and instead just free the newly created node and close the link, since if we cannot get the IP from the link it probably means the connection was closed. ``` === VALKEY BUG REPORT START: Cut & paste starting from here === 17847:M 19 Dec 2024 00:15:58.021 # === ASSERTION FAILED === 17847:M 19 Dec 2024 00:15:58.021 # ==> cluster_legacy.c:3252 'nodeIp2String(node->ip, link, hdr->myip) == C_OK' is not true ------ STACK TRACE ------ 17847 valkey-server * src/valkey-server 127.0.0.1:27131 [cluster](clusterProcessPacket+0x1304) [0x4e5634] src/valkey-server 127.0.0.1:27131 [cluster](clusterReadHandler+0x11e) [0x4e59de] /__w/valkey/valkey/src/valkey-tls.so(+0x2f1e) [0x7f083983ff1e] src/valkey-server 127.0.0.1:27131 [cluster](aeMain+0x8a) [0x41afea] src/valkey-server 127.0.0.1:27131 [cluster](main+0x4d7) [0x40f547] /lib64/libc.so.6(+0x40c8) [0x7f083985a0c8] /lib64/libc.so.6(__libc_start_main+0x8b) [0x7f083985a18b] src/valkey-server 127.0.0.1:27131 [cluster](_start+0x25) [0x410ef5] ``` But it also introduces another assert. The reason is that this new node is not added to the cluster nodes dict. ``` 17128:M 08 Jan 2025 10:51:44.061 # === ASSERTION FAILED === 17128:M 08 Jan 2025 10:51:44.061 # ==> cluster_legacy.c:1693 'dictDelete(server.cluster->nodes, nodename) == DICT_OK' is not true ------ STACK TRACE ------ 17128 valkey-server * src/valkey-server 127.0.0.1:28627 [cluster][0x4ebdc4] src/valkey-server 127.0.0.1:28627 [cluster][0x4e81d2] src/valkey-server 127.0.0.1:28627 [cluster](clusterReadHandler+0x268)[0x4e8618] /__w/valkey/valkey/src/valkey-tls.so(+0xb278)[0x7f109480b278] src/valkey-server 127.0.0.1:28627 [cluster](aeMain+0x89)[0x592b09] src/valkey-server 127.0.0.1:28627 [cluster](main+0x4b3)[0x453e23] /lib64/libc.so.6(__libc_start_main+0xe5)[0x7f10958bf7e5] src/valkey-server 127.0.0.1:28627 [cluster](_start+0x2e)[0x454a5e] ``` This closes #1527. Signed-off-by: Binbin --- src/cluster_legacy.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0777d6d8c6..bf5d314908 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3245,6 +3245,17 @@ int clusterProcessPacket(clusterLink *link) { if (type == CLUSTERMSG_TYPE_MEET) { if (!sender) { if (!link->node) { + char ip[NET_IP_STR_LEN] = {0}; + if (nodeIp2String(ip, link, hdr->myip) != C_OK) { + /* Unable to retrieve the node's IP address from the connection. Without a + * valid IP, the node becomes unusable in the cluster. This failure might be + * due to the connection being closed. */ + serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " + "because the connection has an error"); + freeClusterLink(link); + return 0; + } + /* Add this node if it is new for us and the msg type is MEET. * In this stage we don't try to add the node with the right * flags, replicaof pointer, and so forth, as this details will be @@ -3253,14 +3264,7 @@ int clusterProcessPacket(clusterLink *link) { * we want to send extensions right away in the return PONG in order * to reduce the amount of time needed to stabilize the shard ID. */ clusterNode *node = createClusterNode(NULL, CLUSTER_NODE_HANDSHAKE); - if (nodeIp2String(node->ip, link, hdr->myip) != C_OK) { - /* We cannot get the IP info from the link, it probably means the connection is closed. */ - serverLog(LL_NOTICE, "Closing link even though we received a MEET packet on it, " - "because the connection has an error"); - freeClusterLink(link); - freeClusterNode(node); - return 0; - } + memcpy(node->ip, ip, sizeof(ip)); getClientPortFromClusterMsg(hdr, &node->tls_port, &node->tcp_port); node->cport = ntohs(hdr->cport); if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) {