From 2a6bc2cd2edf5b55f24be7dec8d0c398dd8fa9f2 Mon Sep 17 00:00:00 2001 From: secwall Date: Mon, 20 May 2024 18:02:23 +0300 Subject: [PATCH] In cluster mode prefer node with slots in 2 masters case --- internal/app/master.go | 16 ++++ internal/redis/node.go | 20 +++++ .../05_cluster_replication_fix.feature | 79 +++++++++++++++++++ 3 files changed, 115 insertions(+) diff --git a/internal/app/master.go b/internal/app/master.go index 36db14d..43b71b6 100644 --- a/internal/app/master.go +++ b/internal/app/master.go @@ -54,6 +54,22 @@ func (app *App) getMasterHost(shardState map[string]*HostState) (string, error) } } if len(masters) > 1 { + if app.mode == modeCluster { + mastersWithSlots := make([]string, 0) + for _, master := range masters { + node := app.shard.Get(master) + hasSlots, err := node.HasClusterSlots(app.ctx) + if err != nil { + return "", fmt.Errorf("unable to check slots on %s: %w", master, err) + } + if hasSlots { + mastersWithSlots = append(mastersWithSlots, master) + } + } + if len(mastersWithSlots) == 1 { + return mastersWithSlots[0], nil + } + } return "", fmt.Errorf("got more than 1 master: %s", masters) } if len(masters) == 0 { diff --git a/internal/redis/node.go b/internal/redis/node.go index ba5c71e..3281ff3 100644 --- a/internal/redis/node.go +++ b/internal/redis/node.go @@ -542,3 +542,23 @@ func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPor cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort)) return cmd.Err() } + +// HasClusterSlots reports whether the "myself" line of the CLUSTER NODES reply has more than 8 whitespace-separated fields (i.e. lists at least one slot); it returns false if no such line is found. +func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) { + cmd := n.conn.ClusterNodes(ctx) + err := cmd.Err() + if err != nil { + return false, err + } + lines := strings.Split(cmd.Val(), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + if strings.Contains(fields[2], "myself") { + return 
len(fields) > 8, nil + } + } + return false, nil +} diff --git a/tests/features/05_cluster_replication_fix.feature b/tests/features/05_cluster_replication_fix.feature index 6c1dadf..0502f6c 100644 --- a/tests/features/05_cluster_replication_fix.feature +++ b/tests/features/05_cluster_replication_fix.feature @@ -171,6 +171,10 @@ Feature: Cluster mode broken replication fix rm -f /etc/redis/cluster.conf """ And I run command on host "redis3" + """ + sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf + """ + And I run command on host "redis3" """ supervisorctl signal KILL redis """ @@ -184,3 +188,78 @@ Feature: Cluster mode broken replication fix """ ["redis1","redis2","redis3"] """ + + Scenario: Cluster splitbrain is fixed in favor of node with slots + Given clustered shard is up and running + Then redis host "redis1" should be master + And redis host "redis2" should become replica of "redis1" within "15" seconds + And replication on redis host "redis2" should run fine within "15" seconds + And redis host "redis3" should become replica of "redis1" within "15" seconds + And replication on redis host "redis3" should run fine within "15" seconds + And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + """ + ["redis1","redis2","redis3"] + """ + When I run command on host "redis1" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis2" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis3" + """ + supervisorctl signal STOP rdsync + """ + And I run command on host "redis3" + """ + rm -f /etc/redis/cluster.conf + """ + And I run command on host "redis3" + """ + sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf + """ + And I run command on host "redis3" + """ + supervisorctl signal KILL redis + """ + And I run command on host "redis3" + """ + supervisorctl start redis + """ + Then redis host "redis3" should become available within "60" seconds + When I run command 
on redis host "redis1" + """ + SET very-important-key foo + """ + And I set zookeeper node "/test/master" to + """ + "redis3" + """ + And I run command on host "redis1" + """ + supervisorctl signal CONT rdsync + """ + And I run command on host "redis2" + """ + supervisorctl signal CONT rdsync + """ + And I run command on host "redis3" + """ + supervisorctl signal CONT rdsync + """ + Then redis host "redis3" should become replica of "redis1" within "60" seconds + And replication on redis host "redis3" should run fine within "15" seconds + And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + """ + ["redis1","redis2","redis3"] + """ + When I run command on redis host "redis1" + """ + GET very-important-key + """ + Then redis cmd result should match regexp + """ + .*foo.* + """