From 7fede0c56fa4029bab8dc7468474c160b0932f6e Mon Sep 17 00:00:00 2001 From: secwall Date: Mon, 20 May 2024 18:02:23 +0300 Subject: [PATCH] In cluster mode prefer node with slots in 2 masters case --- internal/app/master.go | 16 + internal/redis/node.go | 20 ++ .../05_cluster_replication_fix.feature | 339 +++++++++++------- 3 files changed, 245 insertions(+), 130 deletions(-) diff --git a/internal/app/master.go b/internal/app/master.go index 36db14d..43b71b6 100644 --- a/internal/app/master.go +++ b/internal/app/master.go @@ -54,6 +54,22 @@ func (app *App) getMasterHost(shardState map[string]*HostState) (string, error) } } if len(masters) > 1 { + if app.mode == modeCluster { + mastersWithSlots := make([]string, 0) + for _, master := range masters { + node := app.shard.Get(master) + hasSlots, err := node.HasClusterSlots(app.ctx) + if err != nil { + return "", fmt.Errorf("unable to check slots on %s: %w", master, err) + } + if hasSlots { + mastersWithSlots = append(mastersWithSlots, master) + } + } + if len(mastersWithSlots) == 1 { + return mastersWithSlots[0], nil + } + } return "", fmt.Errorf("got more than 1 master: %s", masters) } if len(masters) == 0 { diff --git a/internal/redis/node.go b/internal/redis/node.go index ba5c71e..3281ff3 100644 --- a/internal/redis/node.go +++ b/internal/redis/node.go @@ -542,3 +542,23 @@ func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPor cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort)) return cmd.Err() } + +// HasClusterSlots reports whether the "myself" entry of CLUSTER NODES has any field after the first eight (i.e. at least one slot entry); it returns false if no such entry is found. +func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) { + cmd := n.conn.ClusterNodes(ctx) + err := cmd.Err() + if err != nil { + return false, err + } + lines := strings.Split(cmd.Val(), "\n") + for _, line := range lines { + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + if strings.Contains(fields[2], "myself") { + return
len(fields) > 8, nil + } + } + return false, nil +} diff --git a/tests/features/05_cluster_replication_fix.feature b/tests/features/05_cluster_replication_fix.feature index 6c1dadf..b4d4c29 100644 --- a/tests/features/05_cluster_replication_fix.feature +++ b/tests/features/05_cluster_replication_fix.feature @@ -1,6 +1,195 @@ Feature: Cluster mode broken replication fix - Scenario: Cluster mode broken shard with divergence in DCS and redis is fixed + #Scenario: Cluster mode broken shard with divergence in DCS and redis is fixed + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I run command on host "redis1" + #""" + #supervisorctl signal STOP rdsync + #""" + #And I run command on host "redis2" + #""" + #supervisorctl signal STOP rdsync + #""" + #And I run command on host "redis3" + #""" + #supervisorctl signal STOP rdsync + #""" + #When I run command on redis host "redis2" + #""" + #CLUSTER FAILOVER + #""" + #Then redis cmd result should match regexp + #""" + #.*OK.* + #""" + #When I run command on redis host "redis1" + #""" + #CONFIG SET repl-paused yes + #""" + #Then redis cmd result should match regexp + #""" + #.*OK.* + #""" + #When I run command on redis host "redis3" + #""" + #CONFIG SET repl-paused yes + #""" + #Then redis cmd result should match regexp + #""" + #.*OK.* + #""" + #When I run command on host "redis1" + #""" + #supervisorctl signal CONT rdsync + #""" + #And I run command on host "redis2" + #""" + #supervisorctl signal CONT rdsync + #""" + #And I run
command on host "redis3" + #""" + #supervisorctl signal CONT rdsync + #""" + #Then zookeeper node "/test/master" should match json_exactly within "30" seconds + #""" + #"redis2" + #""" + #When I wait for "30" seconds + #And I run command on redis host "redis1" + #""" + #CONFIG GET repl-paused + #""" + #Then redis cmd result should match regexp + #""" + #.*no.* + #""" + #When I run command on redis host "redis3" + #""" + #CONFIG GET repl-paused + #""" + #Then redis cmd result should match regexp + #""" + #.*no.* + #""" + + #Scenario: Cluster mode master info divergence in DCS and redis is fixed + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I set zookeeper node "/test/master" to + #""" + #"redis3" + #""" + #Then zookeeper node "/test/master" should match json_exactly within "30" seconds + #""" + #"redis1" + #""" + + #Scenario: Cluster mode nonexistent master info in DCS is fixed + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I set zookeeper node "/test/master" to + #""" + 
#"this_host_does_not_exist" + #""" + #Then zookeeper node "/test/master" should match json_exactly within "30" seconds + #""" + #"redis1" + #""" + + #Scenario: Cluster mode accidential cascade replication is fixed + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I run command on host "redis3" + #""" + #setup_cluster.sh redis2 + #""" + #Then command return code should be "0" + #And redis host "redis3" should become replica of "redis1" within "60" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + + #Scenario: Cluster mode replication pause on replica is fixed + #Given clustered shard is up and running + #Then redis host "redis1" should be master + #And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I break replication on host "redis3" + #Then redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "60" seconds + + #Scenario: Cluster lone node is joined in cluster back + #Given clustered shard is up and running + #Then redis host "redis1" should be master + 
#And redis host "redis2" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis2" should run fine within "15" seconds + #And redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + #When I run command on host "redis3" + #""" + #rm -f /etc/redis/cluster.conf + #""" + #And I run command on host "redis3" + #""" + #sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf + #""" + #And I run command on host "redis3" + #""" + #supervisorctl signal KILL redis + #""" + #And I run command on host "redis3" + #""" + #supervisorctl start redis + #""" + #Then redis host "redis3" should become replica of "redis1" within "15" seconds + #And replication on redis host "redis3" should run fine within "15" seconds + #And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + #""" + #["redis1","redis2","redis3"] + #""" + + Scenario: Cluster splitbrain is fixed in favor of node with slots Given clustered shard is up and running Then redis host "redis1" should be master And redis host "redis2" should become replica of "redis1" within "15" seconds @@ -23,31 +212,32 @@ Feature: Cluster mode broken replication fix """ supervisorctl signal STOP rdsync """ - When I run command on redis host "redis2" + And I run command on host "redis3" """ - CLUSTER FAILOVER + rm -f /etc/redis/cluster.conf """ - Then redis cmd result should match regexp + And I run command on host "redis3" """ - .*OK.* + sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf """ - When I run command on redis host "redis1" + And I run command on host "redis3" """ - CONFIG SET repl-paused yes + supervisorctl signal KILL redis """ - Then redis cmd result should match regexp + And I run command on host "redis3" """ - .*OK.* + 
supervisorctl start redis """ - When I run command on redis host "redis3" + Then redis host "redis3" should become available within "60" seconds + When I run command on redis host "redis1" """ - CONFIG SET repl-paused yes + SET very-important-key foo """ - Then redis cmd result should match regexp + And I set zookeeper node "/test/master" to """ - .*OK.* + "redis3" """ - When I run command on host "redis1" + And I run command on host "redis1" """ supervisorctl signal CONT rdsync """ @@ -59,128 +249,17 @@ Feature: Cluster mode broken replication fix """ supervisorctl signal CONT rdsync """ - Then zookeeper node "/test/master" should match json_exactly within "30" seconds - """ - "redis2" - """ - When I wait for "30" seconds - And I run command on redis host "redis1" - """ - CONFIG GET repl-paused - """ - Then redis cmd result should match regexp - """ - .*no.* - """ - When I run command on redis host "redis3" - """ - CONFIG GET repl-paused - """ - Then redis cmd result should match regexp - """ - .*no.* - """ - - Scenario: Cluster mode master info divergence in DCS and redis is fixed - Given clustered shard is up and running - Then redis host "redis1" should be master - And redis host "redis2" should become replica of "redis1" within "15" seconds - And replication on redis host "redis2" should run fine within "15" seconds - And redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "15" seconds - And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds - """ - ["redis1","redis2","redis3"] - """ - When I set zookeeper node "/test/master" to - """ - "redis3" - """ - Then zookeeper node "/test/master" should match json_exactly within "30" seconds - """ - "redis1" - """ - - Scenario: Cluster mode nonexistent master info in DCS is fixed - Given clustered shard is up and running - Then redis host "redis1" should be master - And redis host "redis2" should become 
replica of "redis1" within "15" seconds - And replication on redis host "redis2" should run fine within "15" seconds - And redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "15" seconds - And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds - """ - ["redis1","redis2","redis3"] - """ - When I set zookeeper node "/test/master" to - """ - "this_host_does_not_exist" - """ - Then zookeeper node "/test/master" should match json_exactly within "30" seconds - """ - "redis1" - """ - - Scenario: Cluster mode accidential cascade replication is fixed - Given clustered shard is up and running - Then redis host "redis1" should be master - And redis host "redis2" should become replica of "redis1" within "15" seconds - And replication on redis host "redis2" should run fine within "15" seconds - And redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "15" seconds - And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds - """ - ["redis1","redis2","redis3"] - """ - When I run command on host "redis3" - """ - setup_cluster.sh redis2 - """ - Then command return code should be "0" - And redis host "redis3" should become replica of "redis1" within "60" seconds - And replication on redis host "redis3" should run fine within "15" seconds - - Scenario: Cluster mode replication pause on replica is fixed - Given clustered shard is up and running - Then redis host "redis1" should be master - And redis host "redis2" should become replica of "redis1" within "15" seconds - And replication on redis host "redis2" should run fine within "15" seconds - And redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "15" seconds - And zookeeper node "/test/active_nodes" should match 
json_exactly within "30" seconds - """ - ["redis1","redis2","redis3"] - """ - When I break replication on host "redis3" - Then redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "60" seconds - - Scenario: Cluster lone node is joined in cluster back - Given clustered shard is up and running - Then redis host "redis1" should be master - And redis host "redis2" should become replica of "redis1" within "15" seconds - And replication on redis host "redis2" should run fine within "15" seconds - And redis host "redis3" should become replica of "redis1" within "15" seconds + Then redis host "redis3" should become replica of "redis1" within "60" seconds And replication on redis host "redis3" should run fine within "15" seconds And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds """ ["redis1","redis2","redis3"] """ - When I run command on host "redis3" - """ - rm -f /etc/redis/cluster.conf - """ - And I run command on host "redis3" - """ - supervisorctl signal KILL redis - """ - And I run command on host "redis3" + When I run command on redis host "redis1" """ - supervisorctl start redis + GET very-important-key """ - Then redis host "redis3" should become replica of "redis1" within "15" seconds - And replication on redis host "redis3" should run fine within "15" seconds - And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds + Then redis cmd result should match regexp """ - ["redis1","redis2","redis3"] + .*foo.* """