Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

In cluster mode prefer node with slots in 2 masters case #49

Merged
merged 1 commit into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions internal/app/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ func (app *App) getMasterHost(shardState map[string]*HostState) (string, error)
}
}
if len(masters) > 1 {
if app.mode == modeCluster {
mastersWithSlots := make([]string, 0)
for _, master := range masters {
node := app.shard.Get(master)
hasSlots, err := node.HasClusterSlots(app.ctx)
if err != nil {
return "", fmt.Errorf("unable to check slots on %s", master)
}
if hasSlots {
mastersWithSlots = append(mastersWithSlots, master)
}
}
if len(mastersWithSlots) == 1 {
return mastersWithSlots[0], nil
}
}
return "", fmt.Errorf("got more than 1 master: %s", masters)
}
if len(masters) == 0 {
Expand Down
20 changes: 20 additions & 0 deletions internal/redis/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -542,3 +542,23 @@ func (n *Node) ClusterMeet(ctx context.Context, addr string, port, clusterBusPor
cmd := n.conn.Do(ctx, n.config.Renames.Cluster, n.config.Renames.ClusterMeet, addr, strconv.Itoa(port), strconv.Itoa(clusterBusPort))
return cmd.Err()
}

// HasClusterSlots reports whether the node owns at least one hash slot.
//
// It runs CLUSTER NODES on the node's connection, locates the line whose flags
// field contains "myself" and inspects the fields that follow the eight fixed
// columns (id, address, flags, master, ping-sent, pong-recv, config-epoch,
// link-state). Bracketed entries such as "[93-<-id]" or "[77->-id]" describe
// slots being imported or migrated and are not counted as owned slots.
//
// It returns the command error unchanged if CLUSTER NODES fails, and
// (false, nil) if the output has no "myself" line.
func (n *Node) HasClusterSlots(ctx context.Context) (bool, error) {
	// Number of leading columns in a CLUSTER NODES line that precede the slot list.
	const fixedFields = 8

	cmd := n.conn.ClusterNodes(ctx)
	if err := cmd.Err(); err != nil {
		return false, err
	}
	for _, line := range strings.Split(cmd.Val(), "\n") {
		// Fields (unlike Split on " ") ignores trailing or repeated whitespace,
		// so stray blanks cannot be mistaken for an extra slot column.
		fields := strings.Fields(line)
		if len(fields) < 3 || !strings.Contains(fields[2], "myself") {
			continue
		}
		if len(fields) <= fixedFields {
			return false, nil
		}
		for _, slot := range fields[fixedFields:] {
			// Importing/migrating markers are wrapped in brackets; anything
			// else is a slot number or range actually served by this node.
			if !strings.HasPrefix(slot, "[") {
				return true, nil
			}
		}
		return false, nil
	}
	return false, nil
}
79 changes: 79 additions & 0 deletions tests/features/05_cluster_replication_fix.feature
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ Feature: Cluster mode broken replication fix
rm -f /etc/redis/cluster.conf
"""
And I run command on host "redis3"
"""
sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf
"""
And I run command on host "redis3"
"""
supervisorctl signal KILL redis
"""
Expand All @@ -184,3 +188,78 @@ Feature: Cluster mode broken replication fix
"""
["redis1","redis2","redis3"]
"""

# Stages a conflict in which redis3 is restarted without its cluster.conf and
# ZooKeeper's master node is pointed at it, then checks that redis3 ends up as a
# replica of redis1 and that a key written to redis1 is still readable.
Scenario: Cluster splitbrain is fixed in favor of node with slots
Given clustered shard is up and running
Then redis host "redis1" should be master
And redis host "redis2" should become replica of "redis1" within "15" seconds
And replication on redis host "redis2" should run fine within "15" seconds
And redis host "redis3" should become replica of "redis1" within "15" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
# Send the STOP signal to rdsync on all three hosts; it is resumed with CONT
# further down, after the conflicting state has been prepared.
When I run command on host "redis1"
"""
supervisorctl signal STOP rdsync
"""
And I run command on host "redis2"
"""
supervisorctl signal STOP rdsync
"""
And I run command on host "redis3"
"""
supervisorctl signal STOP rdsync
"""
# On redis3: delete cluster.conf, switch "offline yes" to "offline no" in
# redis.conf, then kill and start redis again.
# NOTE(review): presumably this brings redis3 back as a master that owns no
# slots - confirm against the redis/rdsync configuration.
And I run command on host "redis3"
"""
rm -f /etc/redis/cluster.conf
"""
And I run command on host "redis3"
"""
sed -i -e 's/offline yes/offline no/' /etc/redis/redis.conf
"""
And I run command on host "redis3"
"""
supervisorctl signal KILL redis
"""
And I run command on host "redis3"
"""
supervisorctl start redis
"""
Then redis host "redis3" should become available within "60" seconds
# Write a marker key on redis1 so that data loss can be detected at the end.
When I run command on redis host "redis1"
"""
SET very-important-key foo
"""
# Point the /test/master ZooKeeper node at redis3.
And I set zookeeper node "/test/master" to
"""
"redis3"
"""
# Resume rdsync on all three hosts.
And I run command on host "redis1"
"""
supervisorctl signal CONT rdsync
"""
And I run command on host "redis2"
"""
supervisorctl signal CONT rdsync
"""
And I run command on host "redis3"
"""
supervisorctl signal CONT rdsync
"""
# Expected outcome: redis3 becomes a replica of redis1 and all hosts are active.
Then redis host "redis3" should become replica of "redis1" within "60" seconds
And replication on redis host "redis3" should run fine within "15" seconds
And zookeeper node "/test/active_nodes" should match json_exactly within "30" seconds
"""
["redis1","redis2","redis3"]
"""
# The marker key written earlier must still be readable on redis1.
When I run command on redis host "redis1"
"""
GET very-important-key
"""
Then redis cmd result should match regexp
"""
.*foo.*
"""
Loading