Check if all cluster members are healthy

According to etcd-io/etcd#8070, `cluster-health` should already verify that every cluster member is healthy, but that check apparently broke in some recent releases, so the script now verifies member health explicitly.
itskoko · May 22, 2018 · 32d1935 · 32d1935
1 parent d5a66b3
commit 32d1935
Showing 1 changed file with 20 additions and 3 deletions.
diff --git a/kubernetes.yaml b/kubernetes.yaml
@@ -1051,17 +1051,34 @@ Resources:
                   - |
                     #!/bin/bash
                     set -euo pipefail
-                    echo "Wait for cluster-health"
-                    while ! /etc/etcdctl-wrapper cluster-health; do sleep 1; done
                     echo "Wait for etcd to join cluster"
-                    while ! /etc/etcdctl-wrapper member list | grep $(hostname); do sleep 1; done
+                    while ! /etc/etcdctl-wrapper member list | grep "$(hostname)"; do sleep 1; done
+                    echo "Wait for cluster-health"
+                    while true; do
+                      sleep 1
+                      # Test the pipeline inside 'if': under 'set -e' with pipefail, a bare failing pipeline would abort the script before any retry.
+                      if ! /etc/etcdctl-wrapper cluster-health | tee /tmp/cluster-health.txt; then
+                        echo " - no quorum, retrying"
+                        continue
+                      fi
+                      if [[ "$(wc -l < /tmp/cluster-health.txt)" -ne "$((${ControllerPoolSize}+1))" ]]; then
+                        echo "- unexpected number of peers"
+                        continue
+                      fi
+                      # leave the loop once every line contains 'is healthy' (grep -v then matches nothing)
+                      if ! grep -v "is healthy" /tmp/cluster-health.txt; then
+                        break
+                      fi
+                      echo " - unhealthy members found, retrying"
+                    done
                     echo "Signaling success"
                     docker run --rm --net=host rochacon/cfn-bootstrap cfn-signal \
                       --resource ControllerAutoScalingGroup \
                       --stack ${StackName} \
                       --region ${Region} || true # Ignore if signaling failed
                   - StackName: !Ref AWS::StackName
                     Region: !Ref AWS::Region
+                    ControllerPoolSize: !Ref ControllerPoolSize
               # Environment files
               etcdEnv:
                 Fn::Base64: