Skip to content

Commit

Permalink
CORE-3242: update node status checkers to be Cast AI id aware (#74)
Browse files Browse the repository at this point in the history
* update node status to be Cast AI id aware
* update second delete check method

---------

Co-authored-by: Mangirdas <[email protected]>
  • Loading branch information
mjudeikis and mjudeikis authored Jul 13, 2023
1 parent 862c92d commit bc07d34
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 6 deletions.
20 changes: 20 additions & 0 deletions actions/check_node_deleted.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,29 @@ func (h *checkNodeDeletedHandler) Handle(ctx context.Context, action *castai.Clu
if apierrors.IsNotFound(err) {
return nil
}

if n == nil {
return nil
}

currentNodeID, ok := n.Labels[castai.LabelNodeID]
if !ok {
log.Info("node doesn't have castai node id label")
}
if currentNodeID != "" {
if currentNodeID != req.NodeID {
log.Info("node name was reused. Original node is deleted")
return nil
}
if currentNodeID == req.NodeID {
return backoff.Permanent(errors.New("node is not deleted"))
}
}

if n != nil {
return backoff.Permanent(errors.New("node is not deleted"))
}

return err
}, b)
}
44 changes: 38 additions & 6 deletions actions/check_node_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,17 @@ func (h *checkNodeStatusHandler) Handle(ctx context.Context, action *castai.Clus
switch req.NodeStatus {
case castai.ActionCheckNodeStatus_READY:
log.Info("checking node ready")
return h.checkNodeReady(ctx, req)
return h.checkNodeReady(ctx, log, req)
case castai.ActionCheckNodeStatus_DELETED:
log.Info("checking node deleted")
return h.checkNodeDeleted(ctx, req)
return h.checkNodeDeleted(ctx, log, req)

}

return fmt.Errorf("unknown status to check provided node=%s status=%s", req.NodeName, req.NodeStatus)
}

func (h *checkNodeStatusHandler) checkNodeDeleted(ctx context.Context, req *castai.ActionCheckNodeStatus) error {
func (h *checkNodeStatusHandler) checkNodeDeleted(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error {
timeout := 10
if req.WaitTimeoutSeconds != nil {
timeout = int(*req.WaitTimeoutSeconds)
Expand All @@ -70,14 +70,39 @@ func (h *checkNodeStatusHandler) checkNodeDeleted(ctx context.Context, req *cast
if apierrors.IsNotFound(err) {
return nil
}

// If node is nil - deleted
// If label is present and doesn't match - node was reused - deleted
// If label is present and matches - node is not deleted
// If label is not present and node is not nil - node is not deleted (potentially corrupted state)

if n == nil {
return nil
}

currentNodeID, ok := n.Labels[castai.LabelNodeID]
if !ok {
log.Info("node doesn't have castai node id label")
}
if currentNodeID != "" {
if currentNodeID != req.NodeID {
log.Info("node name was reused. Original node is deleted")
return nil
}
if currentNodeID == req.NodeID {
return backoff.Permanent(errors.New("node is not deleted"))
}
}

if n != nil {
return backoff.Permanent(errors.New("node is not deleted"))
}

return err
}, b)
}

func (h *checkNodeStatusHandler) checkNodeReady(ctx context.Context, req *castai.ActionCheckNodeStatus) error {
func (h *checkNodeStatusHandler) checkNodeReady(ctx context.Context, log *logrus.Entry, req *castai.ActionCheckNodeStatus) error {
timeout := 9 * time.Minute
watchObject := metav1.SingleObject(metav1.ObjectMeta{Name: req.NodeName})
if req.WaitTimeoutSeconds != nil {
Expand All @@ -94,7 +119,7 @@ func (h *checkNodeStatusHandler) checkNodeReady(ctx context.Context, req *castai
defer watch.Stop()
for r := range watch.ResultChan() {
if node, ok := r.Object.(*corev1.Node); ok {
if isNodeReady(node) {
if isNodeReady(node, req.NodeID) {
return nil
}
}
Expand All @@ -103,7 +128,14 @@ func (h *checkNodeStatusHandler) checkNodeReady(ctx context.Context, req *castai
return fmt.Errorf("timeout waiting for node %s to become ready", req.NodeName)
}

func isNodeReady(node *corev1.Node) bool {
func isNodeReady(node *corev1.Node, castNodeID string) bool {
// if node has castai node id label, check if it matches the one we are waiting for
// if it doesn't match, we can skip this node
if val, ok := node.Labels[castai.LabelNodeID]; ok {
if val != "" && val != castNodeID {
return false
}
}
for _, cond := range node.Status.Conditions {
if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue && !containsUninitializedNodeTaint(node.Spec.Taints) {
return true
Expand Down
133 changes: 133 additions & 0 deletions actions/check_node_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,37 @@ func TestCheckStatus_Deleted(t *testing.T) {
log.SetLevel(logrus.DebugLevel)

t.Run("return error when node is not deleted", func(t *testing.T) {
r := require.New(t)
nodeName := "node1"
node := &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Labels: map[string]string{
castai.LabelNodeID: "old-node-id",
},
},
}
clientset := fake.NewSimpleClientset(node)

h := checkNodeStatusHandler{
log: log,
clientset: clientset,
}

action := &castai.ClusterAction{
ID: uuid.New().String(),
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
NodeName: "node1",
NodeStatus: castai.ActionCheckNodeStatus_DELETED,
NodeID: "old-node-id",
},
}

err := h.Handle(context.Background(), action)
r.EqualError(err, "node is not deleted")
})

t.Run("return error when node is not deleted with no label (backwards compatibility)", func(t *testing.T) {
r := require.New(t)
nodeName := "node1"
node := &v1.Node{
Expand All @@ -43,6 +74,7 @@ func TestCheckStatus_Deleted(t *testing.T) {
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
NodeName: "node1",
NodeStatus: castai.ActionCheckNodeStatus_DELETED,
NodeID: "old-node-id",
},
}

Expand All @@ -64,6 +96,37 @@ func TestCheckStatus_Deleted(t *testing.T) {
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
NodeName: "node1",
NodeStatus: castai.ActionCheckNodeStatus_DELETED,
NodeID: "old-node-id",
},
}

err := h.Handle(context.Background(), action)
r.NoError(err)
})

t.Run("handle check successfully when node name was reused but id mismatch", func(t *testing.T) {
r := require.New(t)
node := &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node1",
Labels: map[string]string{
castai.LabelNodeID: "old-node-id",
},
},
}
clientset := fake.NewSimpleClientset(node)

h := checkNodeStatusHandler{
log: log,
clientset: clientset,
}

action := &castai.ClusterAction{
ID: uuid.New().String(),
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
NodeName: "node1",
NodeStatus: castai.ActionCheckNodeStatus_DELETED,
NodeID: "im-a-different-node",
},
}

Expand Down Expand Up @@ -251,4 +314,74 @@ func TestCheckStatus_Ready(t *testing.T) {
r.Error(err)
r.EqualError(err, "timeout waiting for node node1 to become ready")
})

t.Run("handle check successfully when reusing node names happens and node is replaced", func(t *testing.T) {
r := require.New(t)
nodeName := "node1"
node := &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Labels: map[string]string{
castai.LabelNodeID: "old-node-id",
},
},
Status: v1.NodeStatus{
Conditions: []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionFalse,
},
},
},
}
clientset := fake.NewSimpleClientset(node)

h := checkNodeStatusHandler{
log: log,
clientset: clientset,
}

timeout := int32(60)
action := &castai.ClusterAction{
ID: uuid.New().String(),
ActionCheckNodeStatus: &castai.ActionCheckNodeStatus{
NodeName: "node1",
NodeStatus: castai.ActionCheckNodeStatus_READY,
WaitTimeoutSeconds: &timeout,
NodeID: "new-node-id",
},
}

// simulate node replacement
// 1. node is deleted
// 2. new node is created with the same name and different id
// 3. node is ready
// 4. checkNodeStatusHandler.Handle() is called
var wg sync.WaitGroup
wg.Add(2)
var err error
go func() {
err = h.Handle(context.Background(), action)
wg.Done()
}()

go func() {
time.Sleep(1 * time.Second)
_ = clientset.CoreV1().Nodes().Delete(context.Background(), nodeName, metav1.DeleteOptions{})

time.Sleep(1 * time.Second)
newNode := node.DeepCopy()
newNode.Labels[castai.LabelNodeID] = "new-node-id"

_, _ = clientset.CoreV1().Nodes().Create(context.Background(), newNode, metav1.CreateOptions{})

time.Sleep(5 * time.Second)
newNode.Status.Conditions[0].Status = v1.ConditionTrue
_, _ = clientset.CoreV1().Nodes().UpdateStatus(context.Background(), newNode, metav1.UpdateOptions{})
wg.Done()
}()
wg.Wait()

r.NoError(err)
})
}

0 comments on commit bc07d34

Please sign in to comment.