Skip to content

Commit

Permalink
Give enough time to sync clientset's cache on new Node 3.5 sec -> 10 sec (#103)
Browse files Browse the repository at this point in the history

give enough time to sync clientset's cache on new Node

Co-authored-by: Furkhat Kasymov Genii Uulu <[email protected]>
  • Loading branch information
furkhat and Furkhat Kasymov Genii Uulu authored Feb 29, 2024
1 parent 4afcd32 commit ffef754
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
7 changes: 6 additions & 1 deletion actions/kubernetes_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,16 @@ func patchNodeStatus(ctx context.Context, log logrus.FieldLogger, clientset kube
}

func getNodeForPatching(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, nodeName string) (*v1.Node, error) {
// on GKE we noticed that sometimes the node is not found, even though it is in the cluster
// and was returned as a result from a watch. But a subsequent get request returns not found.
// This is likely due to the clientset's caching that's meant to alleviate the API's load.
// So we give the cache enough time to sync.
logRetry := func(err error, _ time.Duration) {
log.Warnf("getting node, will retry: %v", err)
}
var node *v1.Node
b := backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 3)
b := backoff.NewExponentialBackOff()
b.MaxElapsedTime = 10 * time.Second
err := backoff.RetryNotify(func() error {
var err error
node, err = clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
Expand Down
13 changes: 7 additions & 6 deletions actions/patch_node_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,6 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
"id": action.ID,
})

// on GKE we noticed that sometimes the node is not found, even though it is in the cluster
// as result was returned from watch. But subsequent get request returns not found.
// This in theory should not happen as get should be consistent with api server state.
// But we have seen this happening, so we retry the get request.
node, err := getNodeForPatching(ctx, h.log, h.clientset, req.NodeName)
if err != nil {
if apierrors.IsNotFound(err) {
Expand All @@ -77,7 +73,12 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
if req.Unschedulable == nil && len(req.Labels) == 0 && len(req.Taints) == 0 && len(req.Annotations) == 0 {
log.Info("no patch for node spec or labels")
} else {
log.Infof("patching node, labels=%v, taints=%v, annotations=%v, unschedulable=%v", req.Labels, req.Taints, req.Annotations, unschedulable)
log.WithFields(map[string]interface{}{
"labels": req.Labels,
"taints": req.Taints,
"annotations": req.Annotations,
"capacity": req.Capacity,
}).Infof("patching node, labels=%v, taints=%v, annotations=%v, unschedulable=%v", req.Labels, req.Taints, req.Annotations, unschedulable)

err = patchNode(ctx, h.clientset, node, func(n *v1.Node) {
n.Labels = patchNodeMapField(n.Labels, req.Labels)
Expand All @@ -91,7 +92,7 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
}

if len(req.Capacity) > 0 {
log.Infof("patching node status, capacity=%v", req.Capacity)
log.WithField("capacity", req.Capacity).Infof("patching node status")
patch, err := json.Marshal(map[string]interface{}{
"status": map[string]interface{}{
"capacity": req.Capacity,
Expand Down

0 comments on commit ffef754

Please sign in to comment.