Skip to content

Commit

Permalink
Give enough time to sync clientset's cache on new Node 3.5 sec -> 10 sec (#103)
Browse files Browse the repository at this point in the history

give enough time to sync clientset's cache on new Node

Co-authored-by: Furkhat Kasymov Genii Uulu <[email protected]>
  • Loading branch information
furkhat and Furkhat Kasymov Genii Uulu authored Feb 29, 2024
1 parent 4afcd32 commit ffef754
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 7 deletions.
7 changes: 6 additions & 1 deletion actions/kubernetes_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,16 @@ func patchNodeStatus(ctx context.Context, log logrus.FieldLogger, clientset kube
}

func getNodeForPatching(ctx context.Context, log logrus.FieldLogger, clientset kubernetes.Interface, nodeName string) (*v1.Node, error) {
// on GKE we noticed that sometimes the node is not found, even though it is in the cluster
// and was returned as a result from a watch. But a subsequent get request returns not found.
// This is likely due to the clientset's caching that's meant to alleviate the API's load.
// So we give the cache enough time to sync.
logRetry := func(err error, _ time.Duration) {
log.Warnf("getting node, will retry: %v", err)
}
var node *v1.Node
b := backoff.WithMaxRetries(backoff.NewExponentialBackOff(), 3)
b := backoff.NewExponentialBackOff()
b.MaxElapsedTime = 10 * time.Second
err := backoff.RetryNotify(func() error {
var err error
node, err = clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
Expand Down
13 changes: 7 additions & 6 deletions actions/patch_node_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,6 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
"id": action.ID,
})

// on GKE we noticed that sometimes the node is not found, even though it is in the cluster
// as result was returned from watch. But subsequent get request returns not found.
// This in theory should not happen as get should be consistent with api server state.
// But we have seen this happening, so we retry the get request.
node, err := getNodeForPatching(ctx, h.log, h.clientset, req.NodeName)
if err != nil {
if apierrors.IsNotFound(err) {
Expand All @@ -77,7 +73,12 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
if req.Unschedulable == nil && len(req.Labels) == 0 && len(req.Taints) == 0 && len(req.Annotations) == 0 {
log.Info("no patch for node spec or labels")
} else {
log.Infof("patching node, labels=%v, taints=%v, annotations=%v, unschedulable=%v", req.Labels, req.Taints, req.Annotations, unschedulable)
log.WithFields(map[string]interface{}{
"labels": req.Labels,
"taints": req.Taints,
"annotations": req.Annotations,
"capacity": req.Capacity,
}).Infof("patching node, labels=%v, taints=%v, annotations=%v, unschedulable=%v", req.Labels, req.Taints, req.Annotations, unschedulable)

err = patchNode(ctx, h.clientset, node, func(n *v1.Node) {
n.Labels = patchNodeMapField(n.Labels, req.Labels)
Expand All @@ -91,7 +92,7 @@ func (h *patchNodeHandler) Handle(ctx context.Context, action *castai.ClusterAct
}

if len(req.Capacity) > 0 {
log.Infof("patching node status, capacity=%v", req.Capacity)
log.WithField("capacity", req.Capacity).Infof("patching node status")
patch, err := json.Marshal(map[string]interface{}{
"status": map[string]interface{}{
"capacity": req.Capacity,
Expand Down

0 comments on commit ffef754

Please sign in to comment.