Skip to content

Commit

Permalink
bug: StatefulSets are not reevaluated after rescheduling (#67)
Browse files Browse the repository at this point in the history
* bug: StatefulSets are not reevaluated after rescheduling

 # Context

Noe caches the Pods it has already processed to avoid reprocessing
them unless there are changes. We do this by storing the
`NamespacedName` of the Pod, which includes the namespace and Pod
name.

For Deployments, rescheduling the Pod actually changes its name as the
termination is generated. For StatefulSets, the name is kept the same
no matter how many reschedules happen.

Our caching approach prevents us form reevaluating the location of
StatefulSet pods if they were previously scheduled, and we've seen
this allocate StatefulSet pods to nodes whose architectures where not
supported by the Pod.

 # Bugfix approach

The intention of the cache is to reduce load over Registries and Noe
itself. In our case so far StatefulSets are much less common than
Deployments, so we choose to avoid the cache for StatefulSets.

This is a stopgap measure to prevent misscheduling of current
workloads until a better approach to prevent reprocessing of Pods can
be devised that supports StatefulSets.

 # What we change

We add a `skipCache(*v1.Pod)` function that checks if the Pod is owned
by a StatefulSet. If so, the caching mechanism is skipped for this
workload.

Change-Id: Ie7bdf2d863f433e7998b1b92cea129c6bda4dd8f
Co-authored-by: João Alves <[email protected]>
  • Loading branch information
miguelbernadi and joaoqalves authored Jan 8, 2024
1 parent 62ea3e9 commit 8957c00
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 4 deletions.
13 changes: 12 additions & 1 deletion pkg/controllers/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
r.deleteFromCache(req.NamespacedName.String())
return ctrl.Result{}, nil
}
if r.isImageCached(req.NamespacedName.String()) || podIsReady(ctx, pod) {

if !isStatefulSet(pod) && (r.isImageCached(req.NamespacedName.String()) || podIsReady(ctx, pod)) {
log.DefaultLogger.WithContext(ctx).Info("pod was already processed")
return ctrl.Result{}, nil
}
Expand Down Expand Up @@ -313,6 +314,16 @@ func (r *PodReconciler) deletePodAndNotifyUser(ctx context.Context, pod *v1.Pod)
}
}

func isStatefulSet(pod *v1.Pod) bool {
owners := pod.GetOwnerReferences()
for _, owner := range owners {
if owner.Kind == "StatefulSet" {
return true
}
}
return false
}

// podIsReady checks if a pod condition is ready which means the readiness probe of all containers are OK.
//
// from the documentation: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-readiness-gate
Expand Down
109 changes: 106 additions & 3 deletions pkg/controllers/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/adevinta/noe/pkg/controllers"
"github.com/adevinta/noe/pkg/metric_test_helpers"
"github.com/adevinta/noe/pkg/registry"
"github.com/google/uuid"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -388,7 +389,7 @@ func TestReconcileShouldDeletePodsOnMismatchingNodes(t *testing.T) {
Kind: "Deployment",
Name: "deployment-1",
UID: "deployment-1-uid",
Controller: pointer.BoolPtr(true),
Controller: pointer.Bool(true),
},
},
Name: "test-pod-1",
Expand Down Expand Up @@ -502,7 +503,7 @@ func TestReconcileShouldReportMetricsAndEventsWhenPodDeletionFails(t *testing.T)
Kind: "Deployment",
Name: "deployment-1",
UID: "deployment-1-uid",
Controller: pointer.BoolPtr(true),
Controller: pointer.Bool(true),
},
},
Name: "test-pod-1",
Expand Down Expand Up @@ -786,7 +787,7 @@ func TestReconcileShouldRequeuedWhenNodeSpecGetError(t *testing.T) {
Kind: "Deployment",
Name: "deployment-1",
UID: "deployment-1-uid",
Controller: pointer.BoolPtr(true),
Controller: pointer.Bool(true),
},
},
Name: "test-pod-1",
Expand Down Expand Up @@ -827,3 +828,105 @@ func TestReconcileShouldRequeuedWhenNodeSpecGetError(t *testing.T) {
assert.True(t, result.Requeue)

}

func TestReconcileWhenPodBelongsToStatefulSetIsAlwaysChecked(t *testing.T) {
uid := types.UID(uuid.New().String())
k8sClient := fake.NewClientBuilder().WithObjects(
&appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: "statefulset",
Namespace: "ns",
UID: uid,
},
Spec: appsv1.StatefulSetSpec{
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Name: "statefulset-running-pod",
Namespace: "ns",
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: "test-image",
},
},
},
},
},
},
&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "statefulset-running-pod",
Namespace: "ns",
OwnerReferences: []metav1.OwnerReference{
{
Kind: "StatefulSet",
UID: uid,
Controller: pointer.Bool(true),
},
},
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: "test-image",
},
},
},
Status: v1.PodStatus{
Phase: v1.PodRunning,
Conditions: []v1.PodCondition{
{
Type: v1.PodReady,
Status: v1.ConditionTrue,
},
},
},
},
).Build()

metricsRegistry := prometheus.NewRegistry()

reconciler := controllers.NewPodReconciler(
"test",
controllers.WithClient(k8sClient),
controllers.WithMetricsRegistry(metricsRegistry),
controllers.WithRegistry(arch.RegistryFunc(func(ctx context.Context, imagePullSecret, image string) ([]registry.Platform, error) {
if image == "test-image" {
return []registry.Platform{
{
OS: "linux",
Architecture: "amd64",
},
}, nil
} else {
return nil, errors.New("error")
}
})))

_, err := reconciler.Reconcile(context.Background(), reconcile.Request{NamespacedName: types.NamespacedName{Name: "statefulset-running-pod", Namespace: "ns"}})
assert.NoError(t, err)

checkMetricRegistry(
t,
metricsRegistry,
"test_images_count",
func(t *testing.T, family *dto.MetricFamily) {
assert.Len(t, family.Metric, 1)
fmt.Print(family.Metric)
checkMetricValueForLabels(
t,
family.Metric,
prometheus.Labels{
"arch": "amd64",
"os": "linux",
"image": "test-image",
"variant": "",
},
func(t *testing.T, metric *dto.Metric) {
assert.EqualValues(t, 1.0, *metric.Gauge.Value)
},
)
},
)
}

0 comments on commit 8957c00

Please sign in to comment.