Skip to content

Commit

Permalink
feature(eviction): add event when EvictPod failed
Browse files Browse the repository at this point in the history
  • Loading branch information
googs1025 committed Dec 7, 2024
1 parent a962cca commit 171db09
Show file tree
Hide file tree
Showing 5 changed files with 303 additions and 94 deletions.
4 changes: 4 additions & 0 deletions pkg/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ type DeschedulerPolicy struct {
// MaxNoOfPodsToTotal restricts maximum of pods to be evicted total.
MaxNoOfPodsToEvictTotal *uint

// EvictionFailureEventNotification should be set to true to enable eviction failure event notification.
// Default is false.
EvictionFailureEventNotification *bool

// MetricsCollector configures collection of metrics about actual resource utilization
MetricsCollector MetricsCollector
}
Expand Down
1 change: 1 addition & 0 deletions pkg/descheduler/descheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ func newDescheduler(ctx context.Context, rs *options.DeschedulerServer, deschedu
WithMaxPodsToEvictPerNode(deschedulerPolicy.MaxNoOfPodsToEvictPerNode).
WithMaxPodsToEvictPerNamespace(deschedulerPolicy.MaxNoOfPodsToEvictPerNamespace).
WithMaxPodsToEvictTotal(deschedulerPolicy.MaxNoOfPodsToEvictTotal).
WithEvictionFailureEventNotification(deschedulerPolicy.EvictionFailureEventNotification).
WithDryRun(rs.DryRun).
WithMetricsEnabled(!rs.DisableMetrics),
)
Expand Down
66 changes: 40 additions & 26 deletions pkg/descheduler/evictions/evictions.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,20 +206,21 @@ type (
)

type PodEvictor struct {
mu sync.RWMutex
client clientset.Interface
policyGroupVersion string
dryRun bool
maxPodsToEvictPerNode *uint
maxPodsToEvictPerNamespace *uint
maxPodsToEvictTotal *uint
nodePodCount nodePodEvictedCount
namespacePodCount namespacePodEvictCount
totalPodCount uint
metricsEnabled bool
eventRecorder events.EventRecorder
erCache *evictionRequestsCache
featureGates featuregate.FeatureGate
mu sync.RWMutex
client clientset.Interface
policyGroupVersion string
dryRun bool
evictionFailureEventNotification bool
maxPodsToEvictPerNode *uint
maxPodsToEvictPerNamespace *uint
maxPodsToEvictTotal *uint
nodePodCount nodePodEvictedCount
namespacePodCount namespacePodEvictCount
totalPodCount uint
metricsEnabled bool
eventRecorder events.EventRecorder
erCache *evictionRequestsCache
featureGates featuregate.FeatureGate

// registeredHandlers contains the registrations of all handlers. It's used to check if all handlers have finished syncing before the scheduling cycles start.
registeredHandlers []cache.ResourceEventHandlerRegistration
Expand All @@ -238,17 +239,18 @@ func NewPodEvictor(
}

podEvictor := &PodEvictor{
client: client,
eventRecorder: eventRecorder,
policyGroupVersion: options.policyGroupVersion,
dryRun: options.dryRun,
maxPodsToEvictPerNode: options.maxPodsToEvictPerNode,
maxPodsToEvictPerNamespace: options.maxPodsToEvictPerNamespace,
maxPodsToEvictTotal: options.maxPodsToEvictTotal,
metricsEnabled: options.metricsEnabled,
nodePodCount: make(nodePodEvictedCount),
namespacePodCount: make(namespacePodEvictCount),
featureGates: featureGates,
client: client,
eventRecorder: eventRecorder,
policyGroupVersion: options.policyGroupVersion,
dryRun: options.dryRun,
evictionFailureEventNotification: options.evictionFailureEventNotification,
maxPodsToEvictPerNode: options.maxPodsToEvictPerNode,
maxPodsToEvictPerNamespace: options.maxPodsToEvictPerNamespace,
maxPodsToEvictTotal: options.maxPodsToEvictTotal,
metricsEnabled: options.metricsEnabled,
nodePodCount: make(nodePodEvictedCount),
namespacePodCount: make(namespacePodEvictCount),
featureGates: featureGates,
}

if featureGates.Enabled(features.EvictionsInBackground) {
Expand Down Expand Up @@ -481,6 +483,9 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictTotal)
if pe.evictionFailureEventNotification {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: total eviction limit exceeded (%v)", pod.Spec.NodeName, *pe.maxPodsToEvictTotal)
}
return err
}

Expand All @@ -492,6 +497,9 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNode, "node", pod.Spec.NodeName)
if pe.evictionFailureEventNotification {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: node eviction limit exceeded (%v)", pod.Spec.NodeName, *pe.maxPodsToEvictPerNode)
}
return err
}
}
Expand All @@ -503,6 +511,9 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
}
span.AddEvent("Eviction Failed", trace.WithAttributes(attribute.String("node", pod.Spec.NodeName), attribute.String("err", err.Error())))
klog.ErrorS(err, "Error evicting pod", "limit", *pe.maxPodsToEvictPerNamespace, "namespace", pod.Namespace, "pod", klog.KObj(pod))
if pe.evictionFailureEventNotification {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: namespace eviction limit exceeded (%v)", pod.Spec.NodeName, *pe.maxPodsToEvictPerNamespace)
}
return err
}

Expand All @@ -514,6 +525,9 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
if pe.metricsEnabled {
metrics.PodsEvicted.With(map[string]string{"result": "error", "strategy": opts.StrategyName, "namespace": pod.Namespace, "node": pod.Spec.NodeName, "profile": opts.ProfileName}).Inc()
}
if pe.evictionFailureEventNotification {
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeWarning, "EvictionFailed", "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler failed: %v", pod.Spec.NodeName, err.Error())
}
return err
}

Expand Down Expand Up @@ -542,7 +556,7 @@ func (pe *PodEvictor) EvictPod(ctx context.Context, pod *v1.Pod, opts EvictOptio
reason = "NotSet"
}
}
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeNormal, reason, "Descheduled", "pod evicted from %v node by sigs.k8s.io/descheduler", pod.Spec.NodeName)
pe.eventRecorder.Eventf(pod, nil, v1.EventTypeNormal, reason, "Descheduled", "pod eviction from %v node by sigs.k8s.io/descheduler", pod.Spec.NodeName)
}
return nil
}
Expand Down
Loading

0 comments on commit 171db09

Please sign in to comment.