Skip to content

Commit

Permalink
🐛 fix work agent performance issue
Browse files Browse the repository at this point in the history
Signed-off-by: Yang Le <[email protected]>
  • Loading branch information
elgnay committed Jan 6, 2025
1 parent e1b5f88 commit f5a77b3
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ import (
)

var (
ResyncInterval = 5 * time.Minute
MaxRequeueDuration = 24 * time.Hour
// ResyncInterval defines the maximum interval for resyncing a ManifestWork. It is used to:
// 1) Set the `ResyncEvery` for the `ManifestWorkAgent` controller;
// 2) Requeue a ManifestWork after it has been successfully reconciled.
ResyncInterval = 5 * time.Minute
)

type workReconcile interface {
Expand Down Expand Up @@ -138,7 +140,7 @@ func (m *ManifestWorkController) sync(ctx context.Context, controllerContext fac
}
newAppliedManifestWork := appliedManifestWork.DeepCopy()

var requeueTime = MaxRequeueDuration
var requeueTime = ResyncInterval
var errs []error
for _, reconciler := range m.reconcilers {
manifestWork, newAppliedManifestWork, err = reconciler.reconcile(
Expand Down Expand Up @@ -173,7 +175,7 @@ func (m *ManifestWorkController) sync(ctx context.Context, controllerContext fac
// we do not need to requeue when manifestwork/appliedmanifestwork are updated, since a following
// reconcile will be executed with update event, and the requeue can be set in this following reconcile
// if needed.
if !mwUpdated && !amwUpdated && requeueTime < MaxRequeueDuration {
if !mwUpdated && !amwUpdated {
controllerContext.Queue().AddAfter(manifestWorkName, requeueTime)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func (m *manifestworkReconciler) reconcile(
}

var newManifestConditions []workapiv1.ManifestCondition
var requeueTime = MaxRequeueDuration
var requeueTime = ResyncInterval
for _, result := range resourceResults {
manifestCondition := workapiv1.ManifestCondition{
ResourceMeta: result.resourceMeta,
Expand Down Expand Up @@ -119,7 +119,7 @@ func (m *manifestworkReconciler) reconcile(

if len(errs) > 0 {
err = utilerrors.NewAggregate(errs)
} else if requeueTime != MaxRequeueDuration {
} else if requeueTime != ResyncInterval {
err = commonhelper.NewRequeueError(
fmt.Sprintf("requeu work %s due to authorization err", manifestWork.Name),
requeueTime,
Expand Down
13 changes: 10 additions & 3 deletions pkg/work/spoke/spokeagent.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ const (
appliedManifestWorkFinalizeControllerWorkers = 10
manifestWorkFinalizeControllerWorkers = 10
availableStatusControllerWorkers = 10
manifestWorkAgentWorkers = 10
)

type WorkAgentConfig struct {
Expand Down Expand Up @@ -87,7 +88,11 @@ func (o *WorkAgentConfig) RunWorkloadAgent(ctx context.Context, controllerContex
if err != nil {
return err
}
spokeWorkInformerFactory := workinformers.NewSharedInformerFactory(spokeWorkClient, 5*time.Minute)

// Resyncing at a small interval may cause performance issues when the number of AppliedManifestWorks is large.
// The ManifestWork informer resyncs every 24 hours, so the AppliedManifestWork informer uses a different
// interval (21 hours) to keep the two informers from resyncing at the same time; with these periods the
// resyncs can line up at most once every 168 hours.
spokeWorkInformerFactory := workinformers.NewSharedInformerFactory(spokeWorkClient, 21*time.Hour)

httpClient, err := rest.HTTPClientFor(spokeRestConfig)
if err != nil {
Expand Down Expand Up @@ -180,7 +185,7 @@ func (o *WorkAgentConfig) RunWorkloadAgent(ctx context.Context, controllerContex
go addFinalizerController.Run(ctx, 1)
go appliedManifestWorkFinalizeController.Run(ctx, appliedManifestWorkFinalizeControllerWorkers)
go unmanagedAppliedManifestWorkController.Run(ctx, 1)
go manifestWorkController.Run(ctx, 1)
go manifestWorkController.Run(ctx, manifestWorkAgentWorkers)
go manifestWorkFinalizeController.Run(ctx, manifestWorkFinalizeControllerWorkers)
go availableStatusController.Run(ctx, availableStatusControllerWorkers)

Expand Down Expand Up @@ -252,7 +257,9 @@ func (o *WorkAgentConfig) newWorkClientAndInformer(

factory := workinformers.NewSharedInformerFactoryWithOptions(
workClient,
5*time.Minute,
// resyncing at a small interval may cause performance issues when the number of ManifestWorks
// is large.
24*time.Hour,
workinformers.WithNamespace(o.agentOptions.SpokeClusterName),
)
informer := factory.Work().V1().ManifestWorks()
Expand Down

0 comments on commit f5a77b3

Please sign in to comment.