-
Notifications
You must be signed in to change notification settings - Fork 20
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduce multiple service group categories and maximumMaxReplicas for each of them #428
base: main
Are you sure you want to change the base?
Changes from all commits
d759134
a438211
e93ac1b
00134fe
c15336c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ import ( | |
"time" | ||
|
||
"gopkg.in/yaml.v3" | ||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
|
||
"github.com/mercari/tortoise/pkg/features" | ||
) | ||
|
@@ -261,11 +262,38 @@ type Config struct { | |
// IstioSidecarProxyDefaultMemory is the default Memory resource request of the istio sidecar proxy (default: 200Mi) | ||
IstioSidecarProxyDefaultMemory string `yaml:"IstioSidecarProxyDefaultMemory"` | ||
|
||
// ServiceGroups defines a list of service category names. | ||
ServiceGroups []ServiceGroup `yaml:"ServiceGroups"` | ||
// MaximumMaxReplicasPerService is the maximum maxReplicas that tortoise can give to the HPA per service category. | ||
// If the service category is not found in this list, tortoise uses the default value which is the value set in MaximumMaxReplicas. | ||
MaximumMaxReplicasPerService []MaximumMaxReplicasPerGroup `yaml:"MaximumMaxReplicasPerService"` | ||
|
||
// FeatureFlags is the list of feature flags (default: empty = all alpha features are disabled) | ||
// See the list of feature flags in features.go | ||
FeatureFlags []features.FeatureFlag `yaml:"FeatureFlags"` | ||
} | ||
|
||
type MaximumMaxReplicasPerGroup struct { | ||
// ServiceGroupName refers to one ServiceGroup at Config.ServiceGroups | ||
ServiceGroupName string `yaml:"ServiceGroupName"` | ||
|
||
MaximumMaxReplica int32 `yaml:"MaximumMaxReplica"` | ||
} | ||
|
||
// Selector selects a group of pods by matching its namespace and labels. | ||
type Selector struct { | ||
Namespace string `yaml:"Namespace"` // Namespace name | ||
LabelSelectors []*metav1.LabelSelector `yaml:"Labels,omitempty"` // Slice of label selectors within this namespace | ||
} | ||
|
||
// ServiceGroup represents a collection of services grouped together with namespace awareness. | ||
type ServiceGroup struct { | ||
// Name is the group's name (e.g., big-service, fintech-service, etc). | ||
Name string `yaml:"Name"` | ||
// Selectors represent multiple pod groups that belong to this ServiceGroup. | ||
Selectors []Selector `yaml:"Selectors"` // Slice of namespaces and labels within this service group | ||
} | ||
|
||
func defaultConfig() *Config { | ||
return &Config{ | ||
RangeOfMinMaxReplicasRecommendationHours: 1, | ||
|
@@ -294,6 +322,8 @@ func defaultConfig() *Config { | |
IstioSidecarProxyDefaultMemory: "200Mi", | ||
MinimumCPULimit: "0", | ||
ResourceLimitMultiplier: map[string]int64{}, | ||
ServiceGroups: []ServiceGroup{}, | ||
MaximumMaxReplicasPerService: []MaximumMaxReplicasPerGroup{}, | ||
BufferRatioOnVerticalResource: 0.1, | ||
} | ||
} | ||
|
@@ -339,10 +369,54 @@ func validate(config *Config) error { | |
if config.MinimumMinReplicas >= int(config.MaximumMinReplicas) { | ||
return fmt.Errorf("MinimumMinReplicas should be less than MaximumMinReplicas") | ||
} | ||
if config.MaximumMinReplicas > config.MaximumMaxReplicas { | ||
|
||
// Find the minimum value of MaximumMaxReplicas across all service groups | ||
minOfMaximumMaxReplicas := config.MaximumMaxReplicas // Start with the default value of MaximumMaxReplicas | ||
for _, group := range config.MaximumMaxReplicasPerService { | ||
if group.MaximumMaxReplica < minOfMaximumMaxReplicas { | ||
minOfMaximumMaxReplicas = group.MaximumMaxReplica | ||
} | ||
} | ||
|
||
// Check for non-negative values | ||
if minOfMaximumMaxReplicas < 0 { | ||
return fmt.Errorf("MaximumMaxReplicas should contain non-negative values") | ||
} | ||
|
||
// Ensure ServiceGroupNames in MaximumMaxReplicas match defined ServiceGroups | ||
serviceGroupMap := make(map[string]bool) | ||
for _, sg := range config.ServiceGroups { | ||
serviceGroupMap[sg.Name] = true | ||
} | ||
|
||
for _, maxReplicas := range config.MaximumMaxReplicasPerService { | ||
if maxReplicas.ServiceGroupName != "" { | ||
if _, exists := serviceGroupMap[maxReplicas.ServiceGroupName]; !exists { | ||
return fmt.Errorf("ServiceGroupName %s in MaximumMaxReplicas is not defined in ServiceGroups", maxReplicas.ServiceGroupName) | ||
} | ||
} | ||
} | ||
|
||
// Ensure no duplicates in ServiceGroups | ||
seenServiceGroups := make(map[string]bool) | ||
for _, sg := range config.ServiceGroups { | ||
if seenServiceGroups[sg.Name] { | ||
return fmt.Errorf("duplicate ServiceGroupName found: %s", sg.Name) | ||
} | ||
Comment on lines
+403
to
+405
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This duplicate check can be done in the above loop at L388. |
||
seenServiceGroups[sg.Name] = true | ||
} | ||
|
||
// Check all entries in MaximumMaxReplicasPerService have non-nil ServiceGroupName | ||
for _, maxReplicas := range config.MaximumMaxReplicasPerService { | ||
if maxReplicas.ServiceGroupName == "" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this empty check could be done in the above loop at 393. |
||
return fmt.Errorf("ServiceGroupName should not be nil in MaximumMaxReplicasPerService entries") | ||
} | ||
} | ||
|
||
if config.MaximumMinReplicas > minOfMaximumMaxReplicas { | ||
return fmt.Errorf("MaximumMinReplicas should be less than or equal to MaximumMaxReplicas") | ||
} | ||
if config.PreferredMaxReplicas >= int(config.MaximumMaxReplicas) { | ||
if config.PreferredMaxReplicas >= int(minOfMaximumMaxReplicas) { | ||
Comment on lines
+416
to
+419
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think |
||
return fmt.Errorf("PreferredMaxReplicas should be less than MaximumMaxReplicas") | ||
} | ||
if config.PreferredMaxReplicas <= config.MinimumMinReplicas { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you should add test cases for all validations that you added. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,7 @@ import ( | |
|
||
"github.com/mercari/tortoise/api/v1beta3" | ||
autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3" | ||
"github.com/mercari/tortoise/pkg/config" | ||
"github.com/mercari/tortoise/pkg/event" | ||
"github.com/mercari/tortoise/pkg/metrics" | ||
"github.com/mercari/tortoise/pkg/utils" | ||
|
@@ -41,6 +42,8 @@ type Service struct { | |
minimumMinReplicas int32 | ||
maximumMinReplica int32 | ||
maximumMaxReplica int32 | ||
serviceGroups []config.ServiceGroup | ||
maximumMaxReplicasPerService []config.MaximumMaxReplicasPerGroup | ||
externalMetricExclusionRegex *regexp.Regexp | ||
} | ||
|
||
|
@@ -53,6 +56,8 @@ func New( | |
tortoiseHPATargetUtilizationUpdateInterval time.Duration, | ||
maximumMinReplica, maximumMaxReplica int32, | ||
minimumMinReplicas int32, | ||
serviceGroups []config.ServiceGroup, | ||
maximumMaxReplicasPerService []config.MaximumMaxReplicasPerGroup, | ||
externalMetricExclusionRegex string, | ||
) (*Service, error) { | ||
var regex *regexp.Regexp | ||
|
@@ -75,6 +80,8 @@ func New( | |
maximumMinReplica: maximumMinReplica, | ||
minimumMinReplicas: minimumMinReplicas, | ||
maximumMaxReplica: maximumMaxReplica, | ||
serviceGroups: serviceGroups, | ||
maximumMaxReplicasPerService: maximumMaxReplicasPerService, | ||
externalMetricExclusionRegex: regex, | ||
}, nil | ||
} | ||
|
@@ -240,6 +247,22 @@ var globalRecommendedHPABehavior = &v2.HorizontalPodAutoscalerBehavior{ | |
}, | ||
} | ||
|
||
// Determine which service group is applicable for a given Tortoise using its selectors. | ||
func (c *Service) determineServiceGroup(tortoise *autoscalingv1beta3.Tortoise) string { | ||
tortoiseNamespace := tortoise.Namespace | ||
for _, serviceGroup := range c.serviceGroups { | ||
for _, selector := range serviceGroup.Selectors { | ||
if selector.Namespace == tortoiseNamespace { | ||
klog.InfoS("Namespace matched", "serviceGroup", serviceGroup.Name, "namespace", tortoiseNamespace) | ||
return serviceGroup.Name | ||
} | ||
// TODO(avs): Add logic for matching labels in the future. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not within this PR? If you meant not to implement this within this PR, you should remove LabelSelector support from the config for now. (and add the one when you actually implement logic |
||
} | ||
} | ||
// Returning an empty string to denote a default value when no match. | ||
return "" | ||
} | ||
|
||
func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, replicaNum int32, now time.Time) (*v2.HorizontalPodAutoscaler, *autoscalingv1beta3.Tortoise, error) { | ||
if !HasHorizontal(tortoise) { | ||
// no need to create HPA | ||
|
@@ -250,6 +273,10 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To | |
return nil, tortoise, nil | ||
} | ||
|
||
// Logic to determine the applicable service group and max replicas. | ||
groupName := c.determineServiceGroup(tortoise) | ||
maximumMaxReplicas := c.getMaximumMaxReplicasForGroup(groupName) | ||
|
||
hpa := &v2.HorizontalPodAutoscaler{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: autoscalingv1beta3.TortoiseDefaultHPAName(tortoise.Name), | ||
|
@@ -262,7 +289,7 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To | |
APIVersion: tortoise.Spec.TargetRefs.ScaleTargetRef.APIVersion, | ||
}, | ||
MinReplicas: ptr.To[int32](c.minimumMinReplicas), | ||
MaxReplicas: c.maximumMaxReplica, | ||
MaxReplicas: maximumMaxReplicas, | ||
Behavior: globalRecommendedHPABehavior, | ||
}, | ||
} | ||
|
@@ -275,6 +302,25 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To | |
return hpa.DeepCopy(), tortoise, err | ||
} | ||
|
||
// getMaximumMaxReplicasForGroup returns the maximum replicas for a specific service group. | ||
// If the groupName is empty or doesn't match any named group, the function returns the default maximum replicas defined by the MaximumMaxReplicas. | ||
func (c *Service) getMaximumMaxReplicasForGroup(groupName string) int32 { | ||
// Handle the case for the default group first | ||
if groupName == "" { | ||
return c.maximumMaxReplica | ||
} | ||
|
||
// Look for a specific service group match | ||
for _, group := range c.maximumMaxReplicasPerService { | ||
if group.ServiceGroupName != "" && group.ServiceGroupName == groupName { | ||
return group.MaximumMaxReplica | ||
} | ||
} | ||
|
||
// Fallback to the default maximum if no match is found | ||
return c.maximumMaxReplica | ||
} | ||
|
||
func (c *Service) GetHPAOnTortoiseSpec(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, error) { | ||
if tortoise.Spec.TargetRefs.HorizontalPodAutoscalerName == nil { | ||
return nil, nil | ||
|
@@ -405,9 +451,13 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet | |
recommendMax = *tortoise.Spec.MaxReplicas | ||
} | ||
|
||
if recommendMax > c.maximumMaxReplica { | ||
c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica)) | ||
recommendMax = c.maximumMaxReplica | ||
// Determine the service group and the maximum replicas for that group | ||
groupName := c.determineServiceGroup(tortoise) | ||
maximumMaxReplica := c.getMaximumMaxReplicasForGroup(groupName) | ||
|
||
if recommendMax > maximumMaxReplica { | ||
c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, maximumMaxReplica, maximumMaxReplica)) | ||
recommendMax = maximumMaxReplica | ||
} | ||
|
||
hpa.Spec.MaxReplicas = recommendMax | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.