Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce multiple service group categories and maximumMaxReplicas for each of them #428

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/autoscaling/v2/webhook_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ var _ = BeforeSuite(func() {
eventRecorder := mgr.GetEventRecorderFor("tortoise-controller")
tortoiseService, err := tortoise.New(mgr.GetClient(), eventRecorder, config.RangeOfMinMaxReplicasRecommendationHours, config.TimeZone, config.TortoiseUpdateInterval, config.GatheringDataPeriodType)
Expect(err).NotTo(HaveOccurred())
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, 1000, 10000, 3, "")
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, 1000, 10000, 3, config.ServiceGroups, config.MaximumMaxReplicasPerService, "")
Expect(err).NotTo(HaveOccurred())

hpaWebhook := New(tortoiseService, hpaService)
Expand Down
2 changes: 1 addition & 1 deletion cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func main() {
os.Exit(1)
}

hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.MaximumMinReplicas, config.MaximumMaxReplicas, int32(config.MinimumMinReplicas), config.HPAExternalMetricExclusionRegex)
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.MaximumMinReplicas, config.MaximumMaxReplicas, int32(config.MinimumMinReplicas), config.ServiceGroups, config.MaximumMaxReplicasPerService, config.HPAExternalMetricExclusionRegex)
if err != nil {
setupLog.Error(err, "unable to start hpa service")
os.Exit(1)
Expand Down
4 changes: 3 additions & 1 deletion internal/controller/tortoise_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"sigs.k8s.io/yaml"

"github.com/mercari/tortoise/api/v1beta3"
config_file "github.com/mercari/tortoise/pkg/config"
"github.com/mercari/tortoise/pkg/deployment"
"github.com/mercari/tortoise/pkg/features"
"github.com/mercari/tortoise/pkg/hpa"
Expand Down Expand Up @@ -259,7 +260,8 @@ func startController(ctx context.Context) func() {
Expect(err).ShouldNot(HaveOccurred())
cli, err := vpa.New(mgr.GetConfig(), recorder)
Expect(err).ShouldNot(HaveOccurred())
hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, 1000, 10000, 3, ".*-exclude-metric")

hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, 1000, 10000, 3, []config_file.ServiceGroup{}, []config_file.MaximumMaxReplicasPerGroup{}, ".*-exclude-metric")
Expect(err).ShouldNot(HaveOccurred())
reconciler := &TortoiseReconciler{
Scheme: scheme,
Expand Down
78 changes: 76 additions & 2 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"time"

"gopkg.in/yaml.v3"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/mercari/tortoise/pkg/features"
)
Expand Down Expand Up @@ -261,11 +262,38 @@ type Config struct {
// IstioSidecarProxyDefaultMemory is the default Memory resource request of the istio sidecar proxy (default: 200Mi)
IstioSidecarProxyDefaultMemory string `yaml:"IstioSidecarProxyDefaultMemory"`

// ServiceGroups defines a list of service category names.
ServiceGroups []ServiceGroup `yaml:"ServiceGroups"`
// MaximumMaxReplicasPerService is the maximum maxReplicas that tortoise can give to the HPA per service category.
// If the service category is not found in this list, tortoise uses the default value which is the value set in MaximumMaxReplicas.
MaximumMaxReplicasPerService []MaximumMaxReplicasPerGroup `yaml:"MaximumMaxReplicasPerService"`

// FeatureFlags is the list of feature flags (default: empty = all alpha features are disabled)
// See the list of feature flags in features.go
FeatureFlags []features.FeatureFlag `yaml:"FeatureFlags"`
}

type MaximumMaxReplicasPerGroup struct {
// ServiceGroupName refers to one ServiceGroup at Config.ServiceGroups
ServiceGroupName string `yaml:"ServiceGroupName"`

MaximumMaxReplica int32 `yaml:"MaximumMaxReplica"`
}

// Selector selects a group of pods by matching its namespace and labels.
type Selector struct {
Namespace string `yaml:"Namespace"` // Namespace name
LabelSelectors []*metav1.LabelSelector `yaml:"Labels,omitempty"` // Slice of label selectors within this namespace
}

// ServiceGroup represents a collection of services grouped together with namespace awareness.
type ServiceGroup struct {
// Name is the group's name (e.g., big-service, fintech-service, etc).
Name string `yaml:"Name"`
// Selectors represent multiple pod groups that belong to this ServiceGroup.
Selectors []Selector `yaml:"Selectors"` // Slice of namespaces and labels within this service group
}

func defaultConfig() *Config {
return &Config{
RangeOfMinMaxReplicasRecommendationHours: 1,
Expand Down Expand Up @@ -294,6 +322,8 @@ func defaultConfig() *Config {
IstioSidecarProxyDefaultMemory: "200Mi",
MinimumCPULimit: "0",
ResourceLimitMultiplier: map[string]int64{},
ServiceGroups: []ServiceGroup{},
MaximumMaxReplicasPerService: []MaximumMaxReplicasPerGroup{},
BufferRatioOnVerticalResource: 0.1,
}
}
Expand Down Expand Up @@ -339,10 +369,54 @@ func validate(config *Config) error {
if config.MinimumMinReplicas >= int(config.MaximumMinReplicas) {
return fmt.Errorf("MinimumMinReplicas should be less than MaximumMinReplicas")
}
if config.MaximumMinReplicas > config.MaximumMaxReplicas {

// Find the minimum value of MaximumMaxReplicas across all service groups
minOfMaximumMaxReplicas := config.MaximumMaxReplicas // Start with the default value of MaximumMaxReplicas
for _, group := range config.MaximumMaxReplicasPerService {
if group.MaximumMaxReplica < minOfMaximumMaxReplicas {
minOfMaximumMaxReplicas = group.MaximumMaxReplica
}
}

// Check for non-negative values
if minOfMaximumMaxReplicas < 0 {
return fmt.Errorf("MaximumMaxReplicas should contain non-negative values")
}

// Ensure ServiceGroupNames in MaximumMaxReplicas match defined ServiceGroups
serviceGroupMap := make(map[string]bool)
for _, sg := range config.ServiceGroups {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for _, sg := range config.ServiceGroups {
for _, sg := range config.ServiceGroups {
if sg.Name == "" {
return fmt.Errorf("Name of the service group should not be empty")
}

serviceGroupMap[sg.Name] = true
}

for _, maxReplicas := range config.MaximumMaxReplicasPerService {
if maxReplicas.ServiceGroupName != "" {
if _, exists := serviceGroupMap[maxReplicas.ServiceGroupName]; !exists {
return fmt.Errorf("ServiceGroupName %s in MaximumMaxReplicas is not defined in ServiceGroups", maxReplicas.ServiceGroupName)
}
}
}

// Ensure no duplicates in ServiceGroups
seenServiceGroups := make(map[string]bool)
for _, sg := range config.ServiceGroups {
if seenServiceGroups[sg.Name] {
return fmt.Errorf("duplicate ServiceGroupName found: %s", sg.Name)
}
Comment on lines +403 to +405
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This duplicate check can be done in the above loop at L388.

seenServiceGroups[sg.Name] = true
}

// Check all entries in MaximumMaxReplicasPerService have non-nil ServiceGroupName
for _, maxReplicas := range config.MaximumMaxReplicasPerService {
if maxReplicas.ServiceGroupName == "" {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this empty check could be done in the above loop at 393.

return fmt.Errorf("ServiceGroupName should not be nil in MaximumMaxReplicasPerService entries")
}
}

if config.MaximumMinReplicas > minOfMaximumMaxReplicas {
return fmt.Errorf("MaximumMinReplicas should be less than or equal to MaximumMaxReplicas")
}
if config.PreferredMaxReplicas >= int(config.MaximumMaxReplicas) {
if config.PreferredMaxReplicas >= int(minOfMaximumMaxReplicas) {
Comment on lines +416 to +419
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think config.MaximumMaxReplicas and minOfMaximumMaxReplicas should be validated with PreferredMaxReplicas and MaximumMinReplicas separately for better error messages.

return fmt.Errorf("PreferredMaxReplicas should be less than MaximumMaxReplicas")
}
if config.PreferredMaxReplicas <= config.MinimumMinReplicas {
Expand Down
8 changes: 7 additions & 1 deletion pkg/config/config_test.go
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you should add test cases for all validations that you added.

Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ func TestParseConfig(t *testing.T) {
IstioSidecarProxyDefaultCPU: "100m",
IstioSidecarProxyDefaultMemory: "200Mi",
MaxAllowedScalingDownRatio: 0.5,
ServiceGroups: []ServiceGroup{},
MaximumMaxReplicasPerService: []MaximumMaxReplicasPerGroup{},
MinimumCPURequestPerContainer: map[string]string{
"istio-proxy": "100m",
"hoge-agent": "120m",
Expand Down Expand Up @@ -92,6 +94,8 @@ func TestParseConfig(t *testing.T) {
MinimumCPURequestPerContainer: map[string]string{},
MinimumMemoryRequestPerContainer: map[string]string{},
ResourceLimitMultiplier: map[string]int64{},
ServiceGroups: []ServiceGroup{},
MaximumMaxReplicasPerService: []MaximumMaxReplicasPerGroup{},
BufferRatioOnVerticalResource: 0.1,
},
},
Expand Down Expand Up @@ -134,6 +138,8 @@ func TestParseConfig(t *testing.T) {
MinimumCPURequestPerContainer: map[string]string{},
MinimumMemoryRequestPerContainer: map[string]string{},
ResourceLimitMultiplier: map[string]int64{},
ServiceGroups: []ServiceGroup{},
MaximumMaxReplicasPerService: []MaximumMaxReplicasPerGroup{},
BufferRatioOnVerticalResource: 0.1,
},
},
Expand Down Expand Up @@ -223,7 +229,7 @@ func Test_validate(t *testing.T) {
wantErr: true,
},
{
name: "invalid PreferredMaxReplicas",
name: "invalid PreferredMaxReplicas less than MinimumMinReplicas",
config: &Config{
RangeOfMinMaxReplicasRecommendationHours: 2,
GatheringDataPeriodType: "daily",
Expand Down
58 changes: 54 additions & 4 deletions pkg/hpa/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/mercari/tortoise/api/v1beta3"
autoscalingv1beta3 "github.com/mercari/tortoise/api/v1beta3"
"github.com/mercari/tortoise/pkg/config"
"github.com/mercari/tortoise/pkg/event"
"github.com/mercari/tortoise/pkg/metrics"
"github.com/mercari/tortoise/pkg/utils"
Expand All @@ -41,6 +42,8 @@ type Service struct {
minimumMinReplicas int32
maximumMinReplica int32
maximumMaxReplica int32
serviceGroups []config.ServiceGroup
maximumMaxReplicasPerService []config.MaximumMaxReplicasPerGroup
externalMetricExclusionRegex *regexp.Regexp
}

Expand All @@ -53,6 +56,8 @@ func New(
tortoiseHPATargetUtilizationUpdateInterval time.Duration,
maximumMinReplica, maximumMaxReplica int32,
minimumMinReplicas int32,
serviceGroups []config.ServiceGroup,
maximumMaxReplicasPerService []config.MaximumMaxReplicasPerGroup,
externalMetricExclusionRegex string,
) (*Service, error) {
var regex *regexp.Regexp
Expand All @@ -75,6 +80,8 @@ func New(
maximumMinReplica: maximumMinReplica,
minimumMinReplicas: minimumMinReplicas,
maximumMaxReplica: maximumMaxReplica,
serviceGroups: serviceGroups,
maximumMaxReplicasPerService: maximumMaxReplicasPerService,
externalMetricExclusionRegex: regex,
}, nil
}
Expand Down Expand Up @@ -240,6 +247,22 @@ var globalRecommendedHPABehavior = &v2.HorizontalPodAutoscalerBehavior{
},
}

// Determine which service group is applicable for a given Tortoise using its selectors.
func (c *Service) determineServiceGroup(tortoise *autoscalingv1beta3.Tortoise) string {
tortoiseNamespace := tortoise.Namespace
for _, serviceGroup := range c.serviceGroups {
for _, selector := range serviceGroup.Selectors {
if selector.Namespace == tortoiseNamespace {
klog.InfoS("Namespace matched", "serviceGroup", serviceGroup.Name, "namespace", tortoiseNamespace)
return serviceGroup.Name
}
// TODO(avs): Add logic for matching labels in the future.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not within this PR? If you meant not to implement this within this PR, you should remove LabelSelector support from the config for now. (and add the one when you actually implement logic

}
}
// Returning an empty string to denote a default value when no match.
return ""
}

func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise, replicaNum int32, now time.Time) (*v2.HorizontalPodAutoscaler, *autoscalingv1beta3.Tortoise, error) {
if !HasHorizontal(tortoise) {
// no need to create HPA
Expand All @@ -250,6 +273,10 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To
return nil, tortoise, nil
}

// Logic to determine the applicable service group and max replicas.
groupName := c.determineServiceGroup(tortoise)
maximumMaxReplicas := c.getMaximumMaxReplicasForGroup(groupName)

hpa := &v2.HorizontalPodAutoscaler{
ObjectMeta: metav1.ObjectMeta{
Name: autoscalingv1beta3.TortoiseDefaultHPAName(tortoise.Name),
Expand All @@ -262,7 +289,7 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To
APIVersion: tortoise.Spec.TargetRefs.ScaleTargetRef.APIVersion,
},
MinReplicas: ptr.To[int32](c.minimumMinReplicas),
MaxReplicas: c.maximumMaxReplica,
MaxReplicas: maximumMaxReplicas,
Behavior: globalRecommendedHPABehavior,
},
}
Expand All @@ -275,6 +302,25 @@ func (c *Service) CreateHPA(ctx context.Context, tortoise *autoscalingv1beta3.To
return hpa.DeepCopy(), tortoise, err
}

// getMaximumMaxReplicasForGroup returns the maximum replicas for a specific service group.
// If the groupName is empty or doesn't match any named group, the function returns the default maximum replicas defined by the MaximumMaxReplicas.
func (c *Service) getMaximumMaxReplicasForGroup(groupName string) int32 {
// Handle the case for the default group first
if groupName == "" {
return c.maximumMaxReplica
}

// Look for a specific service group match
for _, group := range c.maximumMaxReplicasPerService {
if group.ServiceGroupName != "" && group.ServiceGroupName == groupName {
return group.MaximumMaxReplica
}
}

// Fallback to the default maximum if no match is found
return c.maximumMaxReplica
}

func (c *Service) GetHPAOnTortoiseSpec(ctx context.Context, tortoise *autoscalingv1beta3.Tortoise) (*v2.HorizontalPodAutoscaler, error) {
if tortoise.Spec.TargetRefs.HorizontalPodAutoscalerName == nil {
return nil, nil
Expand Down Expand Up @@ -405,9 +451,13 @@ func (c *Service) ChangeHPAFromTortoiseRecommendation(tortoise *autoscalingv1bet
recommendMax = *tortoise.Spec.MaxReplicas
}

if recommendMax > c.maximumMaxReplica {
c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, c.maximumMaxReplica, c.maximumMaxReplica))
recommendMax = c.maximumMaxReplica
// Determine the service group and the maximum replicas for that group
groupName := c.determineServiceGroup(tortoise)
maximumMaxReplica := c.getMaximumMaxReplicasForGroup(groupName)

if recommendMax > maximumMaxReplica {
c.recorder.Event(tortoise, corev1.EventTypeWarning, event.WarningHittingHardMaxReplicaLimit, fmt.Sprintf("MaxReplica (%v) suggested from Tortoise (%s/%s) hits a cluster-wide maximum replica number (%v). It wouldn't be a problem until the replica number actually grows to %v though, you may want to reach out to your cluster admin.", recommendMax, tortoise.Namespace, tortoise.Name, maximumMaxReplica, maximumMaxReplica))
recommendMax = maximumMaxReplica
}

hpa.Spec.MaxReplicas = recommendMax
Expand Down
Loading
Loading