Skip to content

Commit

Permalink
Introduce multiple service group categories and maximumMaxReplicas fo…
Browse files Browse the repository at this point in the history
…r each of them
  • Loading branch information
AVSBharadwaj committed Jan 9, 2025
1 parent 310df59 commit 01be401
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 57 deletions.
2 changes: 1 addition & 1 deletion api/autoscaling/v2/webhook_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ var _ = BeforeSuite(func() {
eventRecorder := mgr.GetEventRecorderFor("tortoise-controller")
tortoiseService, err := tortoise.New(mgr.GetClient(), eventRecorder, config.RangeOfMinMaxReplicasRecommendationHours, config.TimeZone, config.TortoiseUpdateInterval, config.GatheringDataPeriodType)
Expect(err).NotTo(HaveOccurred())
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, 1000, 10000, 3, "")
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, 100, time.Hour, 1000, 3, "", config)
Expect(err).NotTo(HaveOccurred())

hpaWebhook := New(tortoiseService, hpaService)
Expand Down
4 changes: 2 additions & 2 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func main() {
os.Exit(1)
}

hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.MaximumMinReplicas, config.MaximumMaxReplicas, int32(config.MinimumMinReplicas), config.HPAExternalMetricExclusionRegex)
hpaService, err := hpa.New(mgr.GetClient(), eventRecorder, config.ReplicaReductionFactor, config.MaximumTargetResourceUtilization, config.HPATargetUtilizationMaxIncrease, config.HPATargetUtilizationUpdateInterval, config.MaximumMinReplicas, int32(config.MinimumMinReplicas), config.HPAExternalMetricExclusionRegex, config)
if err != nil {
setupLog.Error(err, "unable to start hpa service")
os.Exit(1)
Expand All @@ -176,7 +176,7 @@ func main() {
config.MinimumMemoryRequestPerContainer,
config.MaximumCPURequest,
config.MaximumMemoryRequest,
config.MaximumMaxReplicas,
config.GetDefaultMaximumMaxReplica(),
config.MaxAllowedScalingDownRatio,
config.BufferRatioOnVerticalResource,
config.FeatureFlags,
Expand Down
15 changes: 14 additions & 1 deletion internal/controller/tortoise_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"sigs.k8s.io/yaml"

"github.com/mercari/tortoise/api/v1beta3"
configfile "github.com/mercari/tortoise/pkg/config"
"github.com/mercari/tortoise/pkg/deployment"
"github.com/mercari/tortoise/pkg/features"
"github.com/mercari/tortoise/pkg/hpa"
Expand Down Expand Up @@ -249,7 +250,19 @@ func startController(ctx context.Context) func() {
Expect(err).ShouldNot(HaveOccurred())
cli, err := vpa.New(mgr.GetConfig(), recorder)
Expect(err).ShouldNot(HaveOccurred())
hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, 1000, 10000, 3, ".*-exclude-metric")

// Define a dummy config with maximumMaxReplica set to 10000 for the default group
defaultGroupName := "default"
dummyConfig := &configfile.Config{
MaximumMaxReplicas: []configfile.MaximumMaxReplicasPerGroup{
{
ServiceGroupName: &defaultGroupName,
MaximumMaxReplica: 10000, // Set the value you need
},
},
// Add other default values if your function logic depends on them
}
hpaS, err := hpa.New(mgr.GetClient(), recorder, 0.95, 90, 25, time.Hour, 1000, 3, ".*-exclude-metric", dummyConfig)
Expect(err).ShouldNot(HaveOccurred())
reconciler := &TortoiseReconciler{
Scheme: scheme,
Expand Down
122 changes: 113 additions & 9 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package config

import (
"fmt"
"math"
"os"
"time"

"gopkg.in/yaml.v3"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/mercari/tortoise/pkg/features"
)
Expand Down Expand Up @@ -166,11 +168,6 @@ type Config struct {
// a tortoise will ignore `PreferredMaxReplicas`, and increase the number of replicas.
// This feature is controlled by the feature flag `VerticalScalingBasedOnPreferredMaxReplicas`.
PreferredMaxReplicas int `yaml:"PreferredMaxReplicas"`
// MaximumMaxReplicas is the maximum maxReplica that tortoise can give to the HPA (default: 100)
// Note that this is very dangerous. If you set this value too low, the HPA may not be able to scale up the workload.
// The motivation is to use it has a hard limit to prevent the HPA from scaling up the workload too much in cases of Tortoise's bug, abnormal traffic increase, etc.
// If some Tortoise hits this limit, the tortoise controller emits an error log, which may or may not imply you have to change this value.
MaximumMaxReplicas int32 `yaml:"MaximumMaxReplicas"`
// MaximumCPURequest is the maximum CPU cores that the tortoise can give to the container resource request (default: 10)
MaximumCPURequest string `yaml:"MaximumCPURequest"`
// MaximumMemoryRequest is the maximum memory bytes that the tortoise can give to the container resource request (default: 10Gi)
Expand Down Expand Up @@ -261,12 +258,43 @@ type Config struct {
// IstioSidecarProxyDefaultMemory is the default Memory resource request of the istio sidecar proxy (default: 200Mi)
IstioSidecarProxyDefaultMemory string `yaml:"IstioSidecarProxyDefaultMemory"`

// serviceGroups defines a list of service category names.
ServiceGroups []ServiceGroup `yaml:"ServiceGroups"`
// MaximumMaxReplicas is the maximum maxReplicas that tortoise can give to the HPA per group (default: 100)
// Note that this is very dangerous. If you set this value too low, the HPA may not be able to scale up the workload.
// The motivation is to use it has a hard limit to prevent the HPA from scaling up the workload too much in cases of Tortoise's bug, abnormal traffic increase, etc.
// If some Tortoise hits this limit, the tortoise controller emits an error log, which may or may not imply you have to change this value.
MaximumMaxReplicas []MaximumMaxReplicasPerGroup `yaml:"MaximumMaxReplicas"`

// FeatureFlags is the list of feature flags (default: empty = all alpha features are disabled)
// See the list of feature flags in features.go
FeatureFlags []features.FeatureFlag `yaml:"FeatureFlags"`
}

type MaximumMaxReplicasPerGroup struct {
// ServiceGroupName refers to one ServiceGroup at Config.ServiceGroups
// If nil, this MaximumMaxReplica would apply to all services.
ServiceGroupName *string `yaml:"ServiceGroupName"`

MaximumMaxReplica int32 `yaml:"MaximumMaxReplica"`
}

// Namespace represents a Kubernetes namespace and its associated label selectors.
type Namespace struct {
Name string `yaml:"name"` // Namespace name
LabelSelectors []*metav1.LabelSelector `yaml:"labelSelectors"` // Slice of label selectors within this namespace
}

// ServiceGroup represents a collection of services grouped together with namespace awareness.
type ServiceGroup struct {
// Name is the group's name (e.g., big-service, fintech-service, etc).
Name string `yaml:"name"`
// Namespaces represent multiple namespaces with their label selectors.
Namespaces []Namespace `yaml:"namespaces"` // A slice of Namespace structs
}

func defaultConfig() *Config {
defaultGroupName := "default"
return &Config{
RangeOfMinMaxReplicasRecommendationHours: 1,
GatheringDataPeriodType: "weekly",
Expand All @@ -288,13 +316,22 @@ func defaultConfig() *Config {
HPATargetUtilizationMaxIncrease: 5,
HPATargetUtilizationUpdateInterval: time.Hour * 24,
MaximumMinReplicas: 10,
MaximumMaxReplicas: 100,
MaxAllowedScalingDownRatio: 0.8,
IstioSidecarProxyDefaultCPU: "100m",
IstioSidecarProxyDefaultMemory: "200Mi",
MinimumCPULimit: "0",
ResourceLimitMultiplier: map[string]int64{},
BufferRatioOnVerticalResource: 0.1,
ServiceGroups: []ServiceGroup{
// This is an empty slice, indicating that no service groups are defined by default.
},
MaximumMaxReplicas: []MaximumMaxReplicasPerGroup{
// This is the default maximum maxReplicas limit for all services.
{
ServiceGroupName: &defaultGroupName, // Applies to all services by default.
MaximumMaxReplica: 100, // Default max replica limit.
},
},
BufferRatioOnVerticalResource: 0.1,
}
}

Expand Down Expand Up @@ -322,6 +359,16 @@ func ParseConfig(path string) (*Config, error) {
return config, nil
}

// GetDefaultMaxReplica returns the default maximum max replicas from the configuration.
func (cfg *Config) GetDefaultMaximumMaxReplica() int32 {
for _, maxReplicaGroup := range cfg.MaximumMaxReplicas {
if maxReplicaGroup.ServiceGroupName != nil && *maxReplicaGroup.ServiceGroupName == "default" {
return maxReplicaGroup.MaximumMaxReplica
}
}
return 100 // Choose a last resort default value if "default" is not found
}

func validate(config *Config) error {
if config.RangeOfMinMaxReplicasRecommendationHours > 24 || config.RangeOfMinMaxReplicasRecommendationHours < 1 {
return fmt.Errorf("RangeOfMinMaxReplicasRecommendationHours should be between 1 and 24")
Expand All @@ -339,10 +386,67 @@ func validate(config *Config) error {
if config.MinimumMinReplicas >= int(config.MaximumMinReplicas) {
return fmt.Errorf("MinimumMinReplicas should be less than MaximumMinReplicas")
}
if config.MaximumMinReplicas > config.MaximumMaxReplicas {

// Check that there is at least one MaximumMaxReplicas
if len(config.MaximumMaxReplicas) == 0 {
return fmt.Errorf("MaximumMaxReplicas must have at least one configuration entry")
}

// Find the minimum value of MaximumMaxReplicas across all service groups
minOfMaximumMaxReplicas := int32(math.MaxInt32) // Start with the largest possible int32 value
var defaultFound bool
for _, group := range config.MaximumMaxReplicas {
if group.MaximumMaxReplica < minOfMaximumMaxReplicas {
minOfMaximumMaxReplicas = group.MaximumMaxReplica
}
// Check for "default" entry
if group.ServiceGroupName != nil && *group.ServiceGroupName == "default" {
defaultFound = true
}
}

// Ensure that there is an entry with "default" for MaximumMaxReplicas
if !defaultFound {
return fmt.Errorf("There must be at least one MaximumMaxReplicas entry with ServiceGroupName set to \"default\"")
}

// Check for non-negative values
if minOfMaximumMaxReplicas < 0 {
return fmt.Errorf("MaximumMaxReplicas should contain non-negative values")
}

// Ensure ServiceGroupNames in MaximumMaxReplicas match defined ServiceGroups
serviceGroupMap := make(map[string]bool)
for _, sg := range config.ServiceGroups {
serviceGroupMap[sg.Name] = true
}

for _, maxReplicas := range config.MaximumMaxReplicas {
if maxReplicas.ServiceGroupName != nil {
// Allow "default" to bypass the check of matching ServiceGroups.
if *maxReplicas.ServiceGroupName == "default" {
continue
}
// If not default, ensure it exists in the serviceGroupMap.
if _, exists := serviceGroupMap[*maxReplicas.ServiceGroupName]; !exists {
return fmt.Errorf("ServiceGroupName %s in MaximumMaxReplicas is not defined in ServiceGroups", *maxReplicas.ServiceGroupName)
}
}
}

// Ensure no duplicates in ServiceGroups
seenServiceGroups := make(map[string]bool)
for _, sg := range config.ServiceGroups {
if seenServiceGroups[sg.Name] {
return fmt.Errorf("Duplicate ServiceGroupName found: %s", sg.Name)
}
seenServiceGroups[sg.Name] = true
}

if config.MaximumMinReplicas > minOfMaximumMaxReplicas {
return fmt.Errorf("MaximumMinReplicas should be less than or equal to MaximumMaxReplicas")
}
if config.PreferredMaxReplicas >= int(config.MaximumMaxReplicas) {
if config.PreferredMaxReplicas >= int(minOfMaximumMaxReplicas) {
return fmt.Errorf("PreferredMaxReplicas should be less than MaximumMaxReplicas")
}
if config.PreferredMaxReplicas <= config.MinimumMinReplicas {
Expand Down
Loading

0 comments on commit 01be401

Please sign in to comment.