Skip to content

Commit

Permalink
[chaosplt-161] fix bug on cloud disruptions adding too much hosts (#805)
Browse files Browse the repository at this point in the history
* fix bug on cloud disruptions adding too much hosts

* receiver name change

* move to disruption controller cloud host computing

* remove useless var

* remove wrong tests in chaos pod service + update disruption controller tests

* remove human mistake :oops:

* remove human mistake :oops:

* fix tests

* Update api/v1beta1/network_disruption.go

Co-authored-by: Philip Thompson <[email protected]>

* Update controllers/disruption_controller.go

Co-authored-by: Philip Thompson <[email protected]>

* add another test

* complexify test

* Update api/v1beta1/network_disruption.go

Co-authored-by: Philip Thompson <[email protected]>

---------

Co-authored-by: Philip Thompson <[email protected]>
  • Loading branch information
clairecng and ptnapoleon authored Jan 9, 2024
1 parent defb431 commit b81a033
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 131 deletions.
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Once you have installed the above requirements, run the `make lima-all` command
- `make lima-start` to create the lima vm with containerd and Kubernetes (backed by k3s)
- `make lima-kubectx` to add the lima Kubernetes cluster config to your local configs and switch to the lima context
- `make lima-install-cert-manager` to install cert-manager
- `make lima-build` to build the chaos-controller images
- `make lima-push-all` to build and push the chaos-controller images
- `make lima-install` to render and apply the chaos-controller helm chart

Once the instance is started, you can log into it using either the `lima` or its longer form `limactl shell <$LIMA_INSTANCE>` commands.
Expand Down
24 changes: 16 additions & 8 deletions api/v1beta1/network_disruption.go
Original file line number Diff line number Diff line change
Expand Up @@ -708,11 +708,17 @@ func (s NetworkDisruptionServiceSpec) ExtractAffectedPortsInServicePorts(k8sServ
return goodPorts, notFoundPorts
}

// TransformCloudSpecToHostsSpec from a cloud spec disruption, get all ip ranges of services provided and transform them into a list of hosts spec
func TransformCloudSpecToHostsSpec(cloudManager cloudservice.CloudServicesProvidersManager, cloudSpec *NetworkDisruptionCloudSpec) ([]NetworkDisruptionHostSpec, error) {
var hosts []NetworkDisruptionHostSpec
// UpdateHostsOnCloudDisruption from a cloud spec disruption, get all ip ranges of services provided and appends them into the s.Hosts slice
func (s *NetworkDisruptionSpec) UpdateHostsOnCloudDisruption(cloudManager cloudservice.CloudServicesProvidersManager) error {
if s == nil || s.Cloud == nil {
return nil
}

if s.Hosts == nil {
s.Hosts = []NetworkDisruptionHostSpec{}
}

clouds := cloudSpec.TransformToCloudMap()
clouds := s.Cloud.TransformToCloudMap()

for cloudName, serviceList := range clouds {
var serviceListNames []string
Expand All @@ -723,20 +729,22 @@ func TransformCloudSpecToHostsSpec(cloudManager cloudservice.CloudServicesProvid

ipRangesPerService, err := cloudManager.GetServicesIPRanges(types.CloudProviderName(cloudName), serviceListNames)
if err != nil {
return nil, err
return err
}

for _, serviceSpec := range serviceList {
for _, ipRange := range ipRangesPerService[serviceSpec.ServiceName] {
hosts = append(hosts, NetworkDisruptionHostSpec{
host := NetworkDisruptionHostSpec{
Host: ipRange,
Protocol: serviceSpec.Protocol,
Flow: serviceSpec.Flow,
ConnState: serviceSpec.ConnState,
})
}

s.Hosts = append(s.Hosts, host)
}
}
}

return hosts, nil
return nil
}
10 changes: 10 additions & 0 deletions controllers/disruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"time"

chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
"github.com/DataDog/chaos-controller/cloudservice"
"github.com/DataDog/chaos-controller/o11y/metrics"
"github.com/DataDog/chaos-controller/o11y/tracer"
"github.com/DataDog/chaos-controller/safemode"
Expand Down Expand Up @@ -70,6 +71,7 @@ type DisruptionReconciler struct {
CacheContextStore map[string]CtxTuple
DisruptionsWatchersManager watchers.DisruptionsWatchersManager
ChaosPodService services.ChaosPodService
CloudService cloudservice.CloudServicesProvidersManager
DisruptionsDeletionTimeout time.Duration
}

Expand Down Expand Up @@ -484,6 +486,14 @@ func (r *DisruptionReconciler) startInjection(ctx context.Context, instance *cha
r.log.Infow("starting targets injection", "targets", instance.Status.TargetInjections)
}

// on cloud disruption, update hosts
subspec := instance.Spec.DisruptionKindPicker(chaostypes.DisruptionKindNetworkDisruption)
if reflect.ValueOf(subspec).IsValid() && !reflect.ValueOf(subspec).IsNil() {
if err = instance.Spec.Network.UpdateHostsOnCloudDisruption(r.CloudService); err != nil {
return err
}
}

// iterate through target + existing disruption kind -- to ensure all chaos pods exist
for targetName, injections := range instance.Status.TargetInjections {
for _, disKind := range chaostypes.DisruptionKindNames {
Expand Down
101 changes: 85 additions & 16 deletions controllers/disruption_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -512,10 +512,47 @@ var _ = Describe("Disruption Controller", func() {
})

Context("Cloud disruption is a host disruption disguised", func() {
VerifyHosts := func(ctx SpecContext) (error, int) {
// get chaos pods
l, err := listChaosPods(ctx, disruption)
if err != nil {
return err, 0
}

hosts := make([]int, len(l.Items))

// sum up injectors
for i, p := range l.Items {
hosts[i] = 0
args := p.Spec.Containers[0].Args
for _, arg := range args {
if arg == "--hosts" {
hosts[i]++
}
}
}

for i, hostsForItem := range hosts {
if hostsForItem == 0 {
return fmt.Errorf("should have multiple hosts parameters."), 0
}

// verify that all chaos pods have the same list of hosts
if i > 0 {
if hosts[i] != hosts[i-1] {
return fmt.Errorf("should have the same list of hosts for all chaos pods"), hosts[i]
}
}
}

return nil, hosts[0]
}

BeforeEach(func() {
skipSecondPod = false
disruption.Spec = chaosv1beta1.DisruptionSpec{
DryRun: false,
Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1},
Count: &intstr.IntOrString{Type: intstr.String, StrVal: "100%"},
Unsafemode: &chaosv1beta1.UnsafemodeSpec{
DisableAll: true,
},
Expand All @@ -538,33 +575,65 @@ var _ = Describe("Disruption Controller", func() {
})

It("should create a cloud disruption but apply a host disruption with the list of cloud managed service ip ranges", func(ctx SpecContext) {
totalNbOfHosts := 0
By("Ensuring that the chaos pod have been created")
ExpectChaosPods(ctx, disruption, 1)
ExpectChaosPods(ctx, disruption, 2)

By("Ensuring that the chaos pod has the list of AWS hosts")
By("Ensuring that the chaos pods have the list of AWS hosts")
Eventually(func(ctx SpecContext) error {
hosts := 0
err, nbOfHosts := VerifyHosts(ctx)
if err != nil {
return err
}

if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
}

// get chaos pods
l, err := listChaosPods(ctx, disruption)
totalNbOfHosts = nbOfHosts

return nil
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())

By("Ensuring adding another chaos pod will result in the same number of hosts")
By("creating extra target one")
extraOneCreated := CreateRunningPod(ctx, *targetPod.DeepCopy())

By("waiting extra targets to be created and running")
extraTargetPod := <-extraOneCreated
ExpectChaosPods(ctx, disruption, 3)

By("Ensuring that the chaos pods have the same nb of AWS hosts")
Eventually(func(ctx SpecContext) error {
err, nbOfHosts := VerifyHosts(ctx)
if err != nil {
return err
}

// sum up injectors
for _, p := range l.Items {
args := p.Spec.Containers[0].Args
for _, arg := range args {
if arg == "--hosts" {
hosts++
}
}
if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
}

if hosts == 0 {
return fmt.Errorf("should have multiple hosts parameters.")
totalNbOfHosts = nbOfHosts

return nil
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())

By("Deleting extra target and ensuring the number of hosts is the same")
DeleteRunningPod(ctx, extraTargetPod)
ExpectChaosPods(ctx, disruption, 2)
Eventually(func(ctx SpecContext) error {
err, nbOfHosts := VerifyHosts(ctx)
if err != nil {
return err
}

if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
}

totalNbOfHosts = nbOfHosts

return nil
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())
})
Expand Down
6 changes: 3 additions & 3 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,8 @@ func main() {
DNSDisruptionKubeDNS: cfg.Injector.DNSDisruption.KubeDNS,
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
},
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
MetricsSink: metricsSink,
CloudServicesProvidersManager: cloudProviderManager,
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
MetricsSink: metricsSink,
})

if err != nil {
Expand All @@ -192,6 +191,7 @@ func main() {
ExpiredDisruptionGCDelay: gcPtr,
CacheContextStore: make(map[string]controllers.CtxTuple),
ChaosPodService: chaosPodService,
CloudService: cloudProviderManager,
DisruptionsDeletionTimeout: cfg.Controller.DisruptionDeletionTimeout,
}

Expand Down
35 changes: 10 additions & 25 deletions services/chaospod.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ import (

chaosapi "github.com/DataDog/chaos-controller/api"
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
"github.com/DataDog/chaos-controller/cloudservice"
"github.com/DataDog/chaos-controller/env"
"github.com/DataDog/chaos-controller/o11y/metrics"
"github.com/DataDog/chaos-controller/targetselector"
Expand Down Expand Up @@ -75,14 +74,13 @@ type ChaosPodServiceInjectorConfig struct {

// ChaosPodServiceConfig contains configuration options for the chaosPodService.
type ChaosPodServiceConfig struct {
Client client.Client // Kubernetes client for interacting with the API server.
Log *zap.SugaredLogger // Logger for logging.
ChaosNamespace string // Namespace where chaos-related resources are located.
TargetSelector targetselector.TargetSelector // Target selector for selecting target pods.
Injector ChaosPodServiceInjectorConfig // Configuration options for the injector.
ImagePullSecrets string // Image pull secrets for the chaosPodService.
MetricsSink metrics.Sink // Sink for exporting metrics.
CloudServicesProvidersManager cloudservice.CloudServicesProvidersManager // Manager for cloud service providers.
Client client.Client // Kubernetes client for interacting with the API server.
Log *zap.SugaredLogger // Logger for logging.
ChaosNamespace string // Namespace where chaos-related resources are located.
TargetSelector targetselector.TargetSelector // Target selector for selecting target pods.
Injector ChaosPodServiceInjectorConfig // Configuration options for the injector.
ImagePullSecrets string // Image pull secrets for the chaosPodService.
MetricsSink metrics.Sink // Sink for exporting metrics.
}

type chaosPodService struct {
Expand Down Expand Up @@ -243,24 +241,11 @@ func (m *chaosPodService) GenerateChaosPodsOfDisruption(instance *chaosv1beta1.D
}

notInjectedBefore := instance.TimeToInject()

allowedHosts := m.config.Injector.NetworkDisruptionAllowedHosts

// get the ip ranges of cloud provider services
if instance.Spec.Network != nil {
if instance.Spec.Network.Cloud != nil {
hosts, err := chaosv1beta1.TransformCloudSpecToHostsSpec(m.config.CloudServicesProvidersManager, instance.Spec.Network.Cloud)
if err != nil {
return nil, err
}

instance.Spec.Network.Hosts = append(instance.Spec.Network.Hosts, hosts...)
}

// remove default allowed hosts if disabled
if instance.Spec.Network.DisableDefaultAllowedHosts {
allowedHosts = make([]string, 0)
}
// remove default allowed hosts if disabled
if instance.Spec.Network != nil && instance.Spec.Network.DisableDefaultAllowedHosts {
allowedHosts = make([]string, 0)
}

xargs := chaosapi.DisruptionArgs{
Expand Down
Loading

0 comments on commit b81a033

Please sign in to comment.