Skip to content

Commit

Permalink
Ensure we do the SecurityGroup swap for instance cleanup on KMS failu…
Browse files Browse the repository at this point in the history
…res (#292)
  • Loading branch information
joshbranham authored Jan 9, 2025
1 parent 8a6139a commit 8aed1d0
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 52 deletions.
71 changes: 59 additions & 12 deletions pkg/verifier/aws/aws_verifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ const (
// This corresponds with the quay tag: v0.1.90-f2e86a9
networkValidatorImage = "quay.io/app-sre/osd-network-verifier@sha256:137bf177c2e87732b2692c1af39d3b79b2f84c7f0ee9254df4ea4412dddfab1e"
networkValidatorRepo = "quay.io/app-sre/osd-network-verifier"
userdataEndVerifier = "USERDATA END"
prepulledImageMessage = "Warning: could not pull the specified docker image, will try to use the prepulled one"
invalidKMSCode = "Client.InvalidKMSKey.InvalidState"
)

Expand All @@ -79,7 +77,7 @@ type AwsVerifier struct {
AwsClient *aws.Client
Logger ocmlog.Logger
Output output.Output
// This cache is only to be used inside of describeInstanceType() to minimize nil ptr error risk
// This cache is only to be used inside describeInstanceType() to minimize nil ptr error risk
cachedInstanceTypeInfo *ec2Types.InstanceTypeInfo
}

Expand Down Expand Up @@ -158,7 +156,7 @@ func (a *AwsVerifier) describeInstanceType(ctx context.Context, instanceType str
return a.cachedInstanceTypeInfo, nil
}

// instanceTypeUsesNitro asks the AWS API whether or not the provided instanceType uses the "Nitro"
// instanceTypeUsesNitro asks the AWS API whether the provided instanceType uses the "Nitro"
// hypervisor. Nitro is the only hypervisor supporting serial console output, which we need to
// collect in order to gather probe results
func (a *AwsVerifier) instanceTypeUsesNitro(ctx context.Context, instanceType string) (bool, error) {
Expand All @@ -169,7 +167,7 @@ func (a *AwsVerifier) instanceTypeUsesNitro(ctx context.Context, instanceType st
}

// Return true if instance type uses nitro
return (instanceTypeInfo.Hypervisor == ec2Types.InstanceTypeHypervisorNitro), nil
return instanceTypeInfo.Hypervisor == ec2Types.InstanceTypeHypervisorNitro, nil
}

// instanceTypeArchitecture asks the AWS API about the CPU architecture(s) supported by the provided
Expand Down Expand Up @@ -256,6 +254,7 @@ type createEC2InstanceInput struct {
tags map[string]string
ctx context.Context
keyPair string
vpcID string
}

func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, error) {
Expand Down Expand Up @@ -339,16 +338,16 @@ func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, e

// Wait up to 5 minutes for the instance to be running
waiter := ec2.NewInstanceRunningWaiter(a.AwsClient)
if err := waiter.Wait(input.ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, 2*time.Minute); err != nil {
resp, err := a.AwsClient.DescribeInstances(context.TODO(), &ec2.DescribeInstancesInput{
if err := waiter.Wait(input.ctx, &ec2.DescribeInstancesInput{InstanceIds: []string{instanceID}}, 5*time.Minute); err != nil {
resp, err := a.AwsClient.DescribeInstances(input.ctx, &ec2.DescribeInstancesInput{
InstanceIds: []string{instanceID},
})
if err != nil {
fmt.Println("Warning: Waiter Describe instances failure.")
}

var stateCode string
if resp.Reservations[0].Instances[0].StateReason.Code != nil {
if resp != nil && resp.Reservations[0].Instances[0].StateReason.Code != nil {
stateCode = *resp.Reservations[0].Instances[0].StateReason.Code
}

Expand All @@ -357,6 +356,17 @@ func (a *AwsVerifier) createEC2Instance(input createEC2InstanceInput) (string, e
waiterErr = handledErrors.NewKmsError("encountered issue accessing KMS key when launching instance.")
}

// Switch the instance SecurityGroup to the default before terminating to avoid a cleanup race condition. This is
// handled by the normal cleanup process, except in this specific case where we fail early because of KMS issues.
defaultSecurityGroupID := a.fetchVpcDefaultSecurityGroup(input.ctx, input.vpcID)
if defaultSecurityGroupID != "" {
// Replace the SecurityGroup attached to the instance with the default one for the VPC to allow for graceful
// termination of the network-verifier created temporary SecurityGroup. If we hit an error, we ignore it
// and continue with normal termination of the instance.
_ = a.modifyInstanceSecurityGroup(input.ctx, instanceID, defaultSecurityGroupID)
a.Logger.Info(input.ctx, "Modified the instance to use the default security group")
}

if err := a.AwsClient.TerminateEC2Instance(input.ctx, instanceID); err != nil {
return instanceID, handledErrors.NewGenericError(err)
}
Expand Down Expand Up @@ -491,16 +501,16 @@ func (a *AwsVerifier) CreateSecurityGroup(ctx context.Context, tags map[string]s

a.Logger.Info(ctx, "Created security group with ID: %s", *output.GroupId)

input_rules := &ec2.AuthorizeSecurityGroupEgressInput{
inputRules := &ec2.AuthorizeSecurityGroupEgressInput{
GroupId: output.GroupId,
IpPermissions: defaultIpPermissions,
}

if _, err := a.AwsClient.AuthorizeSecurityGroupEgress(ctx, input_rules); err != nil {
if _, err := a.AwsClient.AuthorizeSecurityGroupEgress(ctx, inputRules); err != nil {
return &ec2.CreateSecurityGroupOutput{}, err
}

revoke_default_egress := &ec2.RevokeSecurityGroupEgressInput{
revokeDefaultEgress := &ec2.RevokeSecurityGroupEgressInput{
GroupId: output.GroupId,
IpPermissions: []ec2Types.IpPermission{
{
Expand All @@ -516,7 +526,7 @@ func (a *AwsVerifier) CreateSecurityGroup(ctx context.Context, tags map[string]s
},
}

if _, err := a.AwsClient.RevokeSecurityGroupEgress(ctx, revoke_default_egress); err != nil {
if _, err := a.AwsClient.RevokeSecurityGroupEgress(ctx, revokeDefaultEgress); err != nil {
return &ec2.CreateSecurityGroupOutput{}, err
}

Expand Down Expand Up @@ -694,3 +704,40 @@ func (a *AwsVerifier) GetVpcIdFromSubnetId(ctx context.Context, vpcSubnetID stri
}
return vpcId, nil
}

// fetchVpcDefaultSecurityGroup looks up the security group named "default" in the VPC identified by
// vpcId and returns its ID.
//
// This is a best-effort helper: it returns an empty string when the DescribeSecurityGroups call fails,
// when no output is returned, or when no group named "default" with a usable ID is present in the
// result. Callers are expected to treat "" as "no default security group available".
func (a *AwsVerifier) fetchVpcDefaultSecurityGroup(ctx context.Context, vpcId string) string {
	describeSGOutput, err := a.AwsClient.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
		Filters: []ec2Types.Filter{
			{
				Name:   awsTools.String("vpc-id"),
				Values: []string{vpcId},
			},
			{
				Name:   awsTools.String("group-name"),
				Values: []string{"default"},
			},
		},
	})
	// Errors are deliberately swallowed here; the empty return value signals the failure to the caller.
	if err != nil || describeSGOutput == nil {
		return ""
	}

	for _, sg := range describeSGOutput.SecurityGroups {
		// GroupName and GroupId are pointers; skip entries missing either one rather than
		// panicking on a nil dereference during cleanup.
		if sg.GroupName == nil || sg.GroupId == nil {
			continue
		}
		if *sg.GroupName == "default" {
			return *sg.GroupId
		}
	}

	return ""
}

// modifyInstanceSecurityGroup issues a ModifyInstanceAttribute request for the instance identified by
// instanceID, setting its Groups to the single security group securityGroupID. Any error reported by
// the AWS client is returned unchanged; nil is returned on success.
func (a *AwsVerifier) modifyInstanceSecurityGroup(ctx context.Context, instanceID string, securityGroupID string) error {
	request := &ec2.ModifyInstanceAttributeInput{
		InstanceId: &instanceID,
		Groups:     []string{securityGroupID},
	}

	// The response payload carries nothing the callers need, so only the error is inspected.
	if _, err := a.AwsClient.ModifyInstanceAttribute(ctx, request); err != nil {
		return err
	}
	return nil
}
47 changes: 7 additions & 40 deletions pkg/verifier/aws/entry_point.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O

// If security group not given, create a temporary one
if len(vei.AWS.SecurityGroupIDs) == 0 || vei.ForceTempSecurityGroup {

createSecurityGroupOutput, err := a.CreateSecurityGroup(vei.Ctx, vei.Tags, "osd-network-verifier", vpcId)
if err != nil {
return a.Output.AddError(err)
Expand Down Expand Up @@ -208,7 +207,6 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
return a.Output.AddError(err)
}
}

}

// Create EC2 instance
Expand All @@ -224,6 +222,7 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O
securityGroupIDs: vei.AWS.SecurityGroupIDs,
tempSecurityGroupID: vei.AWS.TempSecurityGroup,
keyPair: vei.ImportKeyPair,
vpcID: vpcId,
})
if err != nil {
return a.Output.AddError(err)
Expand All @@ -246,48 +245,16 @@ func (a *AwsVerifier) ValidateEgress(vei verifier.ValidateEgressInput) *output.O

// Terminate the EC2 instance (unless user requests otherwise)
if !vei.SkipInstanceTermination {
// Replaced the SGs attached to the network-verifier-instance by the default SG in order to allow
// deletion of temporary SGs created

// Getting a list of the SGs for the current VPC of our instance
var defaultSecurityGroupID = ""
describeSGOutput, err := a.AwsClient.DescribeSecurityGroups(vei.Ctx, &ec2.DescribeSecurityGroupsInput{
Filters: []ec2Types.Filter{
{
Name: awsTools.String("vpc-id"),
Values: []string{vpcId},
},
{
Name: awsTools.String("group-name"),
Values: []string{"default"},
},
},
})
if err != nil {
a.Output.AddError(err)
a.Logger.Info(vei.Ctx, "Unable to describe security groups. Falling back to slower cloud resource cleanup method.")

}

if describeSGOutput != nil {

//Fetch default Security Group ID.
for _, SG := range describeSGOutput.SecurityGroups {
if *SG.GroupName == "default" {
defaultSecurityGroupID = *SG.GroupId
}
}

//Replacing the SGs attach to instance by the default one. This is to clean the SGs created in case the instance
//termination times out
_, err = a.AwsClient.ModifyInstanceAttribute(vei.Ctx, &ec2.ModifyInstanceAttributeInput{
InstanceId: &instanceID,
Groups: []string{defaultSecurityGroupID},
})
// Replace the SecurityGroup attached to the instance with the default one for the VPC to allow for graceful
// termination of the network-verifier created temporary SecurityGroup
defaultSecurityGroupID := a.fetchVpcDefaultSecurityGroup(vei.Ctx, vpcId)
if defaultSecurityGroupID != "" {
err = a.modifyInstanceSecurityGroup(vei.Ctx, instanceID, defaultSecurityGroupID)
if err != nil {
a.Logger.Info(vei.Ctx, "Unable to detach instance from security group. Falling back to slower cloud resource cleanup method.")
a.writeDebugLogs(vei.Ctx, fmt.Sprintf("Fell back to slower cloud resource cleanup because faster method (network interface detatchment) blocked by AWS: %s.", err))
}
a.Logger.Info(vei.Ctx, "Modified the instance to use the default security group")
}

a.Logger.Info(vei.Ctx, "Deleting instance with ID: %s", instanceID)
Expand Down

0 comments on commit 8aed1d0

Please sign in to comment.