Skip to content

Latest commit

 

History

History
558 lines (486 loc) · 23.5 KB

README.md

File metadata and controls

558 lines (486 loc) · 23.5 KB

Terraform module for connecting an AWS EKS cluster to CAST AI

Website: https://www.cast.ai

Requirements

Using the module

A module to connect an EKS cluster to CAST AI.

Requires castai/castai and hashicorp/aws providers to be configured.

// Connects an existing EKS cluster to CAST AI and, in the same module call,
// declares node configurations, node templates and autoscaler settings.
module "castai-eks-cluster" {
  source = "castai/eks-cluster/castai"

  aws_account_id     = var.aws_account_id
  aws_cluster_region = var.cluster_region
  aws_cluster_name   = var.cluster_id

  aws_assume_role_arn = module.castai-eks-role-iam.role_arn

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  // The reference uses this module's own label ("castai-eks-cluster") and its
  // castai_node_configurations output, keyed by the names used in node_configurations below.
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      security_groups = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
      // The init script is passed base64-encoded.
      init_script = base64encode(var.init_script)
      // Docker daemon and kubelet settings are passed as JSON-encoded strings.
      docker_config = jsonencode({
        "insecure-registries"      = ["registry.com:5000"],
        "max-concurrent-downloads" = 10
      })
      kubelet_config = jsonencode({
        "registryBurst" : 20,
        "registryPullQPS" : 10
      })
      container_runtime = "dockerd"
    }
  }

  node_templates = {
    spot_tmpl = {
      // Ties this template to the "default" node configuration declared above.
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]

      should_taint = true

      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
        custom-label-key-2 = "custom-label-value-2"
      }

      custom_taints = [
        {
          key   = "custom-taint-key-1"
          value = "custom-taint-value-1"
        },
        {
          key   = "custom-taint-key-2"
          value = "custom-taint-value-2"
        }
      ]

      constraints = {
        fallback_restore_rate_seconds = 1800
        spot                          = true
        use_spot_fallbacks            = true
        min_cpu                       = 4
        max_cpu                       = 100
        instance_families = {
          exclude = ["m5"]
        }
        compute_optimized_state = "disabled"
        storage_optimized_state = "disabled"
        is_gpu_only             = false
        architectures           = ["amd64"]
      }
    }
  }

  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    unschedulable_pods = {
      enabled = true

      headroom = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }

      headroom_spot = {
        enabled           = true
        cpu_percentage    = 10
        memory_percentage = 10
      }
    }

    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Migrating from 2.x.x to 3.x.x

Existing configuration:

// 2.x.x layout: node settings are passed directly as top-level module arguments.
module "castai-eks-cluster" {
  // ...
  
  subnets                   = module.vpc.private_subnets
  dns_cluster_ip            = "10.100.0.10"
  instance_profile_role_arn = var.instance_profile_arn
  ssh_public_key            = var.ssh_public_key
  // In the 3.x.x layout this list appears as `security_groups` inside a node configuration.
  override_security_groups  = [
    module.eks.node_security_group_id,
  ]
  tags = {
    "team" : "core"
  }
}

New configuration:

// 3.x.x layout: node settings move into a named entry of node_configurations,
// and the default one is selected through default_node_configuration.
module "castai-eks-cluster" {
  // ...

  // Default node configuration will be used for all CAST provisioned nodes unless specific configuration is requested.
  // The reference uses this module's own label ("castai-eks-cluster").
  default_node_configuration = module.castai-eks-cluster.castai_node_configurations["default"]

  node_configurations = {
    default = {
      subnets                   = module.vpc.private_subnets
      dns_cluster_ip            = "10.100.0.10"
      instance_profile_role_arn = var.instance_profile_arn
      ssh_public_key            = var.ssh_public_key
      // Takes the place of the former top-level override_security_groups argument.
      security_groups = [
        module.eks.node_security_group_id,
      ]
      tags = {
        "team" : "core"
      }
    }
  }
}

Migrating from 5.x.x to 6.x.x

Existing configuration:

// 5.x.x layout: spot behaviour is configured through the "spotInstances"
// section of the autoscaler_policies_json document.
module "castai-eks-cluster" {
  // ...

  node_templates = {
    // ...
  }
  // Raw JSON policy document; the "spotInstances" object below is the part
  // that the 6.x.x layout no longer includes.
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "spotInstances": {
            "enabled": true,
            "clouds": ["aws"],
            "spotBackups": {
                "enabled": true
            },
            "spotDiversityEnabled": false,
            "spotDiversityPriceIncreaseLimitPercent": 20,
            "spotInterruptionPredictions": {
              "enabled": true,
              "type": "AWSRebalanceRecommendations"
            }
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

New configuration:

// 6.x.x layout: spot settings are expressed as constraints of a default node
// template, and autoscaler_policies_json no longer has a "spotInstances" section.
module "castai-eks-cluster" {
  // ...

  node_templates = {
    default_by_castai = {
      name = "default-by-castai"
      configuration_id = module.castai-eks-cluster.castai_node_configurations["default"]
      // Marks this template as the default one.
      is_default   = true
      should_taint = false

      constraints = {
        on_demand          = true
        spot               = true
        // NOTE(review): appears to correspond to the former "spotBackups.enabled" flag — confirm.
        use_spot_fallbacks = true

        // Counterparts of "spotDiversityEnabled" and "spotDiversityPriceIncreaseLimitPercent".
        enable_spot_diversity                       = false
        spot_diversity_price_increase_limit_percent = 20

        // Counterparts of the "spotInterruptionPredictions" object; note the type
        // value is spelled differently here than in the JSON document.
        spot_interruption_predictions_enabled = true
        spot_interruption_predictions_type = "aws-rebalance-recommendations"
      }
    }
  }
  // Remaining policies stay in the JSON document, without "spotInstances".
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": true,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        }
    }
  EOT
}

Migrating from 6.x.x to 7.x.x

Version 7.x.x changes:

  • Removed custom_label attribute in castai_node_template resource. Use custom_labels instead.

Old configuration:

// 6.x.x layout: a single label is given as a custom_label object with key/value fields.
module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      // Removed in 7.x.x; use custom_labels instead.
      custom_label = {
        key = "custom-label-key-1"
        value = "custom-label-value-1"
      }
    }
  }
}

New configuration:

// 7.x.x layout: labels are given as a custom_labels map of label key to label value.
module "castai-eks-cluster" {
  // ...

  node_templates = {
    spot_tmpl = {
      // Each map entry is one label: <key> = "<value>".
      custom_labels = {
        custom-label-key-1 = "custom-label-value-1"
      }
    }
  }
}

Migrating from 7.x.x to 8.x.x

Version 8.x.x changed:

  • Removed compute_optimized and storage_optimized attributes in castai_node_template resource, constraints object. Use compute_optimized_state and storage_optimized_state instead.

Old configuration:

// 7.x.x layout: compute/storage optimisation constraints are booleans.
module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        // Both attributes are removed in 8.x.x in favour of the *_state variants.
        compute_optimized = false
        storage_optimized = true
      }
    }
  }
}

New configuration:

// 8.x.x layout: compute/storage optimisation constraints are string states.
module "castai-eks-cluster" {
  node_templates = {
    spot_tmpl = {
      constraints = {
        // In this example the former `false` becomes "disabled" and `true` becomes "enabled".
        compute_optimized_state = "disabled"
        storage_optimized_state = "enabled"
      }
    }
  }
}

Migrating from 9.x.x to 9.3.x

Version 9.3.x changed:

  • Deprecated autoscaler_policies_json attribute. Use autoscaler_settings instead.

Old configuration:

// Pre-9.3.x layout: autoscaler policies are supplied as a raw JSON document.
module "castai-eks-cluster" {
  // Deprecated in 9.3.x; use autoscaler_settings instead.
  autoscaler_policies_json = <<-EOT
    {
        "enabled": true,
        "unschedulablePods": {
            "enabled": true
        },
        "nodeDownscaler": {
            "enabled": true,
            "emptyNodes": {
                "enabled": true
            },
            "evictor": {
                "aggressiveMode": false,
                "cycleInterval": "5m10s",
                "dryRun": false,
                "enabled": true,
                "nodeGracePeriodMinutes": 10,
                "scopedMode": false
            }
        },
        "nodeTemplatesPartialMatchingEnabled": false,
        "clusterLimits": {
            "cpu": {
                "maxCores": 20,
                "minCores": 1
            },
            "enabled": true
        }
    }
  EOT
}

New configuration:

// 9.3.x layout: the same policies expressed as a structured autoscaler_settings
// object; the camelCase JSON keys become snake_case attributes.
module "castai-eks-cluster" {
  autoscaler_settings = {
    enabled                                 = true
    node_templates_partial_matching_enabled = false

    // Counterpart of the JSON "unschedulablePods" object.
    unschedulable_pods = {
      enabled = true
    }

    // Counterpart of the JSON "nodeDownscaler" object.
    node_downscaler = {
      enabled = true

      empty_nodes = {
        enabled = true
      }

      evictor = {
        aggressive_mode           = false
        cycle_interval            = "5m10s"
        dry_run                   = false
        enabled                   = true
        node_grace_period_minutes = 10
        scoped_mode               = false
      }
    }

    // Counterpart of the JSON "clusterLimits" object.
    cluster_limits = {
      enabled = true

      cpu = {
        max_cores = 20
        min_cores = 1
      }
    }
  }
}

Examples

Usage examples are located in the CAST AI Terraform provider repository.

Generate docs

terraform-docs markdown table . --output-file README.md

Requirements

Name Version
terraform >= 0.13
aws >= 2.49
castai ~> 7.14
helm >= 2.0.0

Providers

Name Version
castai ~> 7.14
helm >= 2.0.0
null n/a

Modules

No modules.

Resources

Name Type
castai_autoscaler.castai_autoscaler_policies resource
castai_eks_cluster.my_castai_cluster resource
castai_node_configuration.this resource
castai_node_configuration_default.this resource
castai_node_template.this resource
castai_workload_scaling_policy.this resource
helm_release.castai_agent resource
helm_release.castai_cluster_controller resource
helm_release.castai_cluster_controller_self_managed resource
helm_release.castai_egressd resource
helm_release.castai_egressd_self_managed resource
helm_release.castai_evictor resource
helm_release.castai_evictor_ext resource
helm_release.castai_evictor_self_managed resource
helm_release.castai_kvisor resource
helm_release.castai_kvisor_self_managed resource
helm_release.castai_pod_pinner resource
helm_release.castai_pod_pinner_self_managed resource
helm_release.castai_spot_handler resource
helm_release.castai_workload_autoscaler resource
helm_release.castai_workload_autoscaler_self_managed resource
null_resource.wait_for_cluster resource

Inputs

Name Description Type Default Required
agent_aws_access_key_id AWS access key for CAST AI agent to fetch instance details. string "" no
agent_aws_iam_service_account_role_arn Arn of the role to be used by CAST AI agent to fetch instance details. Only readonly AmazonEC2ReadOnlyAccess is needed. string "" no
agent_aws_secret_access_key AWS access key secret for CAST AI agent to fetch instance details. string "" no
agent_values List of YAML formatted string with agent values list(string) [] no
agent_version Version of castai-agent helm chart. Default latest string null no
api_grpc_addr CAST AI GRPC API address string "api-grpc.cast.ai:443" no
api_url URL of alternative CAST AI API to be used during development or testing string "https://api.cast.ai" no
autoscaler_policies_json Optional json object to override CAST AI cluster autoscaler policies. Deprecated, use autoscaler_settings instead. string null no
autoscaler_settings Optional Autoscaler policy definitions to override current autoscaler settings any null no
aws_account_id ID of AWS account the cluster is located in. string n/a yes
aws_assume_role_arn Arn of the role to be used by CAST AI for IAM access string null no
aws_cluster_name Name of the cluster to be connected to CAST AI. string n/a yes
aws_cluster_region Region of the cluster to be connected to CAST AI. string n/a yes
castai_api_token Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when wait_for_cluster_ready is set to true string "" no
castai_components_labels Optional additional Kubernetes labels for CAST AI pods map(any) {} no
cluster_controller_values List of YAML formatted string with cluster-controller values list(string) [] no
cluster_controller_version Version of castai-cluster-controller helm chart. Default latest string null no
default_node_configuration ID of the default node configuration string "" no
default_node_configuration_name Name of the default node configuration string "" no
delete_nodes_on_disconnect Optionally delete Cast AI created nodes when the cluster is destroyed bool false no
egressd_values List of YAML formatted string with egressd values list(string) [] no
egressd_version Version of castai-egressd helm chart. Default latest string null no
evictor_ext_values List of YAML formatted string with evictor-ext values list(string) [] no
evictor_ext_version Version of castai-evictor-ext chart. Default latest string null no
evictor_values List of YAML formatted string with evictor values list(string) [] no
evictor_version Version of castai-evictor chart. Default latest string null no
grpc_url gRPC endpoint used by pod-pinner string "grpc.cast.ai:443" no
install_egressd Optional flag for installation of Egressd (Network cost monitoring) (https://docs.cast.ai/docs/network-cost) bool false no
install_security_agent Optional flag for installation of security agent (https://docs.cast.ai/product-overview/console/security-insights/) bool false no
install_workload_autoscaler Optional flag for installation of workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) bool false no
kvisor_controller_extra_args Extra arguments for the kvisor controller. Optionally enable kvisor to lint Kubernetes YAML manifests, scan workload images and check if workloads pass CIS Kubernetes Benchmarks as well as NSA, OWASP and PCI recommendations. map(string)
{
"image-scan-enabled": "true",
"kube-bench-enabled": "true",
"kube-linter-enabled": "true"
}
no
kvisor_values List of YAML formatted string with kvisor values list(string) [] no
kvisor_version Version of kvisor chart. Default latest string null no
kvisor_wait Wait for kvisor chart to finish release bool true no
node_configurations Map of EKS node configurations to create any {} no
node_templates Map of node templates to create any {} no
pod_pinner_values List of YAML formatted string with pod-pinner values list(string) [] no
pod_pinner_version Version of pod-pinner helm chart. Default latest string null no
self_managed Whether CAST AI components' upgrades are managed by a customer; by default upgrades are managed by the CAST AI central system. bool false no
spot_handler_values List of YAML formatted string with spot-handler values list(string) [] no
spot_handler_version Version of castai-spot-handler helm chart. Default latest string null no
wait_for_cluster_ready Wait for cluster to be ready before finishing the module execution, this option requires castai_api_token to be set bool false no
workload_autoscaler_values List of YAML formatted string with cluster-workload-autoscaler values list(string) [] no
workload_autoscaler_version Version of castai-workload-autoscaler helm chart. Default latest string null no
workload_scaling_policies Map of workload scaling policies to create any {} no

Outputs

Name Description
castai_node_configurations Map of node configurations ids by name
castai_node_templates Map of node templates by name
cluster_id CAST AI cluster id, which can be used for accessing cluster data using API