Skip to content

Commit

Permalink
fix: use ray addon when creating GKE clusters (GoogleCloudPlatform#781)
Browse files Browse the repository at this point in the history
* fix: use ray addon when creating GKE clusters

Change-Id: Iac49ae3e2d57701754a50cbfc10b9bba70829f41

* delete kuberay-logging and kuberay-operator module

Change-Id: I2b8eda653bcf4b1d1cd402111e7bd79d18eaf1ac

---------

Co-authored-by: Gen Lu <[email protected]>
  • Loading branch information
genlu2011 and genlu2011 authored Aug 31, 2024
1 parent 203ddbc commit 1d8ccc3
Show file tree
Hide file tree
Showing 29 changed files with 139 additions and 606 deletions.
56 changes: 20 additions & 36 deletions applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,24 @@ module "infra" {
source = "../../infrastructure"
count = var.create_cluster ? 1 : 0

project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
kubernetes_version = var.kubernetes_version
project_id = var.project_id
cluster_name = local.cluster_name
cluster_location = var.cluster_location
region = local.cluster_location_region
autopilot_cluster = var.autopilot_cluster
private_cluster = var.private_cluster
create_network = var.create_network
network_name = local.network_name
subnetwork_name = local.network_name
subnetwork_cidr = var.subnetwork_cidr
subnetwork_region = local.cluster_location_region
cpu_pools = var.cpu_pools
enable_gpu = true
gpu_pools = var.gpu_pools
ray_addon_enabled = true
# TODO(genlu): remove the release_channel and kubernetes_version overrides once the ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

Expand Down Expand Up @@ -152,16 +155,6 @@ module "namespace" {
namespace = local.kubernetes_namespace
}

module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.rag, kubernetes = kubernetes.rag }
name = "kuberay-operator"
project_id = var.project_id
create_namespace = true
namespace = local.kubernetes_namespace
autopilot_cluster = local.enable_autopilot
}

module "gcs" {
source = "../../modules/gcs"
count = var.create_gcs_bucket ? 1 : 0
Expand Down Expand Up @@ -216,13 +209,6 @@ module "jupyterhub" {
depends_on = [module.namespace, module.gcs]
}

module "kuberay-logging" {
source = "../../modules/kuberay-logging"
providers = { kubernetes = kubernetes.rag }
namespace = local.kubernetes_namespace
depends_on = [module.namespace]
}

module "kuberay-workload-identity" {
providers = { kubernetes = kubernetes.rag }
source = "terraform-google-modules/kubernetes-engine/google//modules/workload-identity"
Expand All @@ -245,8 +231,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.ray_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.namespace, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.namespace, module.kuberay-workload-identity]
}

module "kuberay-cluster" {
Expand Down Expand Up @@ -281,8 +266,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}

module "inference-server" {
Expand Down
2 changes: 1 addition & 1 deletion applications/rag/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ variable "cluster_location" {

variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "kubernetes_namespace" {
Expand Down
30 changes: 7 additions & 23 deletions applications/ray/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ module "infra" {
cpu_pools = var.cpu_pools
enable_gpu = var.enable_gpu
gpu_pools = var.gpu_pools
depends_on = [module.project-services]
ray_addon_enabled = true
# TODO(genlu): remove the release_channel and kubernetes_version overrides once the ray addon is available in the REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"
depends_on = [module.project-services]
}

data "google_container_cluster" "default" {
Expand Down Expand Up @@ -147,24 +151,6 @@ module "kuberay-workload-identity" {
depends_on = [module.namespace]
}

module "kuberay-operator" {
source = "../../modules/kuberay-operator"
providers = { helm = helm.ray, kubernetes = kubernetes.ray }
name = "kuberay-operator"
create_namespace = true
namespace = local.kubernetes_namespace
project_id = var.project_id
autopilot_cluster = local.enable_autopilot
}

module "kuberay-logging" {
source = "../../modules/kuberay-logging"
providers = { kubernetes = kubernetes.ray }
namespace = local.kubernetes_namespace

depends_on = [module.namespace]
}

module "kuberay-monitoring" {
count = var.create_ray_cluster ? 1 : 0
source = "../../modules/kuberay-monitoring"
Expand All @@ -175,8 +161,7 @@ module "kuberay-monitoring" {
create_namespace = true
enable_grafana_on_ray_dashboard = var.enable_grafana_on_ray_dashboard
k8s_service_account = local.workload_identity_service_account
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.kuberay-workload-identity, module.kuberay-operator]
depends_on = [module.kuberay-workload-identity]
}

module "gcs" {
Expand Down Expand Up @@ -216,8 +201,7 @@ module "kuberay-cluster" {
k8s_backend_service_port = var.ray_dashboard_k8s_backend_service_port
domain = var.ray_dashboard_domain
members_allowlist = var.ray_dashboard_members_allowlist != "" ? split(",", var.ray_dashboard_members_allowlist) : []
//TODO(genlu): remove the module.kuberay-operator after migrated using ray addon.
depends_on = [module.gcs, module.kuberay-operator, module.kuberay-workload-identity]
depends_on = [module.gcs, module.kuberay-workload-identity]
}


Expand Down
8 changes: 5 additions & 3 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,12 @@ steps:
-auto-approve -no-color
echo "pass" > /workspace/user_result.txt

# Make sure pods are running
chmod +x /workspace/scripts/ci/wait_for_pods.sh
/workspace/scripts/ci/wait_for_pods.sh ml-$SHORT_SHA-$_BUILD_ID-ray 3000

kubectl wait --all pods -n ml-$SHORT_SHA-$_BUILD_ID-ray --for=condition=Ready --timeout=1200s
# Wait for pods to be stable
sleep 5s
# The Ray head's readinessProbe does not currently probe the head service, so the wait for Ready above is not a reliable signal; sleep to give the head service extra time.
sleep 60s
kubectl port-forward -n ml-$SHORT_SHA-$_BUILD_ID-ray service/ray-cluster-kuberay-head-svc 8265:8265 &
# Wait for port-forwarding to be established
sleep 10s
Expand Down
8 changes: 6 additions & 2 deletions infrastructure/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ module "public-gke-standard-cluster" {
all_node_pools_labels = var.all_node_pools_labels
all_node_pools_metadata = var.all_node_pools_metadata
all_node_pools_tags = var.all_node_pools_tags
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]
}

Expand All @@ -141,8 +142,8 @@ module "public-gke-autopilot-cluster" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection
ray_addon_enabled = var.ray_addon_enabled
depends_on = [module.custom-network]

}

## create private GKE standard
Expand Down Expand Up @@ -170,6 +171,7 @@ module "private-gke-standard-cluster" {
deletion_protection = var.deletion_protection
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
ray_addon_enabled = var.ray_addon_enabled

## pools config variables
cpu_pools = var.cpu_pools
Expand Down Expand Up @@ -207,7 +209,9 @@ module "private-gke-autopilot-cluster" {
master_authorized_networks = length(var.master_authorized_networks) == 0 ? [{ cidr_block = "${local.subnetwork_cidr}", display_name = "${local.subnetwork_name}" }] : var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection
depends_on = [module.custom-network]
ray_addon_enabled = var.ray_addon_enabled

depends_on = [module.custom-network]
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ autopilot_cluster = false # false = standard cluster, true = autopilot cluster
cluster_name = "test-cluster"
cluster_location = "us-east4"
gcs_fuse_csi_driver = true
ray_addon_enabled = true
# TODO(genlu): remove release_channel and kubernetes_version after 1.30.3-gke.1969000 is in REGULAR channel
release_channel = "RAPID"
kubernetes_version = "1.30.3-gke.1969000"

cpu_pools = [{
name = "cpu-pool"
Expand Down
10 changes: 9 additions & 1 deletion infrastructure/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ variable "cluster_labels" {

variable "kubernetes_version" {
type = string
default = "1.28"
default = "1.30"
}

variable "release_channel" {
Expand Down Expand Up @@ -127,6 +127,13 @@ variable "deletion_protection" {
type = bool
default = false
}

# Toggle for the Ray addon on the GKE cluster created by this module.
# infrastructure/main.tf forwards this value as `ray_addon_enabled` to each of
# the public/private, standard/autopilot cluster modules.
variable "ray_addon_enabled" {
type = bool
description = "Set to true to enable ray addon"
# The addon is on unless the caller explicitly passes false.
default = true
}

variable "master_authorized_networks" {
type = list(object({
cidr_block = string
Expand Down Expand Up @@ -173,6 +180,7 @@ variable "enable_tpu" {
description = "Set to true to create TPU node pool"
default = false
}

variable "enable_gpu" {
type = bool
description = "Set to true to create GPU node pool"
Expand Down
8 changes: 6 additions & 2 deletions modules/gke-autopilot-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -35,7 +35,11 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}

# GKE cluster fleet registration
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-autopilot-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,9 @@ variable "master_ipv4_cidr_block" {
type = string
default = ""
}

# Toggle for the Ray operator addon on the private Autopilot cluster.
# main.tf feeds this single flag into all three fields of `ray_operator_config`
# (enabled, logging_enabled, monitoring_enabled), so the addon's logging and
# monitoring are switched on and off together with the addon itself.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  # The addon is on unless the caller explicitly passes false.
  default = true
}
8 changes: 7 additions & 1 deletion modules/gke-autopilot-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/beta-autopilot-public-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -29,4 +29,10 @@ module "gke" {
ip_range_services = var.ip_range_services
master_authorized_networks = var.master_authorized_networks
deletion_protection = var.deletion_protection

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}
}
6 changes: 6 additions & 0 deletions modules/gke-autopilot-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,9 @@ variable "deletion_protection" {
type = bool
default = false
}

# Toggle for the Ray operator addon on the public Autopilot cluster.
# main.tf feeds this single flag into all three fields of `ray_operator_config`
# (enabled, logging_enabled, monitoring_enabled), so the addon's logging and
# monitoring are switched on and off together with the addon itself.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  # The addon is on unless the caller explicitly passes false.
  default = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-private-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -45,6 +45,12 @@ module "gke" {
master_authorized_networks = var.master_authorized_networks
master_ipv4_cidr_block = var.master_ipv4_cidr_block

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-private-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Toggle for the Ray operator addon on the private Standard cluster.
# main.tf feeds this single flag into all three fields of `ray_operator_config`
# (enabled, logging_enabled, monitoring_enabled), so the addon's logging and
# monitoring are switched on and off together with the addon itself.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  # The addon is on unless the caller explicitly passes false.
  default = true
}
8 changes: 7 additions & 1 deletion modules/gke-standard-public-cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ locals {

module "gke" {
source = "terraform-google-modules/kubernetes-engine/google"
version = "29.0.0"
version = "32.0.1"
project_id = var.project_id
regional = var.cluster_regional
name = var.cluster_name
Expand All @@ -40,6 +40,12 @@ module "gke" {
monitoring_enable_managed_prometheus = var.monitoring_enable_managed_prometheus
master_authorized_networks = var.master_authorized_networks

ray_operator_config = {
enabled = var.ray_addon_enabled
logging_enabled = var.ray_addon_enabled
monitoring_enabled = var.ray_addon_enabled
}

node_pools = local.node_pools

node_pools_oauth_scopes = {
Expand Down
6 changes: 6 additions & 0 deletions modules/gke-standard-public-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,9 @@ variable "datapath_provider" {
type = string
default = "ADVANCED_DATAPATH"
}

# Toggle for the Ray operator addon on the public Standard cluster.
# main.tf feeds this single flag into all three fields of `ray_operator_config`
# (enabled, logging_enabled, monitoring_enabled), so the addon's logging and
# monitoring are switched on and off together with the addon itself.
variable "ray_addon_enabled" {
  description = "Set to true to enable the Ray operator addon, including its logging and monitoring"
  type        = bool
  # The addon is on unless the caller explicitly passes false.
  default = true
}
Loading

0 comments on commit 1d8ccc3

Please sign in to comment.