Skip to content

Commit

Permalink
Add GPU resource requests/limits
Browse files Browse the repository at this point in the history
WIP
  • Loading branch information
sl1pm4t committed Sep 5, 2018
1 parent 31d8552 commit a9c291e
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 4 deletions.
45 changes: 45 additions & 0 deletions kubernetes/resource_kubernetes_pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,29 @@ func TestAccKubernetesPod_gke_with_nodeSelector(t *testing.T) {
})
}

// TestAccKubernetesPod_gpuResource is an acceptance test that applies a pod
// whose single container declares an nvidia_gpu limit of 1, then verifies that
// the pod exists and that the image and GPU limit are recorded in state.
//
// The PreCheck skips the test unless Google Cloud settings are present.
func TestAccKubernetesPod_gpuResource(t *testing.T) {
	var conf api.Pod

	podName := fmt.Sprintf("tf-acc-test-%s", acctest.RandStringFromCharSet(10, acctest.CharSetAlphaNum))
	imageName := "nginx:1.7.9"

	resource.Test(t, resource.TestCase{
		PreCheck:     func() { testAccPreCheck(t); skipIfNoGoogleCloudSettingsFound(t) },
		Providers:    testAccProviders,
		CheckDestroy: testAccCheckKubernetesPodDestroy,
		Steps: []resource.TestStep{
			{
				Config: testAccKubernetesPodConfigGPUResource(podName, imageName),
				Check: resource.ComposeAggregateTestCheckFunc(
					testAccCheckKubernetesPodExists("kubernetes_pod.test", &conf),
					resource.TestCheckResourceAttr("kubernetes_pod.test", "spec.0.container.0.image", imageName),
					// nvidia_gpu is a scalar schema.TypeInt attribute, so its value is
					// stored at the attribute path itself. The ".#" suffix is only
					// written for list/set element counts and would never match here.
					resource.TestCheckResourceAttr("kubernetes_pod.test", "spec.0.container.0.resources.0.limits.0.nvidia_gpu", "1"),
				),
			},
		},
	})
}

func testAccCheckKubernetesPodDestroy(s *terraform.State) error {
conn := testAccProvider.Meta().(*kubernetesProvider).conn

Expand Down Expand Up @@ -1027,3 +1050,25 @@ resource "kubernetes_pod" "test" {
}
`, podName, imageName, args)
}

// testAccKubernetesPodConfigGPUResource renders the Terraform configuration for
// a kubernetes_pod named podName with one container running imageName and a
// resource limit of nvidia_gpu = 1.
func testAccKubernetesPodConfigGPUResource(podName, imageName string) string {
	// The two %s verbs are filled, in order, with the pod name and the image.
	const podTemplate = `
resource "kubernetes_pod" "test" {
metadata {
name = "%s"
}
spec {
container {
image = "%s"
name = "containername"
resources {
limits {
nvidia_gpu = 1
}
}
}
}
}
`
	return fmt.Sprintf(podTemplate, podName, imageName)
}
10 changes: 10 additions & 0 deletions kubernetes/schema_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ func resourcesField() map[string]*schema.Schema {
ValidateFunc: validateResourceQuantity,
DiffSuppressFunc: suppressEquivalentResourceQuantity,
},
"nvidia_gpu": {
Type: schema.TypeInt,
Optional: true,
Computed: true,
},
},
},
},
Expand All @@ -137,6 +142,11 @@ func resourcesField() map[string]*schema.Schema {
ValidateFunc: validateResourceQuantity,
DiffSuppressFunc: suppressEquivalentResourceQuantity,
},
"nvidia_gpu": {
Type: schema.TypeInt,
Optional: true,
Computed: true,
},
},
},
},
Expand Down
10 changes: 9 additions & 1 deletion kubernetes/structures.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,11 @@ func base64EncodeStringMap(m map[string]interface{}) map[string]interface{} {
// flattenResourceList converts a resource list into a map from resource name
// to the String() form of its quantity.
//
// The resource name "nvidia.com/gpu" is emitted under the key "nvidia_gpu";
// every other resource name is used unchanged. Each resource is written under
// exactly one key.
func flattenResourceList(l api.ResourceList) map[string]string {
	m := make(map[string]string, len(l))
	for k, v := range l {
		resKey := string(k)
		// Inverse of the "nvidia_gpu" -> "nvidia.com/gpu" translation performed
		// in expandMapToResourceList.
		if resKey == "nvidia.com/gpu" {
			resKey = "nvidia_gpu"
		}
		m[resKey] = v.String()
	}
	return m
}
Expand All @@ -246,6 +250,10 @@ func expandMapToResourceList(m map[string]interface{}) (api.ResourceList, error)
return out, fmt.Errorf("Unexpected value type: %#v", origValue)
}

if key == "nvidia_gpu" {
key = "nvidia.com/gpu"
}

out[key] = value
}
return out, nil
Expand Down
3 changes: 3 additions & 0 deletions kubernetes/structures_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,9 @@ func expandContainerResourceRequirements(l []interface{}) (v1.ResourceRequiremen
if p["memory"] == "" {
delete(p, "memory")
}
// Drop an unset GPU entry under the key this map actually carries. The
// "nvidia.com/gpu" name is only substituted afterwards, inside
// expandMapToResourceList, so deleting that name here would be a no-op.
// NOTE(review): the schema declares nvidia_gpu as TypeInt, so an unset value
// may arrive as 0 rather than "" — confirm whether 0 should be dropped too.
if p["nvidia_gpu"] == "" {
	delete(p, "nvidia_gpu")
}
return expandMapToResourceList(p)
}
return nil, nil
Expand Down
126 changes: 123 additions & 3 deletions kubernetes/test-infra/gke/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,18 @@ variable gcp_subnetwork {
default = "default"
}

# Opt-in switch for the GPU test infrastructure. It drives the count of the
# cluster's guest_accelerator block and of the nvidia_driver DaemonSet.
variable "enable_gpu" {
  description = "Set to true to request an nvidia-tesla-k80 guest accelerator and deploy the nvidia-driver-installer DaemonSet."
  default     = false
}

# See https://cloud.google.com/container-engine/supported-versions
variable "kubernetes_version" {
description = <<EOF
The GKE Kubernetes version.
EXAMPLE:
EXAMPLES:
'1.8'
'1.9'
'1.10'
'1.9.6-gke.1'.
See https://cloud.google.com/container-engine/supported-versions
Expand All @@ -44,15 +50,15 @@ EOF
resource "google_container_cluster" "primary" {
name = "tf-acc-test-${random_id.cluster_name.hex}"
zone = "${data.google_compute_zones.available.names[0]}"
initial_node_count = 2
initial_node_count = 1
node_version = "${var.kubernetes_version}"
min_master_version = "${var.kubernetes_version}"

network = "${var.gcp_network}"
subnetwork = "${var.gcp_subnetwork}"

additional_zones = [
"${data.google_compute_zones.available.names[1]}",
"${data.google_compute_zones.available.names[2]}",
]

master_auth {
Expand All @@ -69,6 +75,120 @@ resource "google_container_cluster" "primary" {
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
]

guest_accelerator {
type = "nvidia-tesla-k80"
count = "${var.enable_gpu ? 1 : 0}"
}
}
}

# DaemonSet "nvidia-driver-installer" in the kube-system namespace.
# Presumably installs the NVIDIA driver on each GPU node — confirm against the
# installer image's documentation.
resource kubernetes_daemonset nvidia_driver {
# Created only when var.enable_gpu is true; otherwise zero instances.
count = "${var.enable_gpu ? 1 : 0 }"

metadata {
name = "nvidia-driver-installer"
namespace = "kube-system"

labels {
"k8s-app" = "nvidia-driver-installer"
}
}

spec {
# The selector uses the same name / k8s-app pair as the pod template labels below.
selector {
name = "nvidia-driver-installer"
"k8s-app" = "nvidia-driver-installer"
}

template {
metadata {
labels {
name = "nvidia-driver-installer"
"k8s-app" = "nvidia-driver-installer"
}
}

spec {
# Pods share the host's network and PID namespaces.
host_network = "true"
host_pid = "true"

# Host /dev, mounted into the init container at /dev.
volume {
name = "dev"

host_path {
path = "/dev"
}
}

# Host directory mounted into the init container at /usr/local/nvidia
# (the same pair is passed via the NVIDIA_INSTALL_DIR_* variables below).
volume {
name = "nvidia-install-dir-host"

host_path {
path = "/home/kubernetes/bin/nvidia"
}
}

# Host root filesystem, mounted into the init container at /root.
volume {
name = "root-mount"

host_path {
path = "/"
}
}

# Privileged init container. The image is never pulled
# (image_pull_policy = "Never"), so it must already be present on the node.
init_container {
image = "cos-nvidia-installer:fixed"
image_pull_policy = "Never"
name = "nvidia-driver-installer"

resources {
requests {
cpu = "0.15"
}
}

security_context {
privileged = "true"
}

# Host-side path of the install directory (matches the host_path above).
env {
name = "NVIDIA_INSTALL_DIR_HOST"
value = "/home/kubernetes/bin/nvidia"
}

# Container-side path of the same directory (matches its volume_mount below).
env {
name = "NVIDIA_INSTALL_DIR_CONTAINER"
value = "/usr/local/nvidia"
}

# Where the host root filesystem is mounted inside the container.
env {
name = "ROOT_MOUNT_DIR"
value = "/root"
}

volume_mount {
name = "nvidia-install-dir-host"
mount_path = "/usr/local/nvidia"
}

volume_mount {
name = "dev"
mount_path = "/dev"
}

volume_mount {
name = "root-mount"
mount_path = "/root"
}
}

# Main container is only a pause image; all configured work is in the
# init container above.
container {
image = "gcr.io/google-containers/pause:2.0"
name = "pause"
}
}
}
}
}

Expand Down

0 comments on commit a9c291e

Please sign in to comment.