From bca9b96ef2e309d056397fd689eb0ab98fcabd0a Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre <105610547+danielvegamyhre@users.noreply.github.com> Date: Wed, 29 May 2024 12:29:47 -0700 Subject: [PATCH] [TPU Provisioner] Create admission controller (#687) * initial commit of admission controller * use jsonpatch library --- .../admission_controller/.gitignore | 6 + .../admission_controller/Dockerfile | 6 + .../admission_controller/README.md | 46 +++++++ .../admission_controller/__init__.py | 0 .../admission_controller.py | 103 +++++++++++++++ .../certificates/README.md | 7 ++ .../manifests/manifest.yaml | 78 ++++++++++++ .../admission_controller/requirements.txt | Bin 0 -> 466 bytes .../admission_controller/test/__init__.py | 0 .../test/admission_controller_test.py | 118 ++++++++++++++++++ .../test-location-hint-no-reservation.yaml | 38 ++++++ .../test-location-hint-with-reservation.yaml | 40 ++++++ 12 files changed, 442 insertions(+) create mode 100644 tpu-provisioner/admission_controller/.gitignore create mode 100644 tpu-provisioner/admission_controller/Dockerfile create mode 100644 tpu-provisioner/admission_controller/README.md create mode 100644 tpu-provisioner/admission_controller/__init__.py create mode 100644 tpu-provisioner/admission_controller/admission_controller.py create mode 100644 tpu-provisioner/admission_controller/certificates/README.md create mode 100644 tpu-provisioner/admission_controller/manifests/manifest.yaml create mode 100644 tpu-provisioner/admission_controller/requirements.txt create mode 100644 tpu-provisioner/admission_controller/test/__init__.py create mode 100644 tpu-provisioner/admission_controller/test/admission_controller_test.py create mode 100644 tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-no-reservation.yaml create mode 100644 tpu-provisioner/admission_controller/test/manual_e2e/test-location-hint-with-reservation.yaml diff --git a/tpu-provisioner/admission_controller/.gitignore b/tpu-provisioner/admission_controller/.gitignore new file mode 100644 index 000000000..4d132197c --- /dev/null +++ b/tpu-provisioner/admission_controller/.gitignore @@ -0,0 +1,6 @@ +# don't add certificates +certificates/*.crt +certificates/*.key + +__pycache__/ +.pytest_cache/ \ No newline at end of file diff --git a/tpu-provisioner/admission_controller/Dockerfile b/tpu-provisioner/admission_controller/Dockerfile new file mode 100644 index 000000000..b555f58c3 --- /dev/null +++ b/tpu-provisioner/admission_controller/Dockerfile @@ -0,0 +1,6 @@ +FROM python:3.10-slim-buster +WORKDIR /webhook +COPY requirements.txt /webhook +COPY admission_controller.py /webhook +RUN pip install --no-cache-dir --upgrade -r /webhook/requirements.txt +CMD ["uvicorn", "admission_controller:app", "--host", "0.0.0.0", "--port", "5000","--ssl-keyfile=/certs/webhook.key", "--ssl-certfile=/certs/webhook.crt"] diff --git a/tpu-provisioner/admission_controller/README.md b/tpu-provisioner/admission_controller/README.md new file mode 100644 index 000000000..3505fef8a --- /dev/null +++ b/tpu-provisioner/admission_controller/README.md @@ -0,0 +1,46 @@ +# TPU Provisioner Admission Controller + +This is a custom k8s admission controller that can be paired with the TPU provisioner +to dynamically inject node selectors into a Job's pod template based on environment +variables. The TPU provisioner will then provision slices based on the values of +these node selectors. + +**NOTE**: This is not a generalized solution that works out of the box for any user - the values +injected by the admission controller are just examples that the user is responsible +for changing to fit their use case. + +## Project Structure + +``` +|- admission_controller.py (mutating webhook) +|- certificates (add TLS certificates here) +|- manifests (deployment manifest for admission controller) +|- test (unit tests) +| - tests +| |-- admission_controller_test.py (unit tests) +| |-- manual_e2e/ (JobSet manifests for manual e2e tests) +| | ... +``` + +### Prepare container image + +1. Build container image: `docker build admission-controller -f Dockerfile .` +2. Tag container image: `docker tag admission-controller gcr.io/${PROJECT}/admission-controller:v0.1.0` +2. Push container image: `docker push gcr.io/${PROJECT}/admission-controller:v0.1.0` + +Update the Deployment in `manifests/manifest.yaml` with this container image. + +### Run Unit tests + +This project uses [pytest](https://docs.pytest.org) for unit testing. + +To run unit tests, run the command `pytest` from the `admission_controller/` directory. + +### Run E2E tests + +E2E testing is currently done manually via the following steps: + +1. [Install JobSet](https://jobset.sigs.k8s.io/docs/installation/) +2. **Deploy admission controller**: Run `kubectl apply -f manifests/` from the `admission_controller/` directory. +3. **Create a test JobSet**: Run `kubectl apply -f test/test-jobset.yaml` +4. **Check Jobs were mutated correctly**: Run `kubectl describe jobs` and view the nodeSelectors in the pod template. \ No newline at end of file diff --git a/tpu-provisioner/admission_controller/__init__.py b/tpu-provisioner/admission_controller/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tpu-provisioner/admission_controller/admission_controller.py b/tpu-provisioner/admission_controller/admission_controller.py new file mode 100644 index 000000000..e2bb9a0d4 --- /dev/null +++ b/tpu-provisioner/admission_controller/admission_controller.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +import os +import json +import base64 +import logging +import hashlib +from fastapi import FastAPI, Body +from jsonpatch import JsonPatch +from copy import deepcopy + +app = FastAPI() + +webhook_logger = logging.getLogger(__name__) +webhook_logger.setLevel(logging.INFO) +logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(message)s") + +# environment variables +LOCATION_HINT = "RESERVATION_LOCATION_HINT" +ALWAYS_HINT_TIME = "ALWAYS_HINT_TIME" +FORCE_ON_DEMAND = "FORCE_ON_DEMAND" + +# labels +job_key_label = "job-key" +reservation_name_label = "cloud.google.com/reservation-name" +gke_spot_label = "cloud.google.com/gke-spot" +gke_location_hint_label = "cloud.google.com/gke-location-hint" + +# API endpoint +@app.post("/mutate") +def mutate_request(request: dict = Body(...)): + '''API endpoint for the admission controller mutating webhook.''' + uid: str = request["request"]["uid"] + + object_in: dict = request["request"]["object"] + webhook_logger.info(f'Patching {object_in["kind"]} {object_in["metadata"]["namespace"]}/{object_in["metadata"]["name"]}') + + response: dict = admission_review(uid, object_in) + webhook_logger.info(f'Response: {json.dumps(response)}') + return response + + +def admission_review(uid: str, object_in: dict) -> dict: + '''Returns an AdmissionReview JSONPatch for the given AdmissionRequest.''' + return { + "apiVersion": "admission.k8s.io/v1", + "kind": "AdmissionReview", + "response": { + "uid": uid, + "allowed": True, + "patchType": "JSONPatch", + "status": {"message": f"Patched {object_in['kind']}: {object_in['metadata']['namespace']}/{object_in['metadata']['name']}"}, + "patch": patch(object_in), + }, + } + + +def patch(object_in: dict) -> str: + '''Returns a base64 encoded patch for the given k8s object.''' + patches: list[dict] = make_patches(object_in) + return base64.b64encode(str(patches).encode()).decode() + + +def make_patches(object_in: dict) -> JsonPatch: + '''Generates a JsonPatch for Job mutations that are based on environment variables.''' + job_name: str = object_in["metadata"]["name"] + job_namespace: str = object_in["metadata"]["namespace"] + modified_object: dict = deepcopy(object_in) + + if "nodeSelector" not in modified_object["spec"]["template"]["spec"]: + modified_object["spec"]["template"]["spec"]["nodeSelector"] = {} + + # Add job-key node selector unconditionally. + modified_object["spec"]["template"]["spec"]["nodeSelector"][job_key_label] = job_key_value(job_name, job_namespace) + webhook_logger.info(f'Job: {job_name} Added nodeSelector: {job_key_label}: {job_key_value(job_name, job_namespace)}') + + if os.environ.get(FORCE_ON_DEMAND) == "true": + # Remove reservation label if FORCE_ON_DEMAND is set. + if reservation_name_label in modified_object["spec"]["template"]["spec"]["nodeSelector"]: + del modified_object["spec"]["template"]["spec"]["nodeSelector"][reservation_name_label] + webhook_logger.info(f'Job: {job_name} Removed nodeSelector for node label: {reservation_name_label}') + # Remove spot label if FORCE_ON_DEMAND is set. + if gke_spot_label in modified_object["spec"]["template"]["spec"]["nodeSelector"]: + del modified_object["spec"]["template"]["spec"]["nodeSelector"][gke_spot_label] + webhook_logger.info(f'Job: {job_name} Removed nodeSelector for node label: {gke_spot_label}') + + # Set location hint nodeSelector if RESERVATION_LOCATION_HINT is set. + location_hint_value: str = os.environ.get(LOCATION_HINT, "") + if location_hint_value != "": + modified_object["spec"]["template"]["spec"]["nodeSelector"][gke_location_hint_label] = location_hint_value + webhook_logger.info(f'Job: {job_name} Added nodeSelector: {gke_location_hint_label}: {location_hint_value}') + + patch: JsonPatch = JsonPatch.from_diff(object_in, modified_object) + return patch + + +def job_key_value(job_name: str, job_namespace: str) -> str: + '''Returns the SHA1 hash of the namespaced Job name.''' + return sha1(f'{job_namespace}/{job_name}') + + +def sha1(data: str) -> str: + '''Returns the SHA1 hash of the given string.''' + return hashlib.sha1(data.encode()).hexdigest() diff --git a/tpu-provisioner/admission_controller/certificates/README.md b/tpu-provisioner/admission_controller/certificates/README.md new file mode 100644 index 000000000..e5c4bdb8b --- /dev/null +++ b/tpu-provisioner/admission_controller/certificates/README.md @@ -0,0 +1,7 @@ +Two files are required in this directory: + +1. `certificate.crt` +2. `private.key` + + +These are used to configure TLS for network communication to/from the webhook. \ No newline at end of file diff --git a/tpu-provisioner/admission_controller/manifests/manifest.yaml b/tpu-provisioner/admission_controller/manifests/manifest.yaml new file mode 100644 index 000000000..06d8d6935 --- /dev/null +++ b/tpu-provisioner/admission_controller/manifests/manifest.yaml @@ -0,0 +1,78 @@ +apiVersion: v1 +kind: Secret +metadata: + name: admission-tls +type: Opaque +data: + webhook.crt: "" # base64 encoded certificate + webhook.key: "" # base64 encoded private key +--- +apiVersion: v1 +kind: Service +metadata: + name: mutating-webhook +spec: + selector: + app: mutating-webhook + ports: + - port: 5000 +--- +apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook +webhooks: +- name: mutating-webhook.default.svc + matchPolicy: Equivalent + admissionReviewVersions: ["v1"] + sideEffects: None + rules: + - operations: ["CREATE"] + apiGroups: ["batch"] + apiVersions: ["v1"] + resources: ["jobs"] + scope: "Namespaced" + failurePolicy: Ignore + timeoutSeconds: 20 + clientConfig: + caBundle: # base64 CA bundle here + service: + namespace: default + name: mutating-webhook + path: /mutate + port: 5000 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mutating-webhook +spec: + replicas: 1 + selector: + matchLabels: + app: mutating-webhook + template: + metadata: + labels: + app: mutating-webhook + spec: + containers: + - name: mutating-webhook + image: "" # build container image, push to repository and add it here + imagePullPolicy: Always + ports: + - containerPort: 5000 + env: + # Set environment variables for your deployment. + - name: RESERVATION_LOCATION_HINT + value: "cell" + - name: FORCE_ON_DEMAND + value: "false" + volumeMounts: + - name: certs-volume + readOnly: true + mountPath: "/certs" + volumes: + - name: certs-volume + secret: + secretName: admission-tls \ No newline at end of file diff --git a/tpu-provisioner/admission_controller/requirements.txt b/tpu-provisioner/admission_controller/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..17c6198cc5ff641ef6f895dd9267ebdf1b81fca0 GIT binary patch literal 466 zcmX|-U2eia5QF_(saHX4ND(hRO9dg6^k)-=w4#?Ue4ZuIYBy4BkL|hr@0InZPQ6|^ z8?ChECOzv(jlMt{eFnCR|0_5J>p^>s+UuKZ^%_RaeF;X(d(TL*SRanZy( zz?ii=l(YAl3~519%q`kZN9Gkr=dEMnK#N06ZD~+1eie+8*o-5LjIUeS)rnH%u42js zw9|<*Z^R6{6z`IGq