From 0f6cc5bb05b96292a27c69f2e85bffa0422bbc24 Mon Sep 17 00:00:00 2001 From: mprahl Date: Fri, 3 Jan 2025 14:43:55 -0500 Subject: [PATCH] Add instructions to remote debug the Driver pods This makes the driver command configurable so that Delve can be used to execute the driver binary and adds Make targets to build the Driver image for debugging. Signed-off-by: mprahl --- .gitignore | 3 + backend/Dockerfile.driver | 6 +- backend/Makefile | 17 +++ backend/README.md | 106 ++++++++++++++++++ backend/src/v2/compiler/argocompiler/argo.go | 2 + .../src/v2/compiler/argocompiler/container.go | 12 +- backend/src/v2/compiler/argocompiler/dag.go | 2 +- 7 files changed, 144 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 77b17b4a930..f221dec7bb3 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ __pycache__ # kfp local execution default directory local_outputs/ + +# Ignore debug Driver Dockerfile produced from `make -C backend image_driver_debug` +backend/Dockerfile.driver-debug diff --git a/backend/Dockerfile.driver b/backend/Dockerfile.driver index 99008585712..58f841b7754 100644 --- a/backend/Dockerfile.driver +++ b/backend/Dockerfile.driver @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM golang:1.21.7-alpine3.19 as builder +FROM golang:1.21.7-alpine3.19 AS builder + +ARG GCFLAGS="" WORKDIR /go/src/github.com/kubeflow/pipelines @@ -25,7 +27,7 @@ RUN ./hack/install-go-licenses.sh COPY . . -RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go +RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -gcflags="${GCFLAGS}" -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go # Check licenses and comply with license terms. # First, make sure there's no forbidden license. 
diff --git a/backend/Makefile b/backend/Makefile index 1c9512b9341..99a182dc6bd 100644 --- a/backend/Makefile +++ b/backend/Makefile @@ -86,6 +86,16 @@ image_visualization: .PHONY: image_driver image_driver: cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build -t ${IMG_TAG_DRIVER} -f backend/Dockerfile.driver . +.PHONY: image_driver_debug +image_driver_debug: + cd $(MOD_ROOT) && sed -e '/RUN .*go mod download/a\ + RUN go install github.com/go-delve/delve/cmd/dlv@latest' \ + -e '/COPY .*\/bin\/driver \/bin\/driver/a\ + COPY . \/go\/src\/github.com\/kubeflow\/pipelines\ + COPY --from=builder /go/bin/dlv /bin/dlv\ + EXPOSE 2345' \ + backend/Dockerfile.driver > backend/Dockerfile.driver-debug + cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build --build-arg GCFLAGS="all=-N -l" -t ${IMG_TAG_DRIVER}:debug -f backend/Dockerfile.driver-debug . .PHONY: image_launcher image_launcher: cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build -t ${IMG_TAG_LAUNCHER} -f backend/Dockerfile.launcher . @@ -100,3 +110,10 @@ dev-kind-cluster: kubectl apply -k $(CURDIR)/../manifests/kustomize/env/dev-kind kubectl -n kubeflow wait --for condition=Available --timeout=10m deployment/mysql kubectl -n kubeflow wait --for condition=Available --timeout=3m deployment/metadata-grpc-deployment + +.PHONY: kind-load-driver-debug +kind-load-driver-debug: + kind --name $(KIND_NAME) load docker-image ${IMG_TAG_DRIVER}:debug + +.PHONY: kind-build-and-load-driver-debug +kind-build-and-load-driver-debug: image_driver_debug kind-load-driver-debug diff --git a/backend/README.md b/backend/README.md index 54b8de8b645..86cd4e896fc 100644 --- a/backend/README.md +++ b/backend/README.md @@ -167,3 +167,109 @@ You can also directly connect to the MariaDB database server with: ```bash mysql -h 127.0.0.1 -u root ``` + +## Remote Debug the Driver + +These instructions assume you are leveraging the Kind cluster in the +[Run Locally With a Kind Cluster](#run-locally-with-a-kind-cluster) section. 
+ +### Build the Driver Image With Debug Prerequisites + +Run the following to create the `backend/Dockerfile.driver-debug` file and build the container image +tagged as `kfp-driver:debug`. This container image is based on `backend/Dockerfile.driver` but installs +[Delve](https://github.com/go-delve/delve), builds the binary without compiler optimizations so the binary matches the +source code (via `GCFLAGS="all=-N -l"`), and copies the source code to the destination container for the debugger. +Any changes to the Driver code will require rebuilding this container image. + +```bash +make -C backend image_driver_debug +``` + +Then load the container image in the Kind cluster. + +```bash +make -C backend kind-load-driver-debug +``` + +Alternatively, you can use this Make target that does both. + +```bash +make -C backend kind-build-and-load-driver-debug +``` + +### Run the API Server With Debug Configuration + +You may use the following VS Code `launch.json` file to run the API server which overrides the Driver +command to use Delve and the Driver image to use the debug image built previously. + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Launch API server (Kind) (Debug Driver)", + "type": "go", + "request": "launch", + "mode": "debug", + "program": "${workspaceFolder}/backend/src/apiserver", + "env": { + "POD_NAMESPACE": "kubeflow", + "DBCONFIG_MYSQLCONFIG_HOST": "localhost", + "MINIO_SERVICE_SERVICE_HOST": "localhost", + "MINIO_SERVICE_SERVICE_PORT": "9000", + "METADATA_GRPC_SERVICE_SERVICE_HOST": "localhost", + "METADATA_GRPC_SERVICE_SERVICE_PORT": "8080", + "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_HOST": "localhost", + "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_PORT": "8888", + "V2_DRIVER_IMAGE": "kfp-driver:debug", + "V2_DRIVER_COMMAND": "dlv exec --listen=:2345 --headless=true --api-version=2 --log /bin/driver --" + } + } + ] +} +``` + +### Starting a Remote Debug Session + +Start by launching a pipeline. 
This will eventually create a Driver pod that is waiting for a remote debug connection. + +You can see the pods with the following command. + +```bash +kubectl -n kubeflow get pods -w +``` + +Once you see a pod with `-driver` in the name such as `hello-world-clph9-system-dag-driver-10974850`, port forward +the Delve port in the pod to your localhost (replace `<driver-pod-name>` with the actual name). + +```bash +kubectl -n kubeflow port-forward <driver-pod-name> 2345:2345 +``` + +Set a breakpoint on the Driver code in VS Code. Then remotely connect to the Delve debug session with the following VS +Code `launch.json` file: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Connect to remote driver", + "type": "go", + "request": "attach", + "mode": "remote", + "remotePath": "/go/src/github.com/kubeflow/pipelines", + "port": 2345, + "host": "127.0.0.1" + } + ] +} +``` + +Once the Driver pod succeeds, the remote debug session will close. Then repeat the process of forwarding the port +of subsequent Driver pods and starting remote debug sessions in VS Code until the pipeline completes. + +For debugging a specific Driver pod, you'll need to continuously port forward and connect to the remote debug session +without a breakpoint so that Delve will continue execution until the Driver pod you are interested in starts up. At that +point, you can set a breakpoint, port forward, and connect to the remote debug session to debug that specific Driver +pod. diff --git a/backend/src/v2/compiler/argocompiler/argo.go b/backend/src/v2/compiler/argocompiler/argo.go index 1f1c19ed3ec..3d11c407380 100644 --- a/backend/src/v2/compiler/argocompiler/argo.go +++ b/backend/src/v2/compiler/argocompiler/argo.go @@ -125,6 +125,7 @@ func Compile(jobArg *pipelinespec.PipelineJob, kubernetesSpecArg *pipelinespec.S // TODO(chensun): release process and update the images. 
launcherImage: GetLauncherImage(), driverImage: GetDriverImage(), + driverCommand: GetDriverCommand(), job: job, spec: spec, executors: deploy.GetExecutors(), @@ -161,6 +162,7 @@ type workflowCompiler struct { wf *wfapi.Workflow templates map[string]*wfapi.Template driverImage string + driverCommand []string launcherImage string } diff --git a/backend/src/v2/compiler/argocompiler/container.go b/backend/src/v2/compiler/argocompiler/container.go index 989dfffb8c2..03f0e3a119a 100644 --- a/backend/src/v2/compiler/argocompiler/container.go +++ b/backend/src/v2/compiler/argocompiler/container.go @@ -34,6 +34,8 @@ const ( LauncherImageEnvVar = "V2_LAUNCHER_IMAGE" DefaultDriverImage = "gcr.io/ml-pipeline/kfp-driver@sha256:dc8b56a2eb071f30409828a8884d621092e68385af11a6c06aa9e9fbcfbb19de" DriverImageEnvVar = "V2_DRIVER_IMAGE" + DefaultDriverCommand = "driver" + DriverCommandEnvVar = "V2_DRIVER_COMMAND" gcsScratchLocation = "/gcs" gcsScratchName = "gcs-scratch" s3ScratchLocation = "/s3" @@ -91,6 +93,14 @@ func GetDriverImage() string { return driverImage } +// GetDriverCommand returns the whitespace-separated words of V2_DRIVER_COMMAND, or of DefaultDriverCommand when it is unset or blank. +func GetDriverCommand() []string { + if fields := strings.Fields(os.Getenv(DriverCommandEnvVar)); len(fields) > 0 { + return fields + } + return strings.Fields(DefaultDriverCommand) +} + func (c *workflowCompiler) containerDriverTask(name string, inputs containerDriverInputs) (*wfapi.DAGTask, *containerDriverOutputs) { dagTask := &wfapi.DAGTask{ Name: name, @@ -151,7 +161,7 @@ func (c *workflowCompiler) addContainerDriverTemplate() string { }, Container: &k8score.Container{ Image: GetDriverImage(), - Command: []string{"driver"}, + Command: GetDriverCommand(), Args: []string{ "--type", "CONTAINER", "--pipeline_name", c.spec.GetPipelineInfo().GetName(), diff --git a/backend/src/v2/compiler/argocompiler/dag.go b/backend/src/v2/compiler/argocompiler/dag.go index 7c997ee61d4..854ddd3bdaa 100644 --- a/backend/src/v2/compiler/argocompiler/dag.go +++ b/backend/src/v2/compiler/argocompiler/dag.go @@ -480,7 +480,7 
@@ func (c *workflowCompiler) addDAGDriverTemplate() string { }, Container: &k8score.Container{ Image: c.driverImage, - Command: []string{"driver"}, + Command: c.driverCommand, Args: []string{ "--type", inputValue(paramDriverType), "--pipeline_name", c.spec.GetPipelineInfo().GetName(),