diff --git a/.gitignore b/.gitignore index 77b17b4a930..f221dec7bb3 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,6 @@ __pycache__ # kfp local execution default directory local_outputs/ + +# Ignore debug Driver Dockerfile produced from `make -C backend image_driver_debug` +backend/Dockerfile.driver-debug diff --git a/backend/Dockerfile.driver b/backend/Dockerfile.driver index 99008585712..58f841b7754 100644 --- a/backend/Dockerfile.driver +++ b/backend/Dockerfile.driver @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM golang:1.21.7-alpine3.19 as builder +FROM golang:1.21.7-alpine3.19 AS builder + +ARG GCFLAGS="" WORKDIR /go/src/github.com/kubeflow/pipelines @@ -25,7 +27,7 @@ RUN ./hack/install-go-licenses.sh COPY . . -RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go +RUN GO111MODULE=on CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -tags netgo -gcflags="${GCFLAGS}" -ldflags '-extldflags "-static"' -o /bin/driver ./backend/src/v2/cmd/driver/*.go # Check licenses and comply with license terms. # First, make sure there's no forbidden license. diff --git a/backend/Makefile b/backend/Makefile index 1c9512b9341..99a182dc6bd 100644 --- a/backend/Makefile +++ b/backend/Makefile @@ -86,6 +86,16 @@ image_visualization: .PHONY: image_driver image_driver: cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build -t ${IMG_TAG_DRIVER} -f backend/Dockerfile.driver . +.PHONY: image_driver_debug +image_driver_debug: + cd $(MOD_ROOT) && sed -e '/RUN .*go mod download/a\ + RUN go install github.com/go-delve/delve/cmd/dlv@latest' \ + -e '/COPY .*\/bin\/driver \/bin\/driver/a\ + COPY . 
\/go\/src\/github.com\/kubeflow\/pipelines\ + COPY --from=builder /go/bin/dlv /bin/dlv\ + EXPOSE 2345' \ + backend/Dockerfile.driver > backend/Dockerfile.driver-debug + cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build --build-arg GCFLAGS="all=-N -l" -t ${IMG_TAG_DRIVER}:debug -f backend/Dockerfile.driver-debug . .PHONY: image_launcher image_launcher: cd $(MOD_ROOT) && ${CONTAINER_ENGINE} build -t ${IMG_TAG_LAUNCHER} -f backend/Dockerfile.launcher . @@ -100,3 +110,10 @@ dev-kind-cluster: kubectl apply -k $(CURDIR)/../manifests/kustomize/env/dev-kind kubectl -n kubeflow wait --for condition=Available --timeout=10m deployment/mysql kubectl -n kubeflow wait --for condition=Available --timeout=3m deployment/metadata-grpc-deployment + +.PHONY: kind-load-driver-debug +kind-load-driver-debug: + kind --name $(KIND_NAME) load docker-image ${IMG_TAG_DRIVER}:debug + +.PHONY: kind-build-and-load-driver-debug +kind-build-and-load-driver-debug: image_driver_debug kind-load-driver-debug diff --git a/backend/README.md b/backend/README.md index 54b8de8b645..86cd4e896fc 100644 --- a/backend/README.md +++ b/backend/README.md @@ -167,3 +167,109 @@ You can also directly connect to the MariaDB database server with: ```bash mysql -h 127.0.0.1 -u root ``` + +## Remote Debug the Driver + +These instructions assume you are leveraging the Kind cluster in the +[Run Locally With a Kind Cluster](#run-locally-with-a-kind-cluster) section. + +### Build the Driver Image With Debug Prerequisites + +Run the following to create the `backend/Dockerfile.driver-debug` file and build the container image +tagged as `kfp-driver:debug`. This container image is based on `backend/Dockerfile.driver` but installs +[Delve](https://github.com/go-delve/delve), builds the binary without compiler optimizations so the binary matches the +source code (via `GCFLAGS="all=-N -l"`), and copies the source code to the destination container for the debugger. 
+Any changes to the Driver code will require rebuilding this container image. + +```bash +make -C backend image_driver_debug +``` + +Then load the container image in the Kind cluster. + +```bash +make -C backend kind-load-driver-debug +``` + +Alternatively, you can use this Make target that does both. + +```bash +make -C backend kind-build-and-load-driver-debug +``` + +### Run the API Server With Debug Configuration + +You may use the following VS Code `launch.json` file to run the API server. It overrides the Driver +command to use Delve and the Driver image to use the debug image built previously. + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Launch API server (Kind) (Debug Driver)", + "type": "go", + "request": "launch", + "mode": "debug", + "program": "${workspaceFolder}/backend/src/apiserver", + "env": { + "POD_NAMESPACE": "kubeflow", + "DBCONFIG_MYSQLCONFIG_HOST": "localhost", + "MINIO_SERVICE_SERVICE_HOST": "localhost", + "MINIO_SERVICE_SERVICE_PORT": "9000", + "METADATA_GRPC_SERVICE_SERVICE_HOST": "localhost", + "METADATA_GRPC_SERVICE_SERVICE_PORT": "8080", + "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_HOST": "localhost", + "ML_PIPELINE_VISUALIZATIONSERVER_SERVICE_PORT": "8888", + "V2_DRIVER_IMAGE": "kfp-driver:debug", + "V2_DRIVER_COMMAND": "dlv exec --listen=:2345 --headless=true --api-version=2 --log /bin/driver --" + } + } + ] +} +``` + +### Starting a Remote Debug Session + +Start by launching a pipeline. This will eventually create a Driver pod that is waiting for a remote debug connection. + +You can see the pods with the following command. + +```bash +kubectl -n kubeflow get pods -w +``` + +Once you see a pod with `-driver` in the name such as `hello-world-clph9-system-dag-driver-10974850`, port forward +the Delve port in the pod to your localhost (replace `<driver-pod-name>` with the actual name). + +```bash +kubectl -n kubeflow port-forward <driver-pod-name> 2345:2345 +``` + +Set a breakpoint on the Driver code in VS Code. 
Then remotely connect to the Delve debug session with the following VS +Code `launch.json` file: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Connect to remote driver", + "type": "go", + "request": "attach", + "mode": "remote", + "remotePath": "/go/src/github.com/kubeflow/pipelines", + "port": 2345, + "host": "127.0.0.1" + } + ] +} +``` + +Once the Driver pod succeeds, the remote debug session will close. Then repeat the process of forwarding the port +of subsequent Driver pods and starting remote debug sessions in VS Code until the pipeline completes. + +For debugging a specific Driver pod, you'll need to continuously port forward and connect to the remote debug session +without a breakpoint so that Delve will continue execution until the Driver pod you are interested in starts up. At that +point, you can set a breakpoint, port forward, and connect to the remote debug session to debug that specific Driver +pod. diff --git a/backend/src/v2/compiler/argocompiler/argo.go b/backend/src/v2/compiler/argocompiler/argo.go index 1f1c19ed3ec..3d11c407380 100644 --- a/backend/src/v2/compiler/argocompiler/argo.go +++ b/backend/src/v2/compiler/argocompiler/argo.go @@ -125,6 +125,7 @@ func Compile(jobArg *pipelinespec.PipelineJob, kubernetesSpecArg *pipelinespec.S // TODO(chensun): release process and update the images. 
launcherImage: GetLauncherImage(), driverImage: GetDriverImage(), + driverCommand: GetDriverCommand(), job: job, spec: spec, executors: deploy.GetExecutors(), @@ -161,6 +162,7 @@ type workflowCompiler struct { wf *wfapi.Workflow templates map[string]*wfapi.Template driverImage string + driverCommand []string launcherImage string } diff --git a/backend/src/v2/compiler/argocompiler/container.go b/backend/src/v2/compiler/argocompiler/container.go index 989dfffb8c2..03f0e3a119a 100644 --- a/backend/src/v2/compiler/argocompiler/container.go +++ b/backend/src/v2/compiler/argocompiler/container.go @@ -34,6 +34,8 @@ const ( LauncherImageEnvVar = "V2_LAUNCHER_IMAGE" DefaultDriverImage = "gcr.io/ml-pipeline/kfp-driver@sha256:dc8b56a2eb071f30409828a8884d621092e68385af11a6c06aa9e9fbcfbb19de" DriverImageEnvVar = "V2_DRIVER_IMAGE" + DefaultDriverCommand = "driver" + DriverCommandEnvVar = "V2_DRIVER_COMMAND" gcsScratchLocation = "/gcs" gcsScratchName = "gcs-scratch" s3ScratchLocation = "/s3" @@ -91,6 +93,14 @@ func GetDriverImage() string { return driverImage } +func GetDriverCommand() []string { + driverCommand := os.Getenv(DriverCommandEnvVar) + if driverCommand == "" { + driverCommand = DefaultDriverCommand + } + return strings.Split(driverCommand, " ") +} + func (c *workflowCompiler) containerDriverTask(name string, inputs containerDriverInputs) (*wfapi.DAGTask, *containerDriverOutputs) { dagTask := &wfapi.DAGTask{ Name: name, @@ -151,7 +161,7 @@ func (c *workflowCompiler) addContainerDriverTemplate() string { }, Container: &k8score.Container{ Image: GetDriverImage(), - Command: []string{"driver"}, + Command: GetDriverCommand(), Args: []string{ "--type", "CONTAINER", "--pipeline_name", c.spec.GetPipelineInfo().GetName(), diff --git a/backend/src/v2/compiler/argocompiler/dag.go b/backend/src/v2/compiler/argocompiler/dag.go index 7c997ee61d4..854ddd3bdaa 100644 --- a/backend/src/v2/compiler/argocompiler/dag.go +++ b/backend/src/v2/compiler/argocompiler/dag.go @@ -480,7 +480,7 
@@ func (c *workflowCompiler) addDAGDriverTemplate() string { }, Container: &k8score.Container{ Image: c.driverImage, - Command: []string{"driver"}, + Command: c.driverCommand, Args: []string{ "--type", inputValue(paramDriverType), "--pipeline_name", c.spec.GetPipelineInfo().GetName(),