Skip to content
This repository has been archived by the owner on Aug 9, 2024. It is now read-only.

Add MPIJob to configurator #203

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions controller/cmd/configurator/app/configurator.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,16 @@ func (c *Configurator) Run(opt *AppOption) error {

// Modify the manifest as specified by the kubebench job
modSpec := configSpec.ManifestModSpec

// If the namespace in the ManifestModSpec is empty, replace it with the
// configurator's own namespace, which is the same as the workflow's. This
// covers cases where the ManifestModSpec is created before the namespace is set.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove this?

if err := replaceEmptyNamespace(modSpec); err != nil {
log.Errorf("Failed to replace empty namespace: %s", err)
return err
}

modifiedManifest, err := manifestmod.NewManifestModifier(modSpec).ModifyManifest(manifest)
if err != nil {
log.Errorf("Failed to modify manifest: %s", err)
Expand Down
9 changes: 5 additions & 4 deletions controller/examples/crd.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
name: kubebenchjobs.kubebench.operator
name: kubebenchjobs.kubeflow.org
namespace: kubebench
spec:
group: kubebench.operator
version: v1
group: kubeflow.org
names:
kind: KubebenchJob
plural: kubebenchjobs
scope: Namespaced
scope: Namespaced
version: v1alpha2
109 changes: 63 additions & 46 deletions controller/examples/job-example.yaml
Original file line number Diff line number Diff line change
@@ -1,53 +1,70 @@
#uncomment lines
apiVersion: kubebench.operator/v1
apiVersion: kubeflow.org/v1alpha2
kind: KubebenchJob
metadata:
name: kubebench-job
namespace: default
namespace: kirill-kubebench
spec:
serviceAccount: default
volumeSpecs:
configVolume:
name: my-config-volume
persistentVolumeClaim:
claimName: kubebench-config-pvc
volumes:
- name: kubebench-config-volume
configMap:
name: kubebench-config-pvc
- name: kubebench-data-volume
persistentVolumeClaim:
claimName: kubebench-data-pvc
managedVolumes:
experimentVolume:
name: my-experiment-volume
name: kubebench-experiment-volume
persistentVolumeClaim:
claimName: kubebench-experiment-pvc
workflowVolume:
name: kubebench-workflow-volume
persistentVolumeClaim:
claimName: kubebench-exp-pvc
# secretSpecs: # optional
# githubTokenSecret: # optional
# secretName: my-github-token-secret
# secretKey: my-github-token-secret-key
# gcpCredentialsSecret: # optional
# secretName: my-gcp-credentials-secret
# secretKey: my-gcp-credentials-secret-key
jobSpecs:
preJob: # optional
container: # optional between "container" and "resource"
name: my-prejob
image: gcr.io/myprejob-image:latest # change it before using
mainJob: # mandatory
resource: # optional between "container" and "resource"
manifestTemplate:
valueFrom:
ksonnet: # optional, more types in the future
name: kubebench-example-tfcnn-with-monitoring
package: kubebench-examples
registry: /kubebench/config/registry/kubebench
manifestParameters:
valueFrom:
path: tf-cnn/tf-cnn-dummy.yaml
createSuccessCondition: createSuccess # optional
createFailureCondition: createFailure # optional
runSuccessCondition: runSuccess # optional
runFailureCondition: runFailure # optional
#other optional fields: "manifest" - string of raw manifest
postJob: # optional
container: # optional between "container" and "resource"
name: my-postjob
image: gcr.io/kubeflow-images-public/kubebench/kubebench-example-tf-cnn-post-processor:3c75b50
reportSpecs: # optional
csv: # optional
- inputPath: result.json
outputPath: report.csv
claimName: kubebench-workflow-pvc
workflowAgent:
container:
volumeMounts:
- name: kubebench-config-volume
mountPath: /kubebench/config
tasks:
- name: download-data
container:
name: download-dataset
image: kirill-mlperf-download-image:latest
volumeMounts:
- name: kubebench-data-volume
mountPath: /data
options:
mountManagedVolumes: true
autoWatch:
timeout: 5m
autoDelete: true
numCopies: 1
- name: preprocess-data
container:
name: preprocess-dataset
image: kirill-mlperf-preprocess-image:latest
volumeMounts:
- name: kubebench-data-volume
mountPath: /data
options:
mountManagedVolumes: true
autoWatch:
timeout: 5m
autoDelete: true
numCopies: 1
- name: run-training
dependencies: [download-data, preprocess-data]
container:
name: run-training
image: kirill-mlperf-training-image:latest
volumeMounts:
- name: kubebench-data-volume
mountPath: /data
options:
mountManagedVolumes: true
autoWatch:
timeout: 5m
autoDelete: true
numCopies: 1

16 changes: 16 additions & 0 deletions controller/examples/mpi-job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# This file shows how to run multi-node training benchmarks using an MPIJob,
# specifying GPUs explicitly per replica.
apiVersion: kubeflow.org/v1alpha1
kind: MPIJob
metadata:
name: training-imagenet
spec:
replicas: 3
template:
spec:
containers:
- image: docker.io/akado2009/kirill-mlperf-training:latest
name: training-imagenet
resources:
limits:
nvidia.com/gpu: 1
7 changes: 7 additions & 0 deletions controller/examples/resource-mod.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
namespace: kubebench
volumes:
- name: efs
nfs:
server: fs-ab134502.efs.us-west-2.amazonaws.com
path: /
readOnly: true

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

Loading