diff --git a/.github/workflows/docker-builds.yaml b/.github/workflows/docker-builds.yaml index 77bc97f..c22eee4 100644 --- a/.github/workflows/docker-builds.yaml +++ b/.github/workflows/docker-builds.yaml @@ -10,13 +10,17 @@ jobs: strategy: fail-fast: false matrix: -# Tutorial is over - these builds are disabled -# test: [["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.hub", "ghcr.io/flux-framework/flux-jupyter-hub:2023"], -# ["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.init", "ghcr.io/flux-framework/flux-jupyter-init:2023"], -# ["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.spawn", "ghcr.io/flux-framework/flux-jupyter-spawn:2023"]] - test: [["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.hub", "ghcr.io/flux-framework/flux-jupyter-hub:riken-2024"], - ["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.init", "ghcr.io/flux-framework/flux-jupyter-init:riken-2024"], - ["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.spawn", "ghcr.io/flux-framework/flux-jupyter-spawn:riken-2024"]] + test: [["2024-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.hub", "ghcr.io/flux-framework/flux-jupyter-hub:radiuss-2024"], + ["2024-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.init", "ghcr.io/flux-framework/flux-jupyter-init:radiuss-2024"], + ["2024-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.spawn", "ghcr.io/flux-framework/flux-jupyter-spawn:radiuss-2024"]] + +# Tutorials are over - these builds are disabled +# ["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.hub", "ghcr.io/flux-framework/flux-jupyter-hub:2023"], +# ["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.init", "ghcr.io/flux-framework/flux-jupyter-init:2023"], +# ["2023-RADIUSS-AWS/JupyterNotebook", "docker/Dockerfile.spawn", "ghcr.io/flux-framework/flux-jupyter-spawn:2023"]] +# ["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.hub", "ghcr.io/flux-framework/flux-jupyter-hub:riken-2024"], +# ["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.init", "ghcr.io/flux-framework/flux-jupyter-init:riken-2024"], +# ["2024-RIKEN-AWS/JupyterNotebook", "docker/Dockerfile.spawn", "ghcr.io/flux-framework/flux-jupyter-spawn:riken-2024"]] steps: - name: Clone the code diff --git a/2024-RADIUSS-AWS/JupyterNotebook/README.md b/2024-RADIUSS-AWS/JupyterNotebook/README.md new file mode 100644 index 0000000..af522db --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/README.md @@ -0,0 +1,1024 @@ +# Flux + Jupyter via KubeSpawner + +This set of tutorials provides: + + - [Building Base Images](#build-images) + - [Local Development or Usage](#local-usage) + - [Deploy A Cluster to AWS or Google Cloud Using](#deploy-to-kubernetes) using Google Cloud or AWS + +Pre-requisites: + + - kubectl, (eksctl|gcloud), and (optionally) docker installed locally + - A cloud with a Kubernetes cluster deployed (AWS and Google) + - Excitement to learn about Flux! + +For AWS Tutorial Day users: + +> To run the AWS tutorial, visit https://tutorial.flux-framework.org. You can use any login you want, but choose something relatvely uncommon (like your email address) or you may end up sharing a JupyterLab instance with another user. The tutorial password will be provided to you. + +## Build Images + +Let's build a set of images - one spawner and one hub, and an init. You can customize the tag to your liking. +Remember that if you just want to test locally, you can jump to the [local usage](#local-usage) section. + +```bash +docker build -t ghcr.io/flux-framework/flux-jupyter-hub:radiuss-2024 -f docker/Dockerfile.hub . 
+docker build -t ghcr.io/flux-framework/flux-jupyter-spawn:radiuss-2024 -f docker/Dockerfile.spawn . +docker build -t ghcr.io/flux-framework/flux-jupyter-init:radiuss-2024 -f docker/Dockerfile.init . +``` + +Note that these are available under the flux-framework organization GitHub packages, so you shouldn't need +to build them unless you are developing or changing them. + +If you do build (and use a different name) be sure to push your images to a public registry (or load them locally to your development cluster). + +## Local Usage + +While the tutorial here is intended for deployment on AWS or Google Cloud, you can also give it a try on your local machine with a single container! You will need to [install Docker](https://docs.docker.com/engine/install/). When you have Docker available, you can build and run the tutorial with: + +```bash +docker build -t flux-tutorial -f docker/Dockerfile.spawn . +docker network create jupyterhub + +# Here is how to run an entirely contained tutorial (the notebook in the container) +docker run --rm -it --entrypoint /start.sh -v /var/run/docker.sock:/var/run/docker.sock --net jupyterhub --name jupyterhub -p 8888:8888 flux-tutorial +``` + +If you want to develop the ipynb files, you can bind the tutorials directory: + +```bash +docker run --rm -it --entrypoint /start.sh -v $PWD/tutorial:/home/jovyan/flux-tutorial-2024 -v /var/run/docker.sock:/var/run/docker.sock --net jupyterhub --name jupyterhub -p 8888:8888 flux-tutorial +``` + +And then editing and saving will save to your host. You can also File -> Download if you forget to do +this bind. Either way, when the container is running you can open the localhost or 127.0.0.1 (home sweet home!) link in your browser on port 8888. You'll want to go to flux-tutorial-2024 -> notebook to see the notebook. +You'll need to select http only (and bypass the no certificate warning). + +## Deploy to Kubernetes + +### 1. Create Cluster + +#### Google Cloud + +Here is how to create the cluster on Google Cloud using [gcloud](https://cloud.google.com/sdk/docs/install) (and assuming you have logged in +with [gcloud auth login](https://cloud.google.com/sdk/gcloud/reference/auth/login): + +```bash +export GOOGLE_PROJECT=myproject +gcloud container clusters create flux-jupyter --project $GOOGLE_PROJECT \ + --zone us-central1-a --machine-type n1-standard-2 \ + --num-nodes=4 --enable-network-policy --enable-intra-node-visibility +``` + +#### AWS + +Here is how to create an equivalent cluster on AWS (EKS). We will be using [eksctl](https://eksctl.io/introduction/), which +you should install. + +```bash +# Create an EKS cluster with autoscaling with default storage +eksctl create cluster --config-file aws/eksctl-config.yaml + +# Create an EKS cluster with io1 node storage but no autoscaling, used for the RADIUSS 2023 tutorial +eksctl create cluster --config-file aws/eksctl-radiuss-tutorial-2023.yaml +``` + +You can find vanilla (manual) instructions [here](https://z2jh.jupyter.org/en/stable/kubernetes/amazon/step-zero-aws-eks.html) if you +are interested in how it works. We emulate the logic there using eksctl. Then generate a secret token - we will add this to [config-aws.yaml](aws/config-aws.yaml) (without SSL) or [config-aws-ssl.yaml](aws/config-aws-ssl.yaml) (with SSL). 
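+
+One way to generate that token, assuming you have `openssl` available locally (a 32-byte hex string is the usual Zero to JupyterHub convention; any sufficiently random string works):
+
+```bash
+# Generate a random 32-byte hex token to use as the secret in your config*.yaml
+openssl rand -hex 32
+```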
When your cluster is ready, this will deploy an EBS CSI driver: + +```bash +kubectl apply -k "github.com/kubernetes-sigs/aws-ebs-csi-driver/deploy/kubernetes/overlays/stable/?ref=master" +``` + +And install the cluster-autoscaler: + +```bash +kubectl apply -f aws/cluster-autoscaler-autodiscover.yaml +``` + +If you want to use a different storage class than the default (`gp2`), you also need to create the new storage class (`gp3` here) and set it as the default storage class: + +```bash +kubectl apply -f aws/storageclass.yaml +kubectl patch storageclass gp3 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' +kubectl patch storageclass gp2 -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"false"}}}' +``` + +Most of the information I needed to read about this was [here](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md) - the Jupyter documentation wasn't super helpful beyond saying to install it. Also note that I got this (seemingly working) without the `propagateASGTags` set to true, but that is something that I've seen have issue. +You can look at the autoscaler pod logs for information. + +While the spawned containers (e.g., where you run your notebook) don't use these volumes, the hub will. +You can read about [gp2](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-volume-types.html) class. +Note that we will be using [config-aws.yaml](aws/config-aws.yaml) if you don't need SSL, and [config-aws-ssl.yaml](aws/config-aws-ssl.yaml) if you do. For the latter, the jupyter spawner will generate let's encrypt certificates for us, given that we have correctly configured DNS. + +### 2. Deploy JupyterHub + +We will use [helm](https://helm.sh/docs/helm/helm_install/) to install charts and deploy. + +```bash +helm repo add jupyterhub https://hub.jupyter.org/helm-chart/ +helm repo update +``` + +You can see the versions available: + +```bash +helm search repo jupyterhub +``` +```console +NAME CHART VERSION APP VERSION DESCRIPTION +bitnami/jupyterhub 4.2.0 4.0.2 JupyterHub brings the power of notebooks to gro... +jupyterhub/jupyterhub 3.0.2 4.0.2 Multi-user Jupyter installation +jupyterhub/pebble 1.0.1 v2.3.1 This Helm chart bootstraps Pebble: an ACME serv... +``` + +Note that chart versions don't always coincide with software (or "app") versions. At the time of writing this, +we are using the jupyterhub/jupyterhub 3.0.2/4.0.2 versions, and our container bases point to 3.0.2 tags for the +corresponding images. Next, see the values we can set, which likely will come from a config*.yaml that we will choose. + +```bash +helm show values jupyterhub/jupyterhub +``` + +
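+
+Optionally, you can save those defaults to a file so you can compare them against the config*.yaml files in this repository later (just a convenience, not required for the tutorial):
+
+```bash
+# Keep a copy of the chart defaults to diff against our config*.yaml overrides
+helm show values jupyterhub/jupyterhub > jupyterhub-default-values.yaml
+```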
+ +Example values for the jupyterhub helm chart + +```console +# fullnameOverride and nameOverride distinguishes blank strings, null values, +# and non-blank strings. For more details, see the configuration reference. +fullnameOverride: "" +nameOverride: + +# enabled is ignored by the jupyterhub chart itself, but a chart depending on +# the jupyterhub chart conditionally can make use this config option as the +# condition. +enabled: + +# custom can contain anything you want to pass to the hub pod, as all passed +# Helm template values will be made available there. +custom: {} + +# imagePullSecret is configuration to create a k8s Secret that Helm chart's pods +# can get credentials from to pull their images. +imagePullSecret: + create: false + automaticReferenceInjection: true + registry: + username: + password: + email: +# imagePullSecrets is configuration to reference the k8s Secret resources the +# Helm chart's pods can get credentials from to pull their images. +imagePullSecrets: [] + +# hub relates to the hub pod, responsible for running JupyterHub, its configured +# Authenticator class KubeSpawner, and its configured Proxy class +# ConfigurableHTTPProxy. KubeSpawner creates the user pods, and +# ConfigurableHTTPProxy speaks with the actual ConfigurableHTTPProxy server in +# the proxy pod. +hub: + revisionHistoryLimit: + config: + JupyterHub: + admin_access: true + authenticator_class: dummy + service: + type: ClusterIP + annotations: {} + ports: + nodePort: + extraPorts: [] + loadBalancerIP: + baseUrl: / + cookieSecret: + initContainers: [] + nodeSelector: {} + tolerations: [] + concurrentSpawnLimit: 64 + consecutiveFailureLimit: 5 + activeServerLimit: + deploymentStrategy: + ## type: Recreate + ## - sqlite-pvc backed hubs require the Recreate deployment strategy as a + ## typical PVC storage can only be bound to one pod at the time. + ## - JupyterHub isn't designed to support being run in parallel. More work + ## needs to be done in JupyterHub itself for a fully highly available (HA) + ## deployment of JupyterHub on k8s is to be possible. + type: Recreate + db: + type: sqlite-pvc + upgrade: + pvc: + annotations: {} + selector: {} + accessModes: + - ReadWriteOnce + storage: 1Gi + subPath: + storageClassName: + url: + password: + labels: {} + annotations: {} + command: [] + args: [] + extraConfig: {} + extraFiles: {} + extraEnv: {} + extraContainers: [] + extraVolumes: [] + extraVolumeMounts: [] + image: + name: jupyterhub/k8s-hub + tag: "3.0.2" + pullPolicy: + pullSecrets: [] + resources: {} + podSecurityContext: + fsGroup: 1000 + containerSecurityContext: + runAsUser: 1000 + runAsGroup: 1000 + allowPrivilegeEscalation: false + lifecycle: {} + loadRoles: {} + services: {} + pdb: + enabled: false + maxUnavailable: + minAvailable: 1 + networkPolicy: + enabled: true + ingress: [] + egress: [] + egressAllowRules: + cloudMetadataServer: true + dnsPortsCloudMetadataServer: true + dnsPortsKubeSystemNamespace: true + dnsPortsPrivateIPs: true + nonPrivateIPs: true + privateIPs: true + interNamespaceAccessLabels: ignore + allowedIngressPorts: [] + allowNamedServers: false + namedServerLimitPerUser: + authenticatePrometheus: + redirectToServer: + shutdownOnLogout: + templatePaths: [] + templateVars: {} + livenessProbe: + # The livenessProbe's aim to give JupyterHub sufficient time to startup but + # be able to restart if it becomes unresponsive for ~5 min. 
+ enabled: true + initialDelaySeconds: 300 + periodSeconds: 10 + failureThreshold: 30 + timeoutSeconds: 3 + readinessProbe: + # The readinessProbe's aim is to provide a successful startup indication, + # but following that never become unready before its livenessProbe fail and + # restarts it if needed. To become unready following startup serves no + # purpose as there are no other pod to fallback to in our non-HA deployment. + enabled: true + initialDelaySeconds: 0 + periodSeconds: 2 + failureThreshold: 1000 + timeoutSeconds: 1 + existingSecret: + serviceAccount: + create: true + name: + annotations: {} + extraPodSpec: {} + +rbac: + create: true + +# proxy relates to the proxy pod, the proxy-public service, and the autohttps +# pod and proxy-http service. +proxy: + secretToken: + annotations: {} + deploymentStrategy: + ## type: Recreate + ## - JupyterHub's interaction with the CHP proxy becomes a lot more robust + ## with this configuration. To understand this, consider that JupyterHub + ## during startup will interact a lot with the k8s service to reach a + ## ready proxy pod. If the hub pod during a helm upgrade is restarting + ## directly while the proxy pod is making a rolling upgrade, the hub pod + ## could end up running a sequence of interactions with the old proxy pod + ## and finishing up the sequence of interactions with the new proxy pod. + ## As CHP proxy pods carry individual state this is very error prone. One + ## outcome when not using Recreate as a strategy has been that user pods + ## have been deleted by the hub pod because it considered them unreachable + ## as it only configured the old proxy pod but not the new before trying + ## to reach them. + type: Recreate + ## rollingUpdate: + ## - WARNING: + ## This is required to be set explicitly blank! Without it being + ## explicitly blank, k8s will let eventual old values under rollingUpdate + ## remain and then the Deployment becomes invalid and a helm upgrade would + ## fail with an error like this: + ## + ## UPGRADE FAILED + ## Error: Deployment.apps "proxy" is invalid: spec.strategy.rollingUpdate: Forbidden: may not be specified when strategy `type` is 'Recreate' + ## Error: UPGRADE FAILED: Deployment.apps "proxy" is invalid: spec.strategy.rollingUpdate: Forbidden: may not be specified when strategy `type` is 'Recreate' + rollingUpdate: + # service relates to the proxy-public service + service: + type: LoadBalancer + labels: {} + annotations: {} + nodePorts: + http: + https: + disableHttpPort: false + extraPorts: [] + loadBalancerIP: + loadBalancerSourceRanges: [] + # chp relates to the proxy pod, which is responsible for routing traffic based + # on dynamic configuration sent from JupyterHub to CHP's REST API. + chp: + revisionHistoryLimit: + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + image: + name: jupyterhub/configurable-http-proxy + # tag is automatically bumped to new patch versions by the + # watch-dependencies.yaml workflow. 
+ # + tag: "4.5.6" # https://github.com/jupyterhub/configurable-http-proxy/tags + pullPolicy: + pullSecrets: [] + extraCommandLineFlags: [] + livenessProbe: + enabled: true + initialDelaySeconds: 60 + periodSeconds: 10 + failureThreshold: 30 + timeoutSeconds: 3 + readinessProbe: + enabled: true + initialDelaySeconds: 0 + periodSeconds: 2 + failureThreshold: 1000 + timeoutSeconds: 1 + resources: {} + defaultTarget: + errorTarget: + extraEnv: {} + nodeSelector: {} + tolerations: [] + networkPolicy: + enabled: true + ingress: [] + egress: [] + egressAllowRules: + cloudMetadataServer: true + dnsPortsCloudMetadataServer: true + dnsPortsKubeSystemNamespace: true + dnsPortsPrivateIPs: true + nonPrivateIPs: true + privateIPs: true + interNamespaceAccessLabels: ignore + allowedIngressPorts: [http, https] + pdb: + enabled: false + maxUnavailable: + minAvailable: 1 + extraPodSpec: {} + # traefik relates to the autohttps pod, which is responsible for TLS + # termination when proxy.https.type=letsencrypt. + traefik: + revisionHistoryLimit: + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + image: + name: traefik + # tag is automatically bumped to new patch versions by the + # watch-dependencies.yaml workflow. + # + tag: "v2.10.4" # ref: https://hub.docker.com/_/traefik?tab=tags + pullPolicy: + pullSecrets: [] + hsts: + includeSubdomains: false + preload: false + maxAge: 15724800 # About 6 months + resources: {} + labels: {} + extraInitContainers: [] + extraEnv: {} + extraVolumes: [] + extraVolumeMounts: [] + extraStaticConfig: {} + extraDynamicConfig: {} + nodeSelector: {} + tolerations: [] + extraPorts: [] + networkPolicy: + enabled: true + ingress: [] + egress: [] + egressAllowRules: + cloudMetadataServer: true + dnsPortsCloudMetadataServer: true + dnsPortsKubeSystemNamespace: true + dnsPortsPrivateIPs: true + nonPrivateIPs: true + privateIPs: true + interNamespaceAccessLabels: ignore + allowedIngressPorts: [http, https] + pdb: + enabled: false + maxUnavailable: + minAvailable: 1 + serviceAccount: + create: true + name: + annotations: {} + extraPodSpec: {} + secretSync: + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + image: + name: jupyterhub/k8s-secret-sync + tag: "3.0.2" + pullPolicy: + pullSecrets: [] + resources: {} + labels: {} + https: + enabled: false + type: letsencrypt + #type: letsencrypt, manual, offload, secret + letsencrypt: + contactEmail: + # Specify custom server here (https://acme-staging-v02.api.letsencrypt.org/directory) to hit staging LE + acmeServer: https://acme-v02.api.letsencrypt.org/directory + manual: + key: + cert: + secret: + name: + key: tls.key + crt: tls.crt + hosts: [] + +# singleuser relates to the configuration of KubeSpawner which runs in the hub +# pod, and its spawning of user pods such as jupyter-myusername. +singleuser: + podNameTemplate: + extraTolerations: [] + nodeSelector: {} + extraNodeAffinity: + required: [] + preferred: [] + extraPodAffinity: + required: [] + preferred: [] + extraPodAntiAffinity: + required: [] + preferred: [] + networkTools: + image: + name: jupyterhub/k8s-network-tools + tag: "3.0.2" + pullPolicy: + pullSecrets: [] + resources: {} + cloudMetadata: + # block set to true will append a privileged initContainer using the + # iptables to block the sensitive metadata server at the provided ip. 
+ blockWithIptables: true + ip: 169.254.169.254 + networkPolicy: + enabled: true + ingress: [] + egress: [] + egressAllowRules: + cloudMetadataServer: false + dnsPortsCloudMetadataServer: true + dnsPortsKubeSystemNamespace: true + dnsPortsPrivateIPs: true + nonPrivateIPs: true + privateIPs: false + interNamespaceAccessLabels: ignore + allowedIngressPorts: [] + events: true + extraAnnotations: {} + extraLabels: + hub.jupyter.org/network-access-hub: "true" + extraFiles: {} + extraEnv: {} + lifecycleHooks: {} + initContainers: [] + extraContainers: [] + allowPrivilegeEscalation: false + uid: 1000 + fsGid: 100 + serviceAccountName: + storage: + type: dynamic + extraLabels: {} + extraVolumes: [] + extraVolumeMounts: [] + static: + pvcName: + subPath: "{username}" + capacity: 10Gi + homeMountPath: /home/jovyan + dynamic: + storageClass: + pvcNameTemplate: claim-{username}{servername} + volumeNameTemplate: volume-{username}{servername} + storageAccessModes: [ReadWriteOnce] + image: + name: jupyterhub/k8s-singleuser-sample + tag: "3.0.2" + pullPolicy: + pullSecrets: [] + startTimeout: 300 + cpu: + limit: + guarantee: + memory: + limit: + guarantee: 1G + extraResource: + limits: {} + guarantees: {} + cmd: jupyterhub-singleuser + defaultUrl: + extraPodConfig: {} + profileList: [] + +# scheduling relates to the user-scheduler pods and user-placeholder pods. +scheduling: + userScheduler: + enabled: true + revisionHistoryLimit: + replicas: 2 + logLevel: 4 + # plugins are configured on the user-scheduler to make us score how we + # schedule user pods in a way to help us schedule on the most busy node. By + # doing this, we help scale down more effectively. It isn't obvious how to + # enable/disable scoring plugins, and configure them, to accomplish this. + # + # plugins ref: https://kubernetes.io/docs/reference/scheduling/config/#scheduling-plugins-1 + # migration ref: https://kubernetes.io/docs/reference/scheduling/config/#scheduler-configuration-migrations + # + plugins: + score: + # These scoring plugins are enabled by default according to + # https://kubernetes.io/docs/reference/scheduling/config/#scheduling-plugins + # 2022-02-22. + # + # Enabled with high priority: + # - NodeAffinity + # - InterPodAffinity + # - NodeResourcesFit + # - ImageLocality + # Remains enabled with low default priority: + # - TaintToleration + # - PodTopologySpread + # - VolumeBinding + # Disabled for scoring: + # - NodeResourcesBalancedAllocation + # + disabled: + # We disable these plugins (with regards to scoring) to not interfere + # or complicate our use of NodeResourcesFit. + - name: NodeResourcesBalancedAllocation + # Disable plugins to be allowed to enable them again with a different + # weight and avoid an error. + - name: NodeAffinity + - name: InterPodAffinity + - name: NodeResourcesFit + - name: ImageLocality + enabled: + - name: NodeAffinity + weight: 14631 + - name: InterPodAffinity + weight: 1331 + - name: NodeResourcesFit + weight: 121 + - name: ImageLocality + weight: 11 + pluginConfig: + # Here we declare that we should optimize pods to fit based on a + # MostAllocated strategy instead of the default LeastAllocated. 
+ - name: NodeResourcesFit + args: + scoringStrategy: + resources: + - name: cpu + weight: 1 + - name: memory + weight: 1 + type: MostAllocated + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + image: + # IMPORTANT: Bumping the minor version of this binary should go hand in + # hand with an inspection of the user-scheduelrs RBAC resources + # that we have forked in + # templates/scheduling/user-scheduler/rbac.yaml. + # + # Debugging advice: + # + # - Is configuration of kube-scheduler broken in + # templates/scheduling/user-scheduler/configmap.yaml? + # + # - Is the kube-scheduler binary's compatibility to work + # against a k8s api-server that is too new or too old? + # + # - You can update the GitHub workflow that runs tests to + # include "deploy/user-scheduler" in the k8s namespace report + # and reduce the user-scheduler deployments replicas to 1 in + # dev-config.yaml to get relevant logs from the user-scheduler + # pods. Inspect the "Kubernetes namespace report" action! + # + # - Typical failures are that kube-scheduler fails to search for + # resources via its "informers", and won't start trying to + # schedule pods before they succeed which may require + # additional RBAC permissions or that the k8s api-server is + # aware of the resources. + # + # - If "successfully acquired lease" can be seen in the logs, it + # is a good sign kube-scheduler is ready to schedule pods. + # + name: registry.k8s.io/kube-scheduler + # tag is automatically bumped to new patch versions by the + # watch-dependencies.yaml workflow. The minor version is pinned in the + # workflow, and should be updated there if a minor version bump is done + # here. We aim to stay around 1 minor version behind the latest k8s + # version. + # + tag: "v1.26.7" # ref: https://github.com/kubernetes/kubernetes/tree/master/CHANGELOG + pullPolicy: + pullSecrets: [] + nodeSelector: {} + tolerations: [] + labels: {} + annotations: {} + pdb: + enabled: true + maxUnavailable: 1 + minAvailable: + resources: {} + serviceAccount: + create: true + name: + annotations: {} + extraPodSpec: {} + podPriority: + enabled: false + globalDefault: false + defaultPriority: 0 + imagePullerPriority: -5 + userPlaceholderPriority: -10 + userPlaceholder: + enabled: true + image: + name: registry.k8s.io/pause + # tag is automatically bumped to new patch versions by the + # watch-dependencies.yaml workflow. 
+ # + # If you update this, also update prePuller.pause.image.tag + # + tag: "3.9" + pullPolicy: + pullSecrets: [] + revisionHistoryLimit: + replicas: 0 + labels: {} + annotations: {} + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + resources: {} + corePods: + tolerations: + - key: hub.jupyter.org/dedicated + operator: Equal + value: core + effect: NoSchedule + - key: hub.jupyter.org_dedicated + operator: Equal + value: core + effect: NoSchedule + nodeAffinity: + matchNodePurpose: prefer + userPods: + tolerations: + - key: hub.jupyter.org/dedicated + operator: Equal + value: user + effect: NoSchedule + - key: hub.jupyter.org_dedicated + operator: Equal + value: user + effect: NoSchedule + nodeAffinity: + matchNodePurpose: prefer + +# prePuller relates to the hook|continuous-image-puller DaemonsSets +prePuller: + revisionHistoryLimit: + labels: {} + annotations: {} + resources: {} + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + extraTolerations: [] + # hook relates to the hook-image-awaiter Job and hook-image-puller DaemonSet + hook: + enabled: true + pullOnlyOnChanges: true + # image and the configuration below relates to the hook-image-awaiter Job + image: + name: jupyterhub/k8s-image-awaiter + tag: "3.0.2" + pullPolicy: + pullSecrets: [] + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + podSchedulingWaitDuration: 10 + nodeSelector: {} + tolerations: [] + resources: {} + serviceAccount: + create: true + name: + annotations: {} + continuous: + enabled: true + pullProfileListImages: true + extraImages: {} + pause: + containerSecurityContext: + runAsUser: 65534 # nobody user + runAsGroup: 65534 # nobody group + allowPrivilegeEscalation: false + image: + name: registry.k8s.io/pause + # tag is automatically bumped to new patch versions by the + # watch-dependencies.yaml workflow. + # + # If you update this, also update scheduling.userPlaceholder.image.tag + # + tag: "3.9" + pullPolicy: + pullSecrets: [] + +ingress: + enabled: false + annotations: {} + ingressClassName: + hosts: [] + pathSuffix: + pathType: Prefix + tls: [] + +# cull relates to the jupyterhub-idle-culler service, responsible for evicting +# inactive singleuser pods. +# +# The configuration below, except for enabled, corresponds to command-line flags +# for jupyterhub-idle-culler as documented here: +# https://github.com/jupyterhub/jupyterhub-idle-culler#as-a-standalone-script +# +cull: + enabled: true + users: false # --cull-users + adminUsers: true # --cull-admin-users + removeNamedServers: false # --remove-named-servers + timeout: 3600 # --timeout + every: 600 # --cull-every + concurrency: 10 # --concurrency + maxAge: 0 # --max-age + +debug: + enabled: false + +global: + safeToShowValues: false +``` + +
+ +#### Changes You Might Need to Make: + +- Change the config*.yaml image-> name and tag that you deploy to use your images. +- You might want to change the number of user placeholder pods +- Also change the hub->concurrentSpawnLimit +- Change the password, ssl secret, and domain name if applicable +- Change the aws/eksctl-config.yaml autoscaling ranges depending on your needs. +- Remove pullPolicy Always if you don't expect to want to update/re-pull an image every time (ideal for production) + +And here is how to deploy, assuming the default namespace. Please choose your cloud appropriately! + +```bash +# This is for Google Cloud +helm install flux-jupyter jupyterhub/jupyterhub --values gcp/config.yaml + +# This is for Amazon EKS without SSL +helm install flux-jupyter jupyterhub/jupyterhub --values aws/config-aws.yaml + +# This is for Amazon EKS with SSL (assuming DNS is configured) +helm install flux-jupyter jupyterhub/jupyterhub --values aws/config-aws-ssl.yaml +``` + +If you mess something up, you can change the file and run `helm upgrade`: + +```bash +helm upgrade flux-jupyter jupyterhub/jupyterhub --values aws/config-aws-ssl.yaml +``` + +If you REALLY mess something up, you can tear the whole thing down and then install again: + +```bash +helm uninstall flux-jupyter +``` + +Note that in practice of bringing this up and down many times, we have seen the proxy-public +not create a handful of times. If this happens, just tear down everything, wait for all pods +to terminate, and then start freshly. When you run a command, also note that the terminal will hang! +You can see progress in another terminal: + +```bash +$ kubectl get pods +``` + +or try watching: + +```bash +$ kubectl get pods --watch +``` + +When it's done, you should see: + +```bash +$ kubectl get pods +NAME READY STATUS RESTARTS AGE +continuous-image-puller-nvr4g 1/1 Running 0 5m31s +hub-7d59dfb748-mrfdv 1/1 Running 0 5m31s +proxy-d9dfbf77b-v488t 1/1 Running 0 5m31s +user-scheduler-587fcc5479-c4mmk 1/1 Running 0 5m31s +user-scheduler-587fcc5479-x6jmk 1/1 Running 0 5m31s +``` + +(The numbers of each above might vary based on the size of your cluster). And the terminal provides a lot of useful output: + +
+ +Output of Terminal on Completed Install + +```console +NAME: flux-jupyter +LAST DEPLOYED: Sun Aug 27 15:00:15 2023 +NAMESPACE: default +STATUS: deployed +REVISION: 1 +TEST SUITE: None +NOTES: +. __ __ __ __ __ + / / __ __ ____ __ __ / /_ ___ _____ / / / / __ __ / /_ + __ / / / / / / / __ \ / / / / / __/ / _ \ / ___/ / /_/ / / / / / / __ \ +/ /_/ / / /_/ / / /_/ / / /_/ / / /_ / __/ / / / __ / / /_/ / / /_/ / +\____/ \__,_/ / .___/ \__, / \__/ \___/ /_/ /_/ /_/ \__,_/ /_.___/ + /_/ /____/ + + You have successfully installed the official JupyterHub Helm chart! + +### Installation info + + - Kubernetes namespace: default + - Helm release name: flux-jupyter + - Helm chart version: 3.0.2 + - JupyterHub version: 4.0.2 + - Hub pod packages: See https://github.com/jupyterhub/zero-to-jupyterhub-k8s/blob/3.0.2/images/hub/requirements.txt + +### Followup links + + - Documentation: https://z2jh.jupyter.org + - Help forum: https://discourse.jupyter.org + - Social chat: https://gitter.im/jupyterhub/jupyterhub + - Issue tracking: https://github.com/jupyterhub/zero-to-jupyterhub-k8s/issues + +### Post-installation checklist + + - Verify that created Pods enter a Running state: + + kubectl --namespace=default get pod + + If a pod is stuck with a Pending or ContainerCreating status, diagnose with: + + kubectl --namespace=default describe pod + + If a pod keeps restarting, diagnose with: + + kubectl --namespace=default logs --previous + + - Verify an external IP is provided for the k8s Service proxy-public. + + kubectl --namespace=default get service proxy-public + + If the external ip remains , diagnose with: + + kubectl --namespace=default describe service proxy-public + + - Verify web based access: + + You have not configured a k8s Ingress resource so you need to access the k8s + Service proxy-public directly. + + If your computer is outside the k8s cluster, you can port-forward traffic to + the k8s Service proxy-public with kubectl to access it from your + computer. + + kubectl --namespace=default port-forward service/proxy-public 8080:http + + Try insecure HTTP access: http://localhost:8080 +``` + +
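+
+If the hub is slow to become ready, it can also help to follow its logs while the release settles (a small sketch; `hub` is the deployment name the chart creates, and we assume the default namespace used above):
+
+```bash
+# Follow the JupyterHub hub logs while the deployment comes up
+kubectl --namespace=default logs deploy/hub --follow
+```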
+ +### 3. Get Public Proxy + +Then to find the public proxy: + +```bash +kubectl get service proxy-public +``` +```console +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +proxy-public LoadBalancer 10.96.179.168 80:32530/TCP 7m22s +``` +or: + +```bash +kubectl get service proxy-public --output jsonpath='{.status.loadBalancer.ingress[].ip}' +``` + +Note that for Google, it looks like an ip address. For aws you get a string monster! + +```console +a054af2758c1549f780a433e5515a9d4-1012389935.us-east-2.elb.amazonaws.com +``` + +This might take a minute to fully be there - if it doesn't work immediately give it that. +At this point, you should be able to login as any user, open the notebook (nested two levels) +and interact with Flux! Remember that if you don't see the service, try deleting everything and +starting fresh. If that doesn't work, there might be some new error we didn't anticipate, +and you can look at logs. + +### Clean up + +For both: + +```bash +helm uninstall flux-jupyter +``` + +For Google Cloud: + +```bash +gcloud container clusters delete flux-jupyter +``` + +For AWS: + +```bash +# If you don't do this first, it will tell the pods are un-evictable and loop forever +$ kubectl delete pod --all-namespaces --all --force +# Then delete the cluster +$ eksctl delete cluster --config-file aws/eksctl-config.yaml --wait +``` + +In practice, you'll need to start deleting with `eksctl` and then you will see the pod eviction warning +(because they were re-created) and you'll need to run the command again, and then it will clean up. + +### Tutorial "would be nice" additions + +- Flux accounting + - after flux resource list, to see queues available (flux queue list) + - how to specify a bank for a job + - list banks (all) - flux account view-bank --tree + - specify banks - flux account view user $USER diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/cluster-autoscaler-autodiscover.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/cluster-autoscaler-autodiscover.yaml new file mode 100644 index 0000000..56869d0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/cluster-autoscaler-autodiscover.yaml @@ -0,0 +1,180 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler + name: cluster-autoscaler + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-autoscaler + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + - apiGroups: [""] + resources: ["events", "endpoints"] + verbs: ["create", "patch"] + - apiGroups: [""] + resources: ["pods/eviction"] + verbs: ["create"] + - apiGroups: [""] + resources: ["pods/status"] + verbs: ["update"] + - apiGroups: [""] + resources: ["endpoints"] + resourceNames: ["cluster-autoscaler"] + verbs: ["get", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["watch", "list", "get", "update"] + - apiGroups: [""] + resources: + - "namespaces" + - "pods" + - "services" + - "replicationcontrollers" + - "persistentvolumeclaims" + - "persistentvolumes" + verbs: ["watch", "list", "get"] + - apiGroups: ["extensions"] + resources: ["replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["watch", "list"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "daemonsets"] + verbs: ["watch", "list", "get"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses", "csinodes", 
"csidrivers", "csistoragecapacities"] + verbs: ["watch", "list", "get"] + - apiGroups: ["batch", "extensions"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "patch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["create"] + - apiGroups: ["coordination.k8s.io"] + resourceNames: ["cluster-autoscaler"] + resources: ["leases"] + verbs: ["get", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["create", "list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + resourceNames: ["cluster-autoscaler-status", "cluster-autoscaler-priority-expander"] + verbs: ["delete", "get", "update", "watch"] + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-autoscaler + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-autoscaler +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + k8s-addon: cluster-autoscaler.addons.k8s.io + k8s-app: cluster-autoscaler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: cluster-autoscaler +subjects: + - kind: ServiceAccount + name: cluster-autoscaler + namespace: kube-system + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + app: cluster-autoscaler +spec: + replicas: 1 + selector: + matchLabels: + app: cluster-autoscaler + template: + metadata: + labels: + app: cluster-autoscaler + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '8085' + spec: + priorityClassName: system-cluster-critical + securityContext: + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 + seccompProfile: + type: RuntimeDefault + serviceAccountName: cluster-autoscaler + containers: + - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.26.2 + name: cluster-autoscaler + resources: + limits: + cpu: 100m + memory: 600Mi + requests: + cpu: 100m + memory: 600Mi + command: + - ./cluster-autoscaler + - --v=4 + - --stderrthreshold=info + - --cloud-provider=aws + - --skip-nodes-with-local-storage=false + - --expander=least-waste + - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/jupyterhub + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs/ca-certificates.crt # /etc/ssl/certs/ca-bundle.crt for Amazon Linux Worker Nodes + readOnly: true + imagePullPolicy: "Always" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + volumes: + - name: ssl-certs + hostPath: + path: "/etc/ssl/certs/ca-bundle.crt" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws-ssl.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws-ssl.yaml new file mode 100644 index 0000000..6e49053 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws-ssl.yaml @@ -0,0 +1,79 @@ +# A few notes! 
+# The hub -> authentic class defaults to "dummy" +# We shouldn't need any image pull secrets assuming public +# There is a note about the database being a sqlite pvc +# (and a TODO for better solution for Kubernetes) + +# This is the concurrent spawn limit, likely should be increased (deafults to 64) +hub: + concurrentSpawnLimit: 128 + config: + DummyAuthenticator: + password: butter + JupyterHub: + admin_access: true + authenticator_class: dummy + db: + pvc: + # Defaults to 1Gi + storage: 32Gi + # Add the storageclass name, defaults to gp2 + storageClassName: gp3 + + # This is the image I built based off of jupyterhub/k8s-hub, 3.0.2 at time of writing this + image: + name: ghcr.io/flux-framework/flux-jupyter-hub + tag: "radiuss-2024" + pullPolicy: Always + +# # https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +# scheduling: +# podPriority: +# enabled: true +# userPlaceholder: +# # Specify 3 dummy user pods will be used as placeholders +# replicas: 3 + +proxy: + https: + enabled: true + hosts: + - tutorial.flux-framework.org + letsencrypt: + contactEmail: you@email.com + +# This is the "spawn" image +singleuser: + image: + name: ghcr.io/flux-framework/flux-jupyter-spawn + tag: "radiuss-2024" + pullPolicy: Always + cpu: + limit: 2 + guarantee: 2 + memory: + limit: '4G' + guarantee: '4G' + cmd: /entrypoint.sh + + # This runs as the root user, who clones and changes ownership to uid 1000 + initContainers: + - name: init-myservice + image: ghcr.io/flux-framework/flux-jupyter-init:radiuss-2024 + command: ["/entrypoint.sh"] + volumeMounts: + - name: flux-tutorial + mountPath: /home/jovyan + + # This is how we get the tutorial files added + storage: + type: none + + # gitRepo volume is deprecated so we need another way + # https://kubernetes.io/docs/concepts/storage/volumes/#gitrepo + extraVolumes: + - name: flux-tutorial + emptyDir: {} + extraVolumeMounts: + - name: flux-tutorial + mountPath: /home/jovyan diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws.yaml new file mode 100644 index 0000000..cd84861 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/config-aws.yaml @@ -0,0 +1,62 @@ +# A few notes! 
+# The hub -> authentic class defaults to "dummy" +# We shouldn't need any image pull secrets assuming public +# There is a note about the database being a sqlite pvc +# (and a TODO for better solution for Kubernetes) + +# This is the concurrent spawn limit, likely should be increased (deafults to 64) +hub: + concurrentSpawnLimit: 10 + config: + DummyAuthenticator: + password: butter + JupyterHub: + admin_access: true + authenticator_class: dummy + + # This is the image I built based off of jupyterhub/k8s-hub, 3.0.2 at time of writing this + image: + name: ghcr.io/flux-framework/flux-jupyter-hub + tag: "radiuss-2024" + pullPolicy: Always + +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +scheduling: + podPriority: + enabled: true + userPlaceholder: + # Specify 3 dummy user pods will be used as placeholders + replicas: 3 + +# This is the "spawn" image +singleuser: + image: + name: ghcr.io/flux-framework/flux-jupyter-spawn + tag: "radiuss-2024" + pullPolicy: Always + cpu: + limit: 1 + memory: + limit: '4G' + cmd: /entrypoint.sh + + # This runs as the root user, who clones and changes ownership to uid 1000 + initContainers: + - name: init-myservice + image: ghcr.io/flux-framework/flux-jupyter-init:radiuss-2024 + command: ["/entrypoint.sh"] + volumeMounts: + - name: flux-tutorial + mountPath: /home/jovyan + + # This is how we get the tutorial files added + storage: + type: none + # gitRepo volume is deprecated so we need another way + # https://kubernetes.io/docs/concepts/storage/volumes/#gitrepo + extraVolumes: + - name: flux-tutorial + emptyDir: {} + extraVolumeMounts: + - name: flux-tutorial + mountPath: /home/jovyan diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-config.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-config.yaml new file mode 100644 index 0000000..d88f0b1 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-config.yaml @@ -0,0 +1,110 @@ +# https://www.arhea.net/posts/2020-06-18-jupyterhub-amazon-eks +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: jupyterhub + region: us-east-2 + +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: cluster-autoscaler + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: cluster-autoscaler + + # https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws/README.md + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "autoscaling:DescribeAutoScalingGroups" + - "autoscaling:DescribeAutoScalingInstances" + - "autoscaling:DescribeLaunchConfigurations" + - "autoscaling:DescribeTags" + - "autoscaling:SetDesiredCapacity" + - "autoscaling:TerminateInstanceInAutoScalingGroup" + - "ec2:DescribeLaunchTemplateVersions" + Resource: '*' + + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: '*' + +availabilityZones: ["us-east-2a", "us-east-2b", "us-east-2c"] +managedNodeGroups: + - name: ng-us-east-2a + iam: + withAddonPolicies: + autoScaler: true + instanceType: m5.large + 
volumeSize: 30 + desiredCapacity: 1 + minSize: 1 + maxSize: 3 + privateNetworking: true + availabilityZones: + - us-east-2a + # I didn't set this, but I know it's been an issue + # propagateASGTags: true + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: ng-us-east-2b + iam: + withAddonPolicies: + autoScaler: true + instanceType: m5.large + volumeSize: 30 + desiredCapacity: 1 + minSize: 1 + maxSize: 3 + privateNetworking: true + availabilityZones: + - us-east-2b + # propagateASGTags: true + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" + + - name: ng-us-east-2c + iam: + withAddonPolicies: + autoScaler: true + instanceType: m5.large + volumeSize: 30 + desiredCapacity: 1 + minSize: 1 + maxSize: 3 + privateNetworking: true + availabilityZones: + - us-east-2c + # propagateASGTags: true + tags: + k8s.io/cluster-autoscaler/enabled: "true" + k8s.io/cluster-autoscaler/jupyterhub: "owned" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-radiuss-tutorial-2024.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-radiuss-tutorial-2024.yaml new file mode 100644 index 0000000..93c04d9 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/eksctl-radiuss-tutorial-2024.yaml @@ -0,0 +1,80 @@ +# https://www.arhea.net/posts/2020-06-18-jupyterhub-amazon-eks +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: jupyterhub + region: us-east-1 + +iam: + withOIDC: true + serviceAccounts: + - metadata: + name: ebs-csi-controller-sa + namespace: kube-system + labels: + aws-usage: "cluster-ops" + app.kubernetes.io/name: aws-ebs-csi-driver + attachPolicy: + Version: "2012-10-17" + Statement: + - Effect: Allow + Action: + - "ec2:AttachVolume" + - "ec2:CreateSnapshot" + - "ec2:CreateTags" + - "ec2:CreateVolume" + - "ec2:DeleteSnapshot" + - "ec2:DeleteTags" + - "ec2:DeleteVolume" + - "ec2:DescribeInstances" + - "ec2:DescribeSnapshots" + - "ec2:DescribeTags" + - "ec2:DescribeVolumes" + - "ec2:DetachVolume" + Resource: '*' + +availabilityZones: + - us-east-1a + - us-east-1b + - us-east-1c + +managedNodeGroups: + - name: ng-us-east-1a + instanceType: m6a.8xlarge + volumeSize: 256 + volumeType: gp3 + volumeIOPS: 16000 + volumeThroughput: 512 + desiredCapacity: 1 + minSize: 1 + maxSize: 6 + privateNetworking: true + availabilityZones: + - us-east-1a + + - name: ng-us-east-1b + instanceType: m6a.8xlarge + volumeSize: 256 + volumeType: gp3 + volumeIOPS: 16000 + volumeThroughput: 512 + desiredCapacity: 1 + minSize: 1 + maxSize: 6 + privateNetworking: true + availabilityZones: + - us-east-1b + + - name: ng-us-east-1c + instanceType: m6a.8xlarge + volumeSize: 256 + volumeType: gp3 + volumeIOPS: 16000 + volumeThroughput: 512 + desiredCapacity: 1 + minSize: 1 + maxSize: 6 + privateNetworking: true + availabilityZones: + - us-east-1c + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/aws/storageclass.yaml b/2024-RADIUSS-AWS/JupyterNotebook/aws/storageclass.yaml new file mode 100644 index 0000000..b9bef8f --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/aws/storageclass.yaml @@ -0,0 +1,7 @@ +kind: StorageClass +apiVersion: storage.k8s.io/v1 +metadata: + name: gp3 +provisioner: kubernetes.io/aws-ebs +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Delete diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.hub b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.hub new file mode 100644 index 0000000..595a53e --- /dev/null +++ 
b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.hub @@ -0,0 +1,9 @@ +ARG JUPYTERHUB_VERSION=3.0.2 +FROM jupyterhub/k8s-hub:$JUPYTERHUB_VERSION + +# Add template override directory and copy our example +# Replace the default +USER root +RUN mv /usr/local/share/jupyterhub/templates/login.html /usr/local/share/jupyterhub/templates/_login.html +COPY ./docker/login.html /usr/local/share/jupyterhub/templates/login.html +USER jovyan diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.init b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.init new file mode 100644 index 0000000..b4ba70c --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.init @@ -0,0 +1,13 @@ +FROM alpine/git + +ENV NB_USER=jovyan \ + NB_UID=1000 \ + HOME=/home/jovyan + +RUN adduser \ + -D \ + -g "Default user" \ + -u ${NB_UID} \ + -h ${HOME} \ + ${NB_USER} +COPY ./docker/init-entrypoint.sh /entrypoint.sh diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn new file mode 100644 index 0000000..ba6be5b --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn @@ -0,0 +1,138 @@ +FROM fluxrm/flux-sched:jammy + +# Based off of https://github.com/jupyterhub/zero-to-jupyterhub-k8s/tree/main/images/singleuser-sample +# Local usage +# docker run -p 8888:8888 -v $(pwd):/home/jovyan/work test + +USER root + +ENV NB_USER=jovyan \ + NB_UID=1000 \ + HOME=/home/jovyan + +RUN adduser \ + --disabled-password \ + --gecos "Default user" \ + --uid ${NB_UID} \ + --home ${HOME} \ + --force-badname \ + ${NB_USER} + +RUN apt-get update \ + # && apt-get upgrade -y \ + && apt-get install -y --no-install-recommends \ + gcc-10 \ + g++-10 \ + ca-certificates \ + dnsutils \ + iputils-ping \ + python3 \ + python3-dev \ + python3-pip \ + python3-venv \ + openmpi-bin \ + openmpi-common \ + libopenmpi-dev \ + liblz4-dev \ + tini \ + # requirement for nbgitpuller + git \ + && rm -rf /var/lib/apt/lists/* + +COPY ./requirements_venv.txt ./requirements_venv.txt +RUN python3 -m pip install -r requirements_venv.txt + +COPY ./requirements.txt ./requirements.txt +RUN python3 -m pip install -r requirements.txt && \ + python3 -m pip install ipython==7.34.0 && \ + python3 -m IPython kernel install + +# This is code to install DYAD +# This was added to the RADIUSS 2023 tutorials on AWS +RUN git clone https://github.com/openucx/ucx.git \ + && cd ucx \ + && git checkout v1.13.1 \ + && ./autogen.sh \ + && ./configure --disable-optimizations --enable-logging --enable-debug --disable-assertions --enable-mt --disable-params-check \ + --without-go --without-java --disable-cma --without-cuda --without-gdrcopy --without-verbs --without-knem --without-rmdacm \ + --without-rocm --without-xpmem --without-fuse3 --without-ugni --prefix=/usr CC=$(which gcc) CXX=$(which g++) \ + && make -j \ + && sudo make install \ + && cd .. \ + && rm -rf ucx + +RUN git clone https://github.com/flux-framework/dyad.git \ + && cd dyad \ + && git checkout tutorial-riken-2024 \ + && mkdir build \ + && cd build \ + && cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DDYAD_ENABLE_UCX_DATA=ON .. \ + && sudo make install -j \ + && cd ../pydyad \ + && python3 -m build --wheel . \ + && pip install $(ls ./dist/*.whl | head -1) \ + && cd ../.. 
\ + && rm -rf dyad + +# This adds the flux-tree command, which is provided in flux-sched source +# but not installed alongside production flux-core +COPY ./flux-tree/* /usr/libexec/flux/cmd/ +RUN chmod +x /usr/libexec/flux/cmd/flux-tree* + +# Flux accounting +RUN git clone https://github.com/flux-framework/flux-accounting && \ + cd flux-accounting && \ + ./autogen.sh && \ + ./configure --prefix=/usr && \ + make && make install + +RUN apt-get update && apt-get install -y nodejs && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN wget https://nodejs.org/dist/v20.15.0/node-v20.15.0-linux-x64.tar.xz && \ + apt-get update && apt-get install -y xz-utils && rm -rf /var/lib/apt/lists/* && \ + xz -d -v node-v20.15.0-linux-x64.tar.xz && \ + tar -C /usr/local --strip-components=1 -xvf node-v20.15.0-linux-x64.tar + +# This customizes the launcher UI +# https://jupyter-app-launcher.readthedocs.io/en/latest/usage.html +RUN python3 -m pip install jupyter_app_launcher && \ + python3 -m pip install --upgrade jupyter-server && \ + python3 -m pip install jupyter-launcher-shortcuts && \ + mkdir -p /usr/local/share/jupyter/lab/jupyter_app_launcher + +COPY ./tutorial /home/jovyan/ +COPY ./docker/jupyter-launcher.yaml /usr/local/share/jupyter/lab/jupyter_app_launcher/jp_app_launcher.yaml +ENV JUPYTER_APP_LAUNCHER_PATH=/usr/local/share/jupyter/lab/jupyter_app_launcher/ + +# Give jovyan user permissions to tutorial materials +RUN chmod -R 777 ~/ /home/jovyan + +WORKDIR $HOME +COPY ./docker/flux-icon.png $HOME/flux-icon.png + +# note that previous examples are added via git volume in config.yaml +ENV SHELL=/usr/bin/bash +ENV FLUX_URI_RESOLVE_LOCAL=t + +EXPOSE 8888 +ENTRYPOINT ["tini", "--"] + +# This is for JupyterHub +COPY ./docker/entrypoint.sh /entrypoint.sh + +# This is for a local start +COPY ./docker/start.sh /start.sh + +RUN mkdir -p $HOME/.local/share && \ + chmod 777 $HOME/.local/share + +# Quick setup of flux-accounting (not working due to needing system service) +# RUN flux start /bin/bash -c "nohup flux account create-db && flux account-service & flux account add-bank root 1" && \ +# flux start flux account add-bank --parent-bank=root default 1 && \ +# flux start flux account add-user --username=jovyan --bank=default && \ +# flux start flux jobtap load mf_priority.so && \ +# flux start flux account-update-db + +USER ${NB_USER} + +CMD ["flux", "start", "--test-size=4", "jupyter", "lab"] diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/entrypoint.sh b/2024-RADIUSS-AWS/JupyterNotebook/docker/entrypoint.sh new file mode 100755 index 0000000..8b11568 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/entrypoint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +/usr/bin/flux start --test-size=4 /usr/local/bin/jupyterhub-singleuser \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/flux-icon.png b/2024-RADIUSS-AWS/JupyterNotebook/docker/flux-icon.png new file mode 100644 index 0000000..d50aa52 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/docker/flux-icon.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/init-entrypoint.sh b/2024-RADIUSS-AWS/JupyterNotebook/docker/init-entrypoint.sh new file mode 100755 index 0000000..293e799 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/init-entrypoint.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +# Copy the notebook icon +# This would be for the customized launcher, not working yet +# wget https://flux-framework.org/assets/images/Flux-logo-mark-only-full-color.png +# mv Flux-logo-mark-only-full-color.png 
/home/jovyan/flux-icon.png + +# We need to clone to the user home, and then change permissions to uid 1000 +# That uid is shared by jovyan here and the spawn container +# git clone https://github.com/rse-ops/flux-radiuss-tutorial-2023 /home/jovyan/flux-tutorial +chown -R 1000 /home/jovyan diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/jupyter-launcher.yaml b/2024-RADIUSS-AWS/JupyterNotebook/docker/jupyter-launcher.yaml new file mode 100644 index 0000000..eb6feb0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/jupyter-launcher.yaml @@ -0,0 +1,75 @@ +- title: Flux Tutorial Notebook + description: This is the main Flux Framework Tutorial + type: jupyterlab-commands + icon: ./flux-icon.png + source: + - label: Flux Tutorial + id: 'filebrowser:open-path' + args: + path: 01_flux_tutorial.ipynb + icon: ./flux-icon.png + catalog: Notebook + +- title: Dyad Notebook Tutorial + description: This is a tutorial for using Dyad + type: jupyterlab-commands + source: + - label: Dyad Tutorial + id: 'filebrowser:open-path' + args: + path: supplementary/dyad/dyad_dlio.ipynb + icon: ./flux-icon.png + catalog: Console + +- title: Flux Framework Portal + description: Flux Framework portal for projects, releases, and publication. + source: https://flux-framework.org/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Documentation + source: https://flux-framework.readthedocs.io/en/latest/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Cheat Sheet + source: https://flux-framework.org/cheat-sheet/ + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Glossary of Terms + source: https://flux-framework.readthedocs.io/en/latest/glossary.html + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Comics + source: https://flux-framework.readthedocs.io/en/latest/comics/fluxonomicon.html + description: come and meet FluxBird - the pink bird who knows things! + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Flux Learning Guide + source: https://flux-framework.readthedocs.io/en/latest/guides/learning_guide.html + description: learn about what Flux does, how it works, and real research applications + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Getting Started with Flux and Go + source: https://converged-computing.github.io/flux-go + type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] +- title: Getting Started with Flux in C + source: https://converged-computing.github.io/flux-c-examples/ + description: ...looking for contributors! 
+ type: url + catalog: Flux Resources + args: + sandbox: [ 'allow-same-origin', 'allow-scripts', 'allow-downloads', 'allow-modals', 'allow-popups'] diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/login.html b/2024-RADIUSS-AWS/JupyterNotebook/docker/login.html new file mode 100644 index 0000000..d997a0f --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/login.html @@ -0,0 +1,168 @@ +{% extends "page.html" %} +{% if announcement_login is string %} + {% set announcement = announcement_login %} +{% endif %} + +{% block login_widget %} +{% endblock %} + +{% block stylesheet %} +{{ super() }} + +{% endblock %} + +{% block main %} + +{% block login %} +
+{% block login_container %} + +
+ + + +
+

Flux Tutorial running on AWS

+ +
+
+
+

Sign in

+
+
+ + + + {% if login_error %} + + {% endif %} + + + + + + + + + {% block login_terms %} + {% if login_term_url %} + + {% endif %} + {% endblock login_terms %} + +
+
+
+{% endblock login_container %} +
+{% endblock login %} + +{% endblock %} + +{% block script %} +{{ super() }} + +{% endblock %} diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/start.sh b/2024-RADIUSS-AWS/JupyterNotebook/docker/start.sh new file mode 100755 index 0000000..bad19ab --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/start.sh @@ -0,0 +1,2 @@ +#!/bin/bash +/usr/bin/flux start --test-size=4 /usr/local/bin/jupyter-lab --ip=0.0.0.0 diff --git a/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree b/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree new file mode 100644 index 0000000..5eb5002 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree @@ -0,0 +1,847 @@ +#! /bin/bash + +############################################################## +# Copyright 2020 Lawrence Livermore National Security, LLC +# (c.f. AUTHORS, NOTICE.LLNS, LICENSE) +# +# This file is part of the Flux resource manager framework. +# For details, see https://github.com/flux-framework. +# +# SPDX-License-Identifier: LGPL-3.0 +############################################################## + +set -o errexit +set -o pipefail +#set -o xtrace + +FT_TOPO='1' # store arg to --topology (e.g., 2x2) +FT_QUEUE='default' # store arg to --queue-policy (e.g., fcfs:easy) +FT_PARAMS='default' # store arg to --queue-params +FT_MATCH='default' # store arg to --match-policy (e.g., low:high) +FT_PERF_OUT='%^+_no' # store perf-out filename given by --perf-out +FT_PERF_FORMAT='{treeid:<15s} {elapse:>20f} {begin:>20f} {end:>20f} {match:>15f} '\ +'{njobs:>15d} {my_nodes:>5d} {my_cores:>4d} {my_gpus:>4d}' + # store perf-out with format given by --perf-format +FT_FXLOGS='%^+_no' # dir in which flux logs are produced +FT_FXRDIR='%^+_no' # flux rundir attribute to pass +FT_LEAF='no' # is this a leaf instance (given by --leaf)? +FT_G_PREFIX='tree' # store hierarchical path given by --prefix +FT_DRY_RUN='no' # --dry-run for testing? +FT_INTER='no' # is an option for internal levels given? +FT_MAX_EXIT_CODE=0 # maximum exit code detected +FT_MAX_FLUX_JOBID=0 # maximum flux jobid that reports max exit code +FT_MAX_TREE_ID="" # FLUX_TREE_ID that reports max exit code +FT_MAX_JOBSCRIPT_IX="" # FLUX_TREE_JOBSCRIPT_INDEX reporting max code + # --prefix is an internal-use-only option +FT_JOB_NAME='%^+_no' # job name to use when submitting children +ORIG_FLUXION_QMANAGER_OPTIONS='' # +ORIG_FLUXION_RESOURCE_OPTIONS='' # to apply and unapply FLUXION_RESOURCE options +ORIG_FLUXION_QMANAGER_RC_NOOP='' # module load options. +ORIG_FLUXION_RESOURCE_RC_NOOP='' # + +declare -i FT_NJOBS=1 # store num of jobs to run, given by --njobs +declare -i FT_NNODES=1 # store num of nodes assigned, given by --nnodes +declare -i FT_NCORES=1 # store num of cores per node (--ncores-per-node) +declare -i FT_NGPUS=0 # store num of gpus per node (--ngpus-per-node) +declare -r top_prefix='tree' # prefix name to identify the top Flux instance +declare -r t_delim='x' # topology delimiter +declare -r p_delim=':' # match policy delimiter +declare -r perf_format='%-15s %20s %20s %20s %15s %15s %5s %4s %4s' +declare -a FT_CL=() # save the jobscript command into an array +declare -A mp_policies=( # make sure to update this when match + [low]=1 # policies are updated. + [high]=1 + [locality]=1 + [variation]=1 + [default]=1 +) +declare -A qp_policies=( # make sure to update this when + [fcfs]=1 # queuing policies are updated. 
+ [easy]=1 + [hybrid]=1 + [conservative]=1 + [default]=1 +) +declare -A q_params=( # make sure to update this when + [queue-depth]=1 # queuing parameters are updated. + [reservation-depth]=1 + [default]=1 +) +declare -a jobids # array to store a set of submitted job IDs + +declare -r long_opts='help,leaf,flux-logs:,flux-rundir:,nnodes:'\ +',ncores-per-node:,ngpus-per-node:,topology:,queue-policy:,queue-params:'\ +',match-policy:,njobs:,perf-out:,perf-format:,prefix:,job-name:,dry-run' +declare -r short_opts='hlf:r:N:c:g:T:Q:P:M:J:o:X:d' +declare -r prog=${0##*/} +declare -r usage=" +Usage: ${prog} [OPTIONS] -- Jobscript\n\ +\n\ +Create a Flux instance hierarchy according to the specified\n\ +policies and schedule/run the specified number\n\ +of Jobscripts at the last level of this hierarchy.\n\ +\n\ +If --topology=2x4 and --njobs=32 are given, for instance,\n\ +2 Flux instances will be spawned from within the current instance,\n\ +each of which will in turn spawn 4 child Flux instances, totaling\n\ +8 instances at the last level of this hierarchy.\n\ +Once this is done, 4 jobs (of Jobscripts) will be scheduled\n\ +and executed at each of these 8 last-level Flux instances.\n\ +\n\ +The resources specified by --nnodes (total number of nodes) and\n\ +--ncores-per-node (total number of cores per node)\n\ +are recursively divided such that each sibling Flux instance\n\ +will be assigned to an equal split of the resources of their\n\ +parent instance. In addition, --ngpus-per-node can be given,\n\ +in which case the given GPU count will also be split.\n\ +If not given, it is assumed that there is no GPU on nodes.\n\ +\n\ +Jobscript is expected to submit one or more programs through\n\ +the flux-job submit command or its variants.\n\ +Jobscript is passed with five environment variables:\n\ +FLUX_TREE_ID, FLUX_TREE_JOBSCRIPT_INDEX, FLUX_TREE_NNODES,\n\ +FLUX_TREE_NCORES_PER_NODE and FLUX_TREE_NGPUS_PER_NODE.\n\ +FLUX_TREE_ID is an ID string uniquely identifying the hierarchical\n\ +path of the Flux instance on which Jobscript is being executed.\n\ +FLUX_TREE_JOBSCRIPT_INDEX is the integer ID of each jobscript\n\ +invocation local to the Flux instance. It starts from 1 and\n\ +sequentially increases.\n\ +FLUX_TREE_NNODES is the number nodes assigned to the instance.\n\ +FLUX_TREE_NCORES_PER_NODE is the number of cores per node\n\ +assigned to the instance.\n\ +FLUX_TREE_NGPUS_PER_NODE is the number of GPUs per node\n\ +assigned to the instance.\n\ +\n\ +If --queue-policy (additionally --queue-params) and/or\n\ +--match-policy are given, each level of this hierarchy will\n\ +be set to the specified queuing and matching policies and\n\ +parameters. Otherwise, all levels will be configured\n\ +to be used either the default policies or policies specified\n\ +through the FLUXION_RESOURCE_OPTIONS and/or FLUXION_QMANAGER_OPTIONS\n\ +environment variables.\n\ +\n\ +If any one of Jobscripts returns a non-zero exit code, flux-tree\n\ +detects the script invocation exited with the highest code and print\n\ +both that exit code and the outputs printed from executing the script.\n\ +In this case, FLUX_TREE_ID and FLUX_TREE_JOBSCRIPT_INDEX are also\n\ +reported in the from of \${FLUX_TREE_ID}@index[\${FLUX_TREE_JOBSCRIPT_INDEX}]\n\ +\n\ +Options:\n\ + -h, --help Display this message\n\ + -l, --leaf Leaf instance. Directly submit jobs\n\ + to enclosing Flux instance. 
Mutually-exclusive\n\ + with internal tree-node options like -T.\n\ + (default=${FT_LEAF})\n\ + -f, --flux-logs=DIR Dump Flux logs for all instances into DIR\n\ + -r, --flux-rundir=DIR Set the rundir attribute of each Flux tree instance\n\ + into a subdirectory within DIR. The content\n\ + stores will be redirected to them as well\n\ + -N, --nnodes=NNODES Total num of nodes to use\n\ + (default=${FT_NNODES})\n\ + -c, --ncores-per-node=NCORES Total num of cores per node to use\n\ + (default=${FT_NCORES})\n\ + -g, --ngpus-per-node=NGPUS Total num of gpus per node to use\n\ + (default=${FT_NGPUS})\n\ + -T, --topology=HPOLICY Topology of Flux instance hierarchy:\n\ + e.g., 2x2 (default=${FT_TOPO})\n\ + -Q, --queue-policy=QPOLICY Queuing policy for each level of\n\ + the hierarchy: e.g., easy:fcfs\n\ + -P, --queue-params=QPARAMS Queuing parameters for each level of\n\ + the hierarchy: e.g.,\n\ + queue-depth=5:reservation-depth=5\n\ + -M, --match-policy=MPOLICY Match policy for each level of\n\ + the hierarchy: e.g., low:high\n\ + -J, --njobs=NJOBS Total num of Jobscripts to run\n\ + (default=${FT_NJOBS})\n\ + -o, --perf-out=FILENAME Dump the performance data into\n\ + the given file (default: don't print)\n\ + --perf-format=FORMAT Dump the performance data with the given\n\ + format. Uses the python format\n\ + specification mini-language.\n\ + Example: \"{treeid:<15s},{elapse:>20f}\"\n\ + --job-name=NAME Name to use when submitting child jobs\n\ + -- Stop parsing options after this\n\ +" + +die() { echo -e "${prog}:" "$@"; exit 1; } +warn() { echo -e "${prog}: warning:" "$@"; } +dr_print() { echo -e "${prog}: dry-run:" "$@"; } + +# +# Roll up the performance records for each Flux instance to the KVS +# guest namespace of the parent Flux instance or print them out if top level. +# +rollup() { + local prefix="${1}" + local blurb="${2}" + local out="${3}" + local num_children="${4}" + local format="${5}" + + if [[ "${prefix}" == "${top_prefix}" && "${out}" != "%^+_no" ]]; then + flux tree-helper --perf-out="${out}" --perf-format="${format}" \ + ${num_children} "tree-perf" "${FT_JOB_NAME}" <<< "${blurb}" + else + flux tree-helper ${num_children} "tree-perf" "${FT_JOB_NAME}" \ + <<< "${blurb}" + fi +} + + +# +# Return a JSON string out of the performance data passed. +# +jsonify() { + local prefix="${1}" + local njobs="${2}" + local nnodes="${3}" + local ncores="${4}" + local ngpus="${5}" + local begin="${6}" + local end="${7}" + local avg=0 + local avail="no" + local el_match=0 + + # Print resource match time only for internal study + # flux-resource isn't a public command + if [[ "x${FT_DRY_RUN}" = "xno" ]] + then + flux ion-resource -h > /dev/null 2>&1 && avail="yes" + fi + + if [[ "${avail}" = "yes" ]] + then + avg=$(flux ion-resource stats | grep "Avg" | awk '{print $4}') + el_match=$(awk "BEGIN {print ${avg}*${njobs}*1000000.0}") + fi + + local elapse=0 + elapse=$(awk "BEGIN {print ${end} - ${begin}}") + echo "{\"treeid\":\"${prefix}\",\"njobs\":${njobs},\"my_nodes\":${nnodes},\ +\"my_cores\":${ncores},\"my_gpus\":${ngpus},\"perf\":{\"begin\":${begin},\ +\"end\":${end},\"elapse\":${elapse},\"match\":${el_match}}}" +} + + +# +# Fetch the next topology parameter that will be passed to +# the next-level Flux instances. E.g., If the current level topology +# is 2x3x4, the topology handled at the next level will be 3x4. 
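+# When only a single level remains (e.g. "4"), the empty string is returned
+# and the caller falls back to --leaf for its children.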
+# +next_topo() { + local topo="${1}" + local nx='' + local nfields=0 + nfields=$(echo "${topo}" | awk -F"${t_delim}" '{print NF}') + # Remove the first topo parameter + [[ ${nfields} -gt 1 ]] && nx="${topo#*${t_delim}}" + echo "${nx}" +} + + +# +# Fetch the next policy parameter that will be passed to +# the next-level Flux instances. E.g., If the current policy parameter +# is high:low:locality, the policies handled at the next level +# will be low:locality. +# +next_policy_or_param() { + local policy_or_param="${1}" + local nx="" + local nfields=0 + nfields=$(echo "${policy_or_param}" | awk -F"${p_delim}" '{print NF}') + [[ ${nfields} -gt 1 ]] && nx="${policy_or_param#*${p_delim}}" + echo "${nx}" +} + + +# +# Check if the given queuing policy is valid +# +qpolicy_check() { + local policy=${1%%${p_delim}*} + [[ "x${policy}" = "x" ]] && return 1 + [[ "${qp_policies["${policy}"]:-missing}" = "missing" ]] && return 1 + return 0 +} + + +# +# Check if the given match policy is valid +# +mpolicy_check() { + local policy=${1%%${p_delim}*} + [[ "x${policy}" = "x" ]] && return 1 + [[ "${mp_policies["${policy}"]:-missing}" = "missing" ]] && return 1 + return 0 +} + + +# +# Check if the given queue param is valid +# +qparams_check() { + local param='' + param=$(echo "${1}" | awk -F"${p_delim}" '{print $1}') + param=${1%%${p_delim}*} + local final_param='' + final_param=${param##*,} + + for i in $(seq 1 10) + do + local token1=${param%%,*} + local token2=${token1%=*} + [[ "x${token2}" = "x" ]] && return 1 + [[ "${q_params["${token2}"]:-missing}" = "missing" ]] && return 1 + [[ "x${token1}" = "x${final_param}" ]] && break + param=${param#*,} + done + return 0 +} + + +# +# Calculate the number of jobs to execute based on the number of Flux instances +# being used at a level and the rank of the instance amongst its siblings. +# +get_my_njobs(){ + local njobs="${1}" + local size="${2}" # rank starts from 1 + local rank="${3}" + echo $(( njobs / size + (size + njobs % size)/(size + rank) )) +} + + +# +# Calculate the total number of cores that will be assigned to a child +# Flux instance based on the total number of nodes and cores per node +# assigned to the current Flux instance as well as the size and rank parameter. +# +get_my_cores(){ + local nnodes="${1}" + local ncores="${2}" + local size="${3}" + local rank="${4}" + local t_cores=$(( nnodes * ncores )) + echo $(( t_cores / size + (size + t_cores % size) / (size + rank) )) +} + + +# +# Calculate the total number of GPUs that will be assigned to a child +# Flux instance based on the total number of nodes and GPUs per node +# assigned to the current Flux instance as well as the size and rank parameter. +# +get_my_gpus(){ + local nnodes="${1}" + local ngpus="${2}" + local size="${3}" + local rank="${4}" + local t_gpus=$(( nnodes * ngpus )) + echo $(( t_gpus / size + (size + t_gpus % size) / (size + rank) )) +} + + +# +# Adjust the number of Flux instances to spawn at the next level +# if the amount of resources managed by the parent instance is small. +# +get_effective_size(){ + local ncores="${1}" + local ngpus="${2}" + local size="${3}" + [[ ${ngpus} -ne 0 && ${ngpus} -lt ${size} ]] && size=${ngpus} + [[ ${ncores} -lt ${size} ]] && size=${ncores} + echo "${size}" +} + + +# +# Calculate the total number of nodes that will be assigned to a child +# Flux instance based on the total number of cores per node as well as +# the total number of cores assigned to this child instance. Returns +# minimum num of nodes required. 
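+# (Integer ceiling of assigned cores over cores-per-node: for example,
+# 20 assigned cores at 8 cores per node yields 3 nodes.)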
+# +get_my_nodes(){ + local ncores="${1}" + local m_cores="${2}" + echo $(( m_cores / ncores + (ncores + m_cores % ncores) / (ncores + 1 ))) +} + + +# +# Apply all of the policies for the target Flux instance +# by setting environment variables. +# +apply_policies() { + local queue_policy="${1%%${p_delim}*}" + local queue_param="${2%%${p_delim}*}" + local match_policy="${3%%${p_delim}*}" + + ORIG_FLUXION_QMANAGER_OPTIONS=${FLUXION_QMANAGER_OPTIONS:-none} + ORIG_FLUXION_RESOURCE_OPTIONS=${FLUXION_RESOURCE_OPTIONS:-none} + ORIG_FLUXION_QMANAGER_RC_NOOP=${FLUXION_QMANAGER_RC_NOOP:-none} + ORIG_FLUXION_RESOURCE_RC_NOOP=${FLUXION_RESOURCE_RC_NOOP:-none} + unset FLUXION_QMANAGER_RC_NOOP + unset FLUXION_RESOURCE_RC_NOOP + + if [[ "${queue_policy}" != "default" ]] + then + export FLUXION_QMANAGER_OPTIONS="queue-policy=${queue_policy}" + fi + if [[ "${queue_param}" != "default" ]] + then + local qo="${FLUXION_QMANAGER_OPTIONS}" + export FLUXION_QMANAGER_OPTIONS="${qo:+${qo},}queue-params=${queue_param}" + fi + if [[ "${match_policy}" != "default" ]] + then + export FLUXION_RESOURCE_OPTIONS="hwloc-allowlist=node,core,gpu \ +policy=${match_policy}" + fi + if [[ "x${FT_DRY_RUN}" = "xyes" ]] + then + dr_print "FLUXION_QMANAGER_OPTIONS:${FLUXION_QMANAGER_OPTIONS}" + dr_print "FLUXION_RESOURCE_OPTIONS:${FLUXION_RESOURCE_OPTIONS}" + fi +} + + +# +# Undo all of the policies set for the target Flux instance +# by unsetting environment variables. +# +unapply_policies() { + unset FLUXION_QMANAGER_OPTIONS + unset FLUXION_RESOURCE_OPTIONS + + if [ "${ORIG_FLUXION_QMANAGER_OPTIONS}" != "none" ] + then + export FLUXION_QMANAGER_OPTIONS="${ORIG_FLUXION_QMANAGER_OPTIONS}" + fi + if [ "${ORIG_FLUXION_RESOURCE_OPTIONS}" != "none" ] + then + export FLUXION_RESOURCE_OPTIONS="${ORIG_FLUXION_RESOURCE_OPTIONS}" + fi + if [ "${ORIG_FLUXION_QMANAGER_RC_NOOP}" != "none" ] + then + export FLUXION_QMANAGER_RC_NOOP="${ORIG_FLUXION_QMANAGER_RC_NOOP}" + fi + if [ "${ORIG_FLUXION_RESOURCE_RC_NOOP}" != "none" ] + then + export FLUXION_RESOURCE_RC_NOOP="${ORIG_FLUXION_RESOURCE_RC_NOOP}" + fi + if [[ "x${FT_DRY_RUN}" = "xyes" ]] + then + dr_print "FLUXION_QMANAGER_OPTIONS:${FLUXION_QMANAGER_OPTIONS}" + dr_print "FLUXION_RESOURCE_OPTIONS:${FLUXION_RESOURCE_OPTIONS}" + dr_print "FLUXION_QMANAGER_RC_NOOP:${FLUXION_QMANAGER_RC_NOOP}" + dr_print "FLUXION_RESOURCE_RC_NOOP:${FLUXION_RESOURCE_RC_NOOP}" + fi +} + + + +################################################################################ +# # +# Handle Leaf or Internal Flux Instances # +# # +################################################################################ + +# +# Execute the script. Export a predefined set of +# environment variables and execute the given jobscript. +# +execute() { + local prefix="${1}" + local nnodes="${2}" + local ncores="${3}" + local ngpus="${4}" + local njobs="${5}" + local rc=0 + + for job in $(seq 1 "${njobs}"); + do + export FLUX_TREE_ID="${prefix}" + export FLUX_TREE_JOBSCRIPT_INDEX="${job}" + export FLUX_TREE_NNODES="${nnodes}" + export FLUX_TREE_NCORES_PER_NODE="${ncores}" + export FLUX_TREE_NGPUS_PER_NODE="${ngpus}" + + if [[ "x${FT_DRY_RUN}" = "xyes" ]] + then + dr_print "FLUX_TREE_ID=${FLUX_TREE_ID}" + dr_print "FLUX_TREE_JOBSCRIPT_INDEX=${FLUX_TREE_JOBSCRIPT_INDEX}" + dr_print "FLUX_TREE_NCORES_PER_NODE=${FLUX_TREE_NCORES_PER_NODE}" + dr_print "FLUX_TREE_NGPUS_PER_NODE=${FLUX_TREE_NGPUS_PER_NODE}" + dr_print "FLUX_TREE_NNODES=${FLUX_TREE_NNODES}" + dr_print "eval ${FT_CL[@]}" + continue + else + rc=0 + "${FT_CL[@]}" || rc=$? 
+ if [[ ${rc} -gt ${FT_MAX_EXIT_CODE} ]] + then + FT_MAX_EXIT_CODE=${rc} + FT_MAX_TREE_ID="${FLUX_TREE_ID}" + FT_MAX_JOBSCRIPT_IX="${FLUX_TREE_JOBSCRIPT_INDEX}" + fi + fi + done + + [[ "x${FT_DRY_RUN}" = "xno" ]] && flux queue drain + + if [[ "x${FT_MAX_TREE_ID}" != "x" ]] + then + warn "${FT_CL[@]}: exited with exit code (${FT_MAX_EXIT_CODE})" + warn "invocation id: ${FT_MAX_TREE_ID}@index[${FT_MAX_JOBSCRIPT_IX}]" + warn "output displayed above, if any" + fi + + unset FLUX_TREE_ID + unset FLUX_TREE_NNODES + unset FLUX_TREE_NCORES_PER_NODE +} + + +# +# Entry point to execute the job script. When this is invoke, +# the parent Flux instance has already been started. +# Measure the elapse time of the job script execution, and +# dump the performance data. +# +leaf() { + local prefix="${1}" + local nnodes="${2}" + local ncores="${3}" + local ngpus="${4}" + local njobs="${5}" + local perfout="${6}" + local format="${7}" + + # Begin Time Stamp + local B='' + B=$(date +%s.%N) + + execute "$@" + + # End Time Stamp + local E='' + E=$(date +%s.%N) + + local o='' + + o=$(jsonify "${prefix}" "${njobs}" "${nnodes}" "${ncores}" \ +"${ngpus}" "${B}" "${E}") + rollup "${prefix}" "${o}" "${perfout}" "0" "${format}" +} + + +# +# Roll up exit code from child instances +# +rollup_exit_code() { + local rc=0 + for job in "${jobids[@]}" + do + rc=0 + flux job status --exception-exit-code=255 ${job} || rc=$? + if [[ ${rc} -gt ${FT_MAX_EXIT_CODE} ]] + then + FT_MAX_EXIT_CODE=${rc} + FT_MAX_FLUX_JOBID=${job} + fi + done + + if [[ "${FT_MAX_FLUX_JOBID}" != "0" ]] + then + flux job attach ${FT_MAX_FLUX_JOBID} || true + fi +} + +# +# Submit the specified number of Flux instances at the next level of the calling +# instance. Use flux-tree recursively. Instances that have 0 jobs assigned are +# not launched. 
+# +submit() { + local prefix="${1}" + local nx_topo=$(next_topo "${2}") + local nx_queue=$(next_policy_or_param "${3}") + local nx_q_params=$(next_policy_or_param "${4}") + local nx_match=$(next_policy_or_param "${5}") + local nnodes="${6}" + local ncores="${7}" + local ngpus="${8}" + local size="${9}" + local njobs="${10}" + local log="${11}" + local rdir="${12}" + + # Flux instance rank-agnostic command-line options for the next level + local T="${nx_topo:+--topology=${nx_topo}}" + T="${T:---leaf}" + local Q="${nx_queue:+--queue-policy=${nx_queue}}" + local P="${nx_q_params:+--queue-params=${nx_q_params}}" + local M="${nx_match:+--match-policy=${nx_match}}" + local F='' + [[ "x${log}" != "x%^+_no" ]] && F="--flux-logs=${log}" + local R='' + [[ "x${rdir}" != "x%^+_no" ]] && R="--flux-rundir=${rdir}" + local rank=0 + + # Main Loop to Submit the Next-Level Flux Instances + size=$(get_effective_size "${ncores}" "${ngpus}" "${size}") + apply_policies "${3}" "${4}" "${5}" + for rank in $(seq 1 "${size}"); do + local my_cores=0 + my_cores=$(get_my_cores "${nnodes}" "${ncores}" "${size}" "${rank}") + local my_gpus=0 + my_gpus=$(get_my_gpus "${nnodes}" "${ngpus}" "${size}" "${rank}") + local my_njobs=0 + my_njobs=$(get_my_njobs "${njobs}" "${size}" "${rank}") + + [[ "${my_njobs}" -eq 0 ]] && break + + # Flux instance rank-aware command-line options + local J="--njobs=${my_njobs}" + local o='' + if [[ x"${log}" != "x%^+_no" ]] + then + if [[ "x${FT_DRY_RUN}" != "xyes" ]] + then + mkdir -p "${log}" + fi + o="-o,-Slog-filename=${log}/${prefix}.${rank}.log" + fi + if [[ x"${rdir}" != "x%^+_no" ]] + then + if [[ "x${FT_DRY_RUN}" != "xyes" ]] + then + rm -rf "${rdir}/${prefix}.${rank}.pfs" + mkdir -p "${rdir}/${prefix}.${rank}.pfs" + fi + o="${o:+${o} }-o,-Srundir=${rdir}/${prefix}.${rank}.pfs" + fi + local N=0 + N=$(get_my_nodes "${ncores}" "${my_cores}") + local c=0 + c=$((my_cores/N + (my_cores + my_cores % N)/(my_cores + 1))) + local g=0 + g=$((my_gpus/N + (my_gpus + my_gpus % N)/(my_gpus + 1))) + local G='' + [[ ${g} -gt 0 ]] && G="-g ${g}" + local X="--prefix=${prefix}.${rank}" + + if [[ "x${FT_DRY_RUN}" = "xyes" ]] + then + dr_print "Rank=${rank}: N=${N} c=${c} ${G:+g=${G}} ${o:+o=${o}}" + dr_print "Rank=${rank}: ${T:+T=${T}}" + dr_print "Rank=${rank}: ${Q:+Q=${Q}} ${P:+P=${P}} ${M:+M=${M}}" + dr_print "Rank=${rank}: ${X:+X=${X}} ${J:+J=${J}} ${FT_CL:+S=${FT_CL[@]}}" + dr_print "" + continue + fi + jobid=$(\ +flux submit --job-name=${FT_JOB_NAME} -N${N} -n${N} -c${c} ${G} \ + flux start ${o} \ + flux tree -N${N} -c${c} ${G} ${T} ${Q} ${P} ${M} ${F} ${R} ${X} ${J} \ + -- "${FT_CL[@]}") + jobids["${rank}"]="${jobid}" + done + + [[ "x${FT_DRY_RUN}" = "xno" ]] && flux queue drain && rollup_exit_code + unapply_policies +} + + +# +# Collect the performance record for sibling Flux instances at one level. +# For each child instance, get the performance record from the guest KVS +# namespace, which had all of the records gathered for the subtree rooted +# at this instance, and add that to the current record with its child key. 
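+# At the top-level instance, rollup() also passes --perf-out (when given) so
+# the aggregated results land in the requested output file.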
+# +coll_perf() { + local prefix="${1}" + local nnodes="${2}" + local ncores="${3}" + local ngpus="${4}" + local njobs="${5}" + local begin="${6}" + local end="${7}" + local perfout="${8}" + local nchildren="${9}" + local format="${10}" + + # + # Make a JSON string from the performance data + # + local blurb='' + blurb=$(jsonify "${prefix}" "${njobs}" "${nnodes}" "${ncores}" "${ngpus}" "${begin}" "${end}") + rollup "${prefix}" "${blurb}" "${perfout}" "${nchildren}" "${format}" +} + + +# +# Entry point to submit child Flux instances at the next level from the +# calling Flux instance. Measure the elapse time of running all of these +# Flux instances. Collect the performance record for that level at the end. +# +internal() { + local prefix="${1}" + local nnodes="${6}" + local ncores="${7}" + local ngpus="${8}" + local njobs="${10}" + local perfout="${13}" + local format="${14}" + + # Begin Time Stamp + local B='' + B=$(date +%s.%N) + + submit "$@" + + # End Time Stamp + local E='' + E=$(date +%s.%N) + + if [[ "x${FT_DRY_RUN}" = "xyes" ]]; then + nchildren=0 + else + nchildren=${#jobids[@]} + fi + coll_perf "${prefix}" "${nnodes}" "${ncores}" "${ngpus}" \ +"${njobs}" "${B}" "${E}" "${perfout}" "${nchildren}" "${format}" +} + + +################################################################################ +# # +# Main # +# # +################################################################################ + +main() { + local leaf="${1}" # is this a leaf Flux instance? + local prefix="${2}" # id showing hierarchical path of the instance + local topo="${3}" # topology shape at the invoked level + local queue="${4}" # queuing policies at the invoked level and below + local param="${5}" # queue parameters at the invoked level and below + local match="${6}" # match policy shape at the invoked level + local nnodes="${7}" # num of nodes allocated to this instance + local ncores="${8}" # num of cores per node + local ngpus="${9}" # num of gpus per node + local njobs="${10}" # num of jobs assigned to this Flux instance + local flogs="${11}" # flux log output option + local frdir="${12}" # flux rundir attribute + local out="${13}" # perf output filename + local format="${14}" # perf output format + local size=0 + + if [[ ${leaf} = "yes" ]] + then + # + # flux-tree is invoked for a leaf: all of the internal Flux instances + # leading to this leaf have been instantiated and ${script} should + # be executed on the last-level Flux instance. + # + leaf "${prefix}" "${nnodes}" "${ncores}" "${ngpus}" "${njobs}" \ + "${out}" "${format}" + else + # + # flux-tree is invoked to instantiate ${size} internal Flux instances + # at the next level of the calling instance. + # + size=${topo%%${t_delim}*} + internal "${prefix}" "${topo}" "${queue}" "${param}" "${match}" \ + "${nnodes}" "${ncores}" "${ngpus}" "${size}" "${njobs}" \ + "${flogs}" "${frdir}" "${out}" "${format}" + fi + + exit ${FT_MAX_EXIT_CODE} +} + + +################################################################################ +# # +# Commandline Parsing and Validate Options # +# # +################################################################################ + +GETOPTS=$(/usr/bin/getopt -o ${short_opts} -l ${long_opts} -n "${prog}" -- "${@}") +eval set -- "${GETOPTS}" +rcopt=$? 
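+# Example invocation (hypothetical values), run from inside an existing
+# Flux instance:
+#
+#   flux tree -T 2x2 -N 4 -c 8 -J 16 -- ./jobscript.sh
+#
+# spawns 2 child instances, each of which spawns 2 leaf instances, and runs
+# 4 invocations of ./jobscript.sh in each of the 4 leaves.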
+ +while true; do + case "${1}" in + -h|--help) echo -ne "${usage}"; exit 0 ;; + -l|--leaf) FT_LEAF="yes"; shift 1 ;; + -d|--dry-run) FT_DRY_RUN="yes"; shift 1 ;; + -f|--flux-logs) FT_FXLOGS="${2}"; shift 2 ;; + -r|--flux-rundir) FT_FXRDIR="${2}"; shift 2 ;; + -N|--nnodes) FT_NNODES=${2}; shift 2 ;; + -c|--ncores-per-node) FT_NCORES=${2}; shift 2 ;; + -g|--ngpus-per-node) FT_NGPUS=${2}; shift 2 ;; + -T|--topology) FT_TOPO="${2}"; FT_INTER="yes"; shift 2 ;; + -Q|--queue-policy) FT_QUEUE="${2}"; FT_INTER="yes"; shift 2 ;; + -P|--queue-params) FT_PARAMS="${2}"; FT_INTER="yes"; shift 2 ;; + -M|--match-policy) FT_MATCH="${2}"; FT_INTER="yes"; shift 2 ;; + -J|--njobs) FT_NJOBS=${2}; shift 2 ;; + -o|--perf-out) FT_PERF_OUT="${2}"; shift 2 ;; + --perf-format) FT_PERF_FORMAT="${2}"; shift 2 ;; + -X|--prefix) FT_G_PREFIX="${2}"; shift 2 ;; + --job-name) FT_JOB_NAME="${2}"; shift 2 ;; + --) shift; break; ;; + *) die "Invalid option '${1}'\n${usage}" ;; + esac +done + +FT_SCRIPT="${1}" +FT_CL=( "${@}" ) + +[[ "$#" -lt 1 || "${rcopt}" -ne 0 ]] && die "${usage}" + +[[ ! -x $(which ${FT_SCRIPT}) ]] && die "cannot execute ${FT_SCRIPT}!" + +[[ "${FT_NNODES}" -le 0 ]] && die "nnodes must be greater than 0!" + +[[ "${FT_NCORES}" -le 0 ]] && die "ncores must be greater than 0!" + +[[ "${FT_NGPUS}" -lt 0 ]] && die "incorrect ngpus!" + +qpolicy_check "${FT_QUEUE}" || die "invalid queue policy!" + +mpolicy_check "${FT_MATCH}" || die "invalid match policy!" + +qparams_check "${FT_PARAMS}" || die "invalid queue params!" + +if [[ "${FT_INTER}" = "yes" && "${FT_LEAF}" = "yes" ]] +then + die "--leaf must not be used together with internal tree-node options!" +fi + +# if the user did not set a name, then use a partially random string to prevent +# conflicts with other flux-tree instances during performance data collection +# via flux-tree-helper +if [[ "$FT_JOB_NAME" == '%^+_no' ]]; then + # code copied from: + # https://unix.stackexchange.com/questions/230673/how-to-generate-a-random-string + FT_JOB_NAME="flux-tree-$(head /dev/urandom | tr -dc A-Za-z0-9 | head -c 32)" +fi + + +################################################################################ +# # +# Invoke the Main Entry Level # +# # +################################################################################ + +main "${FT_LEAF}" "${FT_G_PREFIX}" "${FT_TOPO}" "${FT_QUEUE}" "${FT_PARAMS}" \ + "${FT_MATCH}" "${FT_NNODES}" "${FT_NCORES}" "${FT_NGPUS}" "${FT_NJOBS}" \ + "${FT_FXLOGS}" "${FT_FXRDIR}" "${FT_PERF_OUT}" "${FT_PERF_FORMAT}" + +# +# vi:tabstop=4 shiftwidth=4 expandtab +# diff --git a/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree-helper.py b/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree-helper.py new file mode 100644 index 0000000..eba17d5 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/flux-tree/flux-tree-helper.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python3 + +############################################################## +# Copyright 2020 Lawrence Livermore National Security, LLC +# (c.f. AUTHORS, NOTICE.LLNS, LICENSE) +# +# This file is part of the Flux resource manager framework. +# For details, see https://github.com/flux-framework. 
+# +# SPDX-License-Identifier: LGPL-3.0 +############################################################## + +import os +import sys +import time +import json +import argparse +import logging + +import flux +import flux.util +import flux.kvs +import flux.job + +LOGGER = logging.getLogger("flux-tree-helper") + + +def get_child_jobids(flux_handle, num_children, child_name): + """ + Get the jobids of num_children instances. Will repeatedly query the + job-info module until num_children jobids are collected, with sleeps + inbetween queries. + """ + jobids = set() + since = 0.0 + LOGGER.debug("Getting IDs of inactive children with name == %s", child_name) + while True: + for job in flux.job.job_list_inactive( + flux_handle, + max_entries=num_children, + since=since, + attrs=["t_inactive"], + name=child_name, + ).get_jobs(): + jobid = job["id"] + since = max(since, job["t_inactive"]) + jobids.add(jobid) + if len(jobids) >= num_children: + break + LOGGER.debug( + "Only %d out of %d children are inactive, sleeping before trying again", + len(jobids), + num_children, + ) + time.sleep(1) + return jobids + + +def get_this_instance_data(): + data = json.load(sys.stdin) + return data + + +def get_child_data(flux_handle, num_children, child_name, kvs_key): + child_data = [] + jobids = get_child_jobids(flux_handle, num_children, child_name) + for jobid in jobids: + kvs_dir = flux.job.job_kvs_guest(flux_handle, jobid) + child_data.append(kvs_dir[kvs_key]) + return child_data + + +def combine_data(this_instance_data, child_data): + this_instance_data["child"] = child_data + return this_instance_data + + +class PerfOutputFormat(flux.util.OutputFormat): + """ + Store a parsed version of the program's output format, + allowing the fields to iterated without modifiers, building + a new format suitable for headers display, etc... + """ + + # List of legal format fields and their header names + headings = dict( + treeid="TreeID", + elapse="Elapsed(sec)", + begin="Begin(Epoch)", + end="End(Epoch)", + match="Match(usec)", + njobs="NJobs", + my_nodes="NNodes", + my_cores="CPN", + my_gpus="GPN", + ) + + def __init__(self, fmt): + """ + Parse the input format fmt with string.Formatter. + Save off the fields and list of format tokens for later use, + (converting None to "" in the process) + + Throws an exception if any format fields do not match the allowed + list of headings above. 
+ """ + # Support both new and old style OutputFormat constructor: + try: + super().__init__(fmt, headings=self.headings, prepend="") + except TypeError: + super().__init__(PerfOutputFormat.headings, fmt) + + +def write_data_to_file(output_filename, output_format, data): + def json_traverser(data): + fieldnames = PerfOutputFormat.headings.keys() + output = {k: v for k, v in data.items() if k in fieldnames} + output.update(data["perf"]) + yield output + for child in data["child"]: + yield from json_traverser(child) + + formatter = PerfOutputFormat(output_format) + with open(output_filename, "w") as outfile: + header = formatter.header() + "\n" + outfile.write(header) + fmt = formatter.get_format() + "\n" + for data_row in json_traverser(data): + # newline = formatter.format(data_row) + newline = fmt.format(**data_row) + outfile.write(newline) + + +def write_data_to_parent(flux_handle, kvs_key, data): + try: + parent_uri = flux_handle.flux_attr_get("parent-uri") + except FileNotFoundError: + return + parent_handle = flux.Flux(parent_uri) + + try: + parent_kvs_namespace = flux_handle.flux_attr_get("parent-kvs-namespace").decode( + "utf-8" + ) + except FileNotFoundError: + return + env_name = "FLUX_KVS_NAMESPACE" + os.environ[env_name] = parent_kvs_namespace + + flux.kvs.put(parent_handle, kvs_key, data) + flux.kvs.commit(parent_handle) + + +def parse_args(): + parser = argparse.ArgumentParser( + prog="flux-tree-helper", formatter_class=flux.util.help_formatter() + ) + parser.add_argument( + "num_children", + type=int, + help="number of children to collect data from. Should be 0 at leaves.", + ) + parser.add_argument( + "kvs_key", type=str, help="key to use when propagating data up through the tree" + ) + parser.add_argument( + "job_name", + type=str, + help="name of the child jobs to use when filtering the inactive jobs", + ) + parser.add_argument( + "--perf-out", + type=str, + help="Dump the performance data into the given file. " + "Assumed to be given at the root instance.", + ) + parser.add_argument( + "--perf-format", + type=str, + help="Dump the performance data with the given format string.", + ) + return parser.parse_args() + + +@flux.util.CLIMain(LOGGER) +def main(): + args = parse_args() + flux_handle = None + try: + flux_handle = flux.Flux() + except FileNotFoundError: + flux_handle = None + + LOGGER.debug("Getting this instance's data") + this_data = get_this_instance_data() + if flux_handle is not None and args.num_children > 0: + LOGGER.debug("Getting children's data") + child_data = get_child_data( + flux_handle, args.num_children, args.job_name, args.kvs_key + ) + else: + child_data = [] + LOGGER.debug("Combining data") + combined_data = combine_data(this_data, child_data) + if flux_handle is not None: + LOGGER.debug("Writing data to parent's KVS") + write_data_to_parent(flux_handle, args.kvs_key, combined_data) + if args.perf_out: + LOGGER.debug("Writing data to file") + write_data_to_file(args.perf_out, args.perf_format, combined_data) + + +if __name__ == "__main__": + main() diff --git a/2024-RADIUSS-AWS/JupyterNotebook/gcp/config.yaml b/2024-RADIUSS-AWS/JupyterNotebook/gcp/config.yaml new file mode 100644 index 0000000..a91987a --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/gcp/config.yaml @@ -0,0 +1,62 @@ +# A few notes! 
+# The hub -> authentic class defaults to "dummy" +# We shouldn't need any image pull secrets assuming public +# There is a note about the database being a sqlite pvc +# (and a TODO for better solution for Kubernetes) + +# This is the concurrent spawn limit, likely should be increased (deafults to 64) +hub: + concurrentSpawnLimit: 10 + config: + DummyAuthenticator: + password: butter + JupyterHub: + admin_access: true + authenticator_class: dummy + + # This is the image I built based off of jupyterhub/k8s-hub, 3.0.2 at time of writing this + image: + name: ghcr.io/flux-framework/flux-jupyter-hub + tag: "radiuss-2024" + pullPolicy: Always + +# https://z2jh.jupyter.org/en/latest/administrator/optimization.html#scaling-up-in-time-user-placeholders +scheduling: + podPriority: + enabled: true + userPlaceholder: + # Specify 3 dummy user pods will be used as placeholders + replicas: 3 + +# This is the "spawn" image +singleuser: + image: + name: ghcr.io/flux-framework/flux-jupyter-spawn + tag: "radiuss-2024" + pullPolicy: Always + cpu: + limit: 1 + memory: + limit: '4G' + cmd: /entrypoint.sh + +# initContainers: +# - name: init-myservice +# image: alpine/git +# command: ["git", "clone", "https://github.com/rse-ops/flux-radiuss-tutorial-2023", "/home/jovyan/flux-tutorial"] +# volumeMounts: +# - name: flux-tutorial +# mountPath: /home/jovyan + + # This is how we get the tutorial files added + storage: + type: none + + # gitRepo volume is deprecated so we need another way + # https://kubernetes.io/docs/concepts/storage/volumes/#gitrepo + extraVolumes: + - name: flux-tutorial + emptyDir: {} + extraVolumeMounts: + - name: flux-tutorial + mountPath: /home/jovyan/ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/requirements.txt b/2024-RADIUSS-AWS/JupyterNotebook/requirements.txt new file mode 100644 index 0000000..0d9d99e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/requirements.txt @@ -0,0 +1,339 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# Use the "Run workflow" button at https://github.com/jupyterhub/zero-to-jupyterhub-k8s/actions/workflows/watch-dependencies.yaml +# +alembic==1.11.3 + # via jupyterhub +anyio==3.7.1 + # via jupyter-server +argon2-cffi==23.1.0 + # via + # jupyter-server + # nbclassic +argon2-cffi-bindings==21.2.0 + # via argon2-cffi +arrow==1.2.3 + # via isoduration +asttokens==2.2.1 + # via stack-data +async-generator==1.10 + # via jupyterhub +async-lru==2.0.4 + # via jupyterlab +attrs==23.1.0 + # via + # jsonschema + # referencing +babel==2.12.1 + # via jupyterlab-server +backcall==0.2.0 + # via ipython +beautifulsoup4==4.12.2 + # via nbconvert +bleach==6.0.0 + # via nbconvert +certifi==2023.7.22 + # via requests +certipy==0.1.3 + # via jupyterhub +cffi==1.15.1 + # via + # argon2-cffi-bindings + # cryptography +charset-normalizer==3.2.0 + # via requests +comm==0.1.4 + # via ipykernel +cryptography==41.0.3 + # via pyopenssl +debugpy==1.6.7.post1 + # via ipykernel +decorator==5.1.1 + # via ipython +defusedxml==0.7.1 + # via nbconvert +executing==1.2.0 + # via stack-data +fastjsonschema==2.18.0 + # via nbformat +fqdn==1.5.1 + # via jsonschema +greenlet==2.0.2 + # via sqlalchemy +idna==3.4 + # via + # anyio + # jsonschema + # requests +ipykernel==6.25.1 + # via + # jupyterlab + # nbclassic +ipython==8.13.0 + # via ipykernel +ipython-genutils==0.2.0 + # via nbclassic +isoduration==20.11.0 + # via jsonschema +jedi==0.19.0 + # via ipython +jinja2==3.1.2 + # via + # jupyter-server + # jupyterhub + # jupyterlab + # 
jupyterlab-server + # nbclassic + # nbconvert +json5==0.9.14 + # via jupyterlab-server +jsonpointer==2.4 + # via jsonschema +jsonschema[format-nongpl]==4.19.0 + # via + # jupyter-events + # jupyter-telemetry + # jupyterlab-server + # nbformat +jsonschema-specifications==2023.7.1 + # via jsonschema +jupyter-client==8.3.0 + # via + # ipykernel + # jupyter-server + # nbclassic + # nbclient +jupyter-core==5.3.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterlab + # nbclassic + # nbclient + # nbconvert + # nbformat +jupyter-events==0.7.0 + # via jupyter-server +jupyter-lsp==2.2.0 + # via jupyterlab +jupyter-server==2.7.2 + # via + # jupyter-lsp + # jupyterlab + # jupyterlab-server + # nbclassic + # nbgitpuller + # notebook-shim +jupyter-server-terminals==0.4.4 + # via jupyter-server +jupyter-telemetry==0.1.0 + # via jupyterhub +jupyterhub==4.0.2 + # via -r requirements.in +jupyterlab==4.0.5 + # via -r requirements.in +jupyterlab-pygments==0.2.2 + # via nbconvert +jupyterlab-server==2.24.0 + # via jupyterlab +mako==1.2.4 + # via alembic +markupsafe==2.1.3 + # via + # jinja2 + # mako + # nbconvert +matplotlib-inline==0.1.6 + # via + # ipykernel + # ipython +mistune==3.0.1 + # via nbconvert +nbclassic==1.0.0 + # via -r requirements.in +nbclient==0.8.0 + # via nbconvert +nbconvert==7.7.4 + # via + # jupyter-server + # nbclassic +nbformat==5.9.2 + # via + # jupyter-server + # nbclassic + # nbclient + # nbconvert +nbgitpuller==1.2.0 + # via -r requirements.in +nest-asyncio==1.5.7 + # via + # ipykernel + # nbclassic +notebook-shim==0.2.3 + # via + # jupyterlab + # nbclassic +oauthlib==3.2.2 + # via jupyterhub +overrides==7.4.0 + # via jupyter-server +packaging==23.1 + # via + # ipykernel + # jupyter-server + # jupyterhub + # jupyterlab + # jupyterlab-server + # nbconvert +pamela==1.1.0 + # via jupyterhub +pandocfilters==1.5.0 + # via nbconvert +parso==0.8.3 + # via jedi +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +platformdirs==3.10.0 + # via jupyter-core +prometheus-client==0.17.1 + # via + # jupyter-server + # jupyterhub + # nbclassic +prompt-toolkit==3.0.39 + # via ipython +psutil==5.9.5 + # via ipykernel +ptyprocess==0.7.0 + # via + # pexpect + # terminado +pure-eval==0.2.2 + # via stack-data +pycparser==2.21 + # via cffi +pygments==2.16.1 + # via + # ipython + # nbconvert +pyopenssl==23.2.0 + # via certipy +python-dateutil==2.8.2 + # via + # arrow + # jupyter-client + # jupyterhub +python-json-logger==2.0.7 + # via + # jupyter-events + # jupyter-telemetry +pyyaml==6.0.1 + # via jupyter-events +pyzmq==25.1.1 + # via + # ipykernel + # jupyter-client + # jupyter-server + # nbclassic +referencing==0.30.2 + # via + # jsonschema + # jsonschema-specifications + # jupyter-events +requests==2.31.0 + # via + # jupyterhub + # jupyterlab-server +rfc3339-validator==0.1.4 + # via + # jsonschema + # jupyter-events +rfc3986-validator==0.1.1 + # via + # jsonschema + # jupyter-events +rpds-py==0.9.2 + # via + # jsonschema + # referencing +ruamel-yaml==0.17.32 + # via jupyter-telemetry +ruamel-yaml-clib==0.2.7 + # via ruamel-yaml +send2trash==1.8.2 + # via + # jupyter-server + # nbclassic +six==1.16.0 + # via + # asttokens + # bleach + # python-dateutil + # rfc3339-validator +sniffio==1.3.0 + # via anyio +soupsieve==2.4.1 + # via beautifulsoup4 +sqlalchemy==2.0.20 + # via + # alembic + # jupyterhub +stack-data==0.6.2 + # via ipython +terminado==0.17.1 + # via + # jupyter-server + # jupyter-server-terminals + # nbclassic +tinycss2==1.2.1 + # via nbconvert 
+tornado==6.3.3 + # via + # ipykernel + # jupyter-client + # jupyter-server + # jupyterhub + # jupyterlab + # nbclassic + # nbgitpuller + # terminado +traitlets==5.9.0 + # via + # comm + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # jupyter-events + # jupyter-server + # jupyter-telemetry + # jupyterhub + # jupyterlab + # matplotlib-inline + # nbclassic + # nbclient + # nbconvert + # nbformat +typing-extensions==4.7.1 + # via + # alembic + # sqlalchemy +uri-template==1.3.0 + # via jsonschema +urllib3==2.0.4 + # via requests +wcwidth==0.2.6 + # via prompt-toolkit +webcolors==1.13 + # via jsonschema +webencodings==0.5.1 + # via + # bleach + # tinycss2 +websocket-client==1.6.1 + # via jupyter-server diff --git a/2024-RADIUSS-AWS/JupyterNotebook/requirements_venv.txt b/2024-RADIUSS-AWS/JupyterNotebook/requirements_venv.txt new file mode 100644 index 0000000..01cea55 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/requirements_venv.txt @@ -0,0 +1,9 @@ +# Used for the DYAD notebook +Pygments +build +ipykernel +jsonschema +cffi +ply +pyyaml +dlio_benchmark @ git+https://github.com/argonne-lcf/dlio_benchmark.git \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/.gitignore b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/.gitignore new file mode 100644 index 0000000..3ebb2aa --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/.gitignore @@ -0,0 +1,2 @@ +flux*.out +.ipynb_checkpoints diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb new file mode 100644 index 0000000..90fd9da --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb @@ -0,0 +1,3462 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2507d149-dcab-458a-a554-37388e0ee13a", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "40e867ba-f689-4301-bb60-9a448556bb84", + "metadata": { + "tags": [] + }, + "source": [ + "# Welcome to the Flux Tutorial\n", + "\n", + "> What is Flux Framework? 🤔️\n", + " \n", + "Flux is a flexible framework for resource management, built for your site. The framework consists of a suite of projects, tools, and libraries that may be used to build site-custom resource managers for High Performance Computing centers and cloud environments. Flux is a next-generation resource manager and scheduler with many transformative capabilities like hierarchical scheduling and resource management (you can think of it as \"fractal scheduling\") and directed-graph based resource representations.\n", + "\n", + "## I'm ready! How do I do this tutorial? 😁️\n", + "\n", + "This tutorial is split into 3 chapters, each of which has a notebook:\n", + "* [Chapter 1: Getting started with Flux](./01_flux_tutorial.ipynb) (you're already here, it's this notebook!)\n", + "* [Chapter 2: Flux Plumbing](./02_flux_framework.ipynb)\n", + "* [Chapter 3: Lessons learned, next steps, and discussion](./03_flux_tutorial_conclusions.ipynb)\n", + "\n", + "And if you have some extra time and interest, we have supplementary chapters to teach you about advanced (often experimental, or under development) features:\n", + "\n", + "* [Supplementary Chapter 1: Using DYAD to accelerate distributed Deep Learning (DL) training](./supplementary/dyad/dyad_dlio.ipynb)\n", + "\n", + "Let's get started! To provide some brief, added background on Flux and a bit more motivation for our tutorial, \"Shift+Enter\" the cell below to watch our YouTube video!" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d71ecd22-8552-4b4d-9bc4-61d86f8d33fe", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "15e82c38-8465-49ac-ae2b-b0bb56a79ec9", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "\n", + "# Getting started with Flux\n", + "\n", + "The code and examples that this tutorial is based on can be found at [flux-framework/Tutorials](https://github.com/flux-framework/Tutorials/tree/master/2024-RADIUSS-AWS). You can also find python examples in the `flux-workflow-examples` directory from the sidebar navigation in this JupyterLab instance. " + ] + }, + { + "cell_type": "markdown", + "id": "ae33fef6-278c-4996-8534-fd15e548b338", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "Tip: Did you know you can get help for flux or a flux command? For example, try \"flux help\" and \"flux help jobs\"\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c7d616de-70cd-4090-bd43-ffacb5ade1f6", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Usage: flux [OPTIONS] COMMAND ARGS\n", + " -h, --help Display this message.\n", + " -v, --verbose Be verbose about environment and command search\n", + " -V, --version Display command and component versions\n", + " -p, --parent Set environment of parent instead of current instance\n", + "\n", + "For general Flux documentation, please visit\n", + " https://flux-framework.readthedocs.io\n", + "\n", + "run and submit jobs, allocate resources\n", + " submit submit a job to a Flux instance\n", + " run run a Flux job interactively\n", + " bulksubmit submit jobs in bulk to a Flux instance\n", + " alloc allocate a new Flux instance for interactive use\n", + " batch submit a batch script to Flux\n", + "\n", + "list and interact with jobs\n", + " jobs list jobs submitted to Flux\n", + " top display running Flux jobs\n", + " pstree display job hierarchies\n", + " cancel cancel one or more jobs\n", + " pgrep/pkill search or cancel matching jobs\n", + " job get job status, info, etc (see: flux help job)\n", + " proxy proxy connections to Flux jobs and instances\n", + " watch monitor one or more Flux jobs\n", + " update update active Flux jobs\n", + "\n", + "get resource, queue and other instance information\n", + " resource list/manipulate Flux resource status\n", + " queue list and manipulate flux queues\n", + " overlay Show flux overlay network status\n", + " uptime Tell how long Flux has been up and running\n", + "\n", + "other useful commands\n", + " start bootstrap a local Flux instance\n", + " version Display flux version information\n", + " config Manage/query Flux configuration\n", + " env Print the flux environment or execute a command inside it\n", + " hostlist fetch, combine, and manipulate Flux hostlists\n", + "\n", + "See 'flux help COMMAND' for more information about a specific command.\n" + ] + } + ], + "source": [ + "!flux help" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2e54f640-283a-4523-8dde-9617fd6ef0c5", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FLUX-JOBS(1) flux-core FLUX-JOBS(1)\n", + "\n", + "NAME\n", + " flux-jobs - list jobs submitted to Flux\n", + "\n", + "SYNOPSIS\n", + " flux jobs [OPTIONS] [JOBID ...]\n", + "\n", + "DESCRIPTION\n", + " flux jobs is used to list jobs run under Flux. By default only pending\n", + " and running jobs for the current user are listed. Additional jobs and\n", + " information can be listed using options listed below. Alternately,\n", + " specific job ids can be listed on the command line to only list those\n", + " job IDs.\n", + "\n", + "OPTIONS\n", + " -a List jobs in all states, including inactive jobs. This is\n", + " shorthand for --filter=pending,running,inactive.\n", + "\n", + " -A List jobs of all users. This is shorthand for --user=all.\n", + "\n", + " -n, --no-header\n", + " For default output, do not output column headers.\n", + "\n", + " -u, --user=[USERNAME|UID]\n", + " List jobs for a specific username or userid. 
Specify all for all\n", + " users.\n", + "\n", + " --name=[JOB NAME]\n", + " List jobs with a specific job name.\n", + "\n", + " --queue=[QUEUE]\n", + " List jobs in a specific queue.\n", + "\n", + " -c, --count=N\n", + " Limit output to N jobs (default 1000)\n", + "\n", + " --since=WHEN\n", + " Limit output to jobs that have been active since a given time‐\n", + " stamp. In other words, jobs that are currently pending, cur‐\n", + " rently running, or became inactive since the given timestamp.\n", + " This option implies -a if no other --filter options are speci‐\n", + " fied. If WHEN begins with - character, then the remainder is\n", + " considered to be a an offset in Flux standard duration (RFC 23).\n", + " Otherwise, any datetime expression accepted by the Python ‐\n", + " parsedatetime module is accepted. Examples: \"-6h\", \"-1d\", \"yes‐\n", + " terday\", \"2021-06-21 6am\", \"last Monday\", etc. It is assumed to\n", + " be an error if a timestamp in the future is supplied.\n", + "\n", + " -f, --filter=STATE|RESULT\n", + " List jobs with specific job state or result. Multiple states or\n", + " results can be listed separated by comma. See JOB STATUS below\n", + " for additional information. Defaults to pending,running.\n", + "\n", + " -o, --format=NAME|FORMAT\n", + " Specify a named output format NAME or a format string using\n", + " Python's format syntax. See OUTPUT FORMAT below for field names.\n", + " Named formats may be listed via --format=help. An alternate de‐\n", + " fault format can be set via the FLUX_JOBS_FORMAT_DEFAULT envi‐\n", + " ronment variable. Additional named formats may be registered\n", + " with flux jobs via configuration. See the CONFIGURATION section\n", + " for more details. A configuration snippet for an existing named\n", + " format may be generated with --format=get-config=NAME.\n", + "\n", + " --json Emit data for selected jobs in JSON format. The data for multi‐\n", + " ple matching jobs is contained in a jobs array in the emitted\n", + " JSON object, unless a single job was selected by jobid on the\n", + " command line, in which case a JSON object representing that job\n", + " is emitted on success. With --recursive, each job which is also\n", + " an instance of Flux will will have any recursively listed jobs\n", + " in a jobs array, and so on for each sub-child.\n", + "\n", + " Only the attributes which are available at the time of the flux\n", + " jobs query will be present in the returned JSON object for a\n", + " job. For instance a pending job will not have runtime, waitsta‐\n", + " tus or result keys, among others. A missing key should be con‐\n", + " sidered unavailable.\n", + "\n", + " The --json option is incompatible with --stats and --stats-only,\n", + " and any --format is ignored.\n", + "\n", + " --color[=WHEN]\n", + " Control output coloring. The optional argument WHEN can be\n", + " auto, never, or always. If WHEN is omitted, it defaults to al‐\n", + " ways. Otherwise the default is auto.\n", + "\n", + " --stats\n", + " Output a summary of job statistics before the header. By de‐\n", + " fault shows global statistics. If --queue is specified, shows\n", + " statistics for the specified queue. 
May be useful in conjunc‐\n", + " tion with utilities like watch(1), e.g.:\n", + "\n", + " $ watch -n 2 flux jobs --stats -f running -c 25\n", + "\n", + " will display a summary of statistics along with the top 25 run‐\n", + " ning jobs, updated every 2 seconds.\n", + "\n", + " Note that all job failures, including canceled and timeout jobs,\n", + " are collectively counted as \"failed\" in --stats.\n", + "\n", + " --stats-only\n", + " Output a summary of job statistics and exit. By default shows\n", + " global statistics. If --queue is specified, shows statistics\n", + " for the specified queue. flux jobs will exit with non-zero exit\n", + " status with --stats-only if there are no active jobs. This al‐\n", + " lows the following loop to work:\n", + "\n", + " $ while flux jobs --stats-only; do sleep 2; done\n", + "\n", + " All options other than --queue are ignored when --stats-only is\n", + " used.\n", + "\n", + " Note that all job failures, including canceled and timeout jobs,\n", + " are collectively counted as \"failed\" in --stats-only.\n", + "\n", + " -R, --recursive\n", + " List jobs recursively. Each child job which is also an instance\n", + " of Flux is prefixed by its jobid \"path\" followed by the list of\n", + " jobs, recursively up to any defined --level. If the --stats op‐\n", + " tion is used, then each child instance in the hierarchy is\n", + " listed with its stats.\n", + "\n", + " --recurse-all\n", + " By default, jobs not owned by the user running flux jobs are\n", + " skipped with --recursive, because normally Flux instances only\n", + " permit the instance owner to connect. This option forces the\n", + " command to attempt to recurse into the jobs of other users. Im‐\n", + " plies --recursive.\n", + "\n", + " -L, --level=N\n", + " With --recursive, stop recursive job listing at level N. Levels\n", + " are counted starting at 0, so flux jobs -R --level=0 is equiva‐\n", + " lent to flux jobs without -R, and --level=1 would limit recur‐\n", + " sive job listing to child jobs of the current instance.\n", + "\n", + " --threads=N\n", + " When flux jobs recursively queries job lists (with --recursive)\n", + " or fetches info for jobs that are also instances (see instance.*\n", + " fields), a pool of threads is used to parallelize the required\n", + " RPCs. Normally, the default number of ThreadPoolExecutor threads\n", + " is used, but by using the --threads, a specific number of\n", + " threads can be chosen.\n", + "\n", + "JOB STATUS\n", + " Jobs may be observed to pass through five job states in Flux: DEPEND,\n", + " PRIORITY, SCHED, RUN, CLEANUP, and INACTIVE (see Flux RFC 21). Under\n", + " the state_single field name, these are abbreviated as D, S, P, R, C,\n", + " and I respectively. For convenience and clarity, the following virtual\n", + " job states also exist: \"pending\", an alias for DEPEND,PRIORITY,SCHED;\n", + " \"running\", an alias for RUN,CLEANUP; \"active\", an alias for \"pend‐\n", + " ing,running\".\n", + "\n", + " After a job has finished and is in the INACTIVE state, it can be marked\n", + " with one of the possible results: COMPLETED, FAILED, CANCELED, TIMEOUT.\n", + " Under the result_abbrev field name, these are abbreviated as CD, F, CA,\n", + " and TO respectively.\n", + "\n", + " The job status is a user friendly mix of both, a job is always in one\n", + " of the following statuses: DEPEND, PRIORITY, SCHED, RUN, CLEANUP, COM‐\n", + " PLETED, FAILED, CANCELED, or TIMEOUT. 
Under the status_abbrev field\n", + " name, these are abbreviated as D, P, S, R, C, CD, F, CA, and TO respec‐\n", + " tively.\n", + "\n", + "OUTPUT FORMAT\n", + " The --format option can be used to specify an output format to flux\n", + " jobs using Python's string format syntax. For example, the following is\n", + " the format used for the default format:\n", + "\n", + " {id.f58:>12} ?:{queue:<8.8} {username:<8.8} {name:<10.10+} \\\n", + " {status_abbrev:>2.2} {ntasks:>6} {nnodes:>6h} \\\n", + " {contextual_time!F:>8h} {contextual_info}\n", + "\n", + " If a format field is preceded by the special string ?: this will cause\n", + " the field to be removed entirely from output if the result would be an\n", + " empty string or zero value for all jobs in the listing. E.g.:\n", + "\n", + " {id.f58:>12} ?:{exception.type}\n", + "\n", + " would eliminate the EXCEPTION-TYPE column if no jobs in the list re‐\n", + " ceived an exception. (Thus the job queue is only displayed if at least\n", + " one job has a queue assigned in the default format shown above).\n", + "\n", + " As a reminder to the reader, some shells will interpret braces ({ and\n", + " }) in the format string. They may need to be quoted.\n", + "\n", + " The special presentation type h can be used to convert an empty string,\n", + " \"0s\", \"0.0\", \"0:00:00\", or epoch time to a hyphen. For example, nor‐\n", + " mally \"{nodelist}\" would output an empty string if the job has not yet\n", + " run. By specifying, \"{nodelist:h}\", a hyphen would be presented in‐\n", + " stead.\n", + "\n", + " The special suffix + can be used to indicate if a string was truncated\n", + " by including a + character when truncation occurs. If both h and + are\n", + " being used, then the + must appear after the h.\n", + "\n", + " Additionally, the custom job formatter supports a set of special con‐\n", + " version flags. Conversion flags follow the format field and are used to\n", + " transform the value before formatting takes place. Currently, the fol‐\n", + " lowing conversion flags are supported by flux jobs:\n", + "\n", + " !D convert a timestamp field to ISO8601 date and time (e.g.\n", + " 2020-01-07T13:31:00). Defaults to empty string if timestamp\n", + " field does not exist or the timestamp is 0 (i.e epoch time).\n", + "\n", + " !d convert a timestamp to a Python datetime object. This allows\n", + " datetime specific format to be used, e.g. {t_inac‐\n", + " tive!d:%H:%M:%S}. Additionally, width and alignment can be spec‐\n", + " ified after the time format by using two colons (::), e.g.\n", + " {t_inactive!d:%H:%M:%S::>20}. Returns an empty string (or \"-\" if\n", + " the h suffix is used) for an unset timestamp.\n", + "\n", + " !F convert a time duration in floating point seconds to Flux Stan‐\n", + " dard Duration (FSD) string (e.g. {runtime!F}). Defaults to\n", + " empty string if field does not exist.\n", + "\n", + " !H convert a time duration in floating point seconds to hours:min‐\n", + " utes:seconds form (e.g. {runtime!H}). Defaults to empty string\n", + " if time duration field does not exist.\n", + "\n", + " !P convert a floating point number into a percentage fitting in 5\n", + " characters including the \"%\" character. E.g. 0.5 becomes \"50%\"\n", + " 0.015 becomes 1.5%, and 0.0005 becomes 0.05% etc.\n", + "\n", + " As a reminder to the reader, some shells will interpret the exclamation\n", + " point (!) when using a conversion flag. 
The exclamation point may need\n", + " to be escaped (\\!).\n", + "\n", + " Annotations can be retrieved via the annotations field name. Specific\n", + " keys and sub-object keys can be retrieved separated by a period (\".\").\n", + " For example, if the scheduler has annotated the job with a reason pend‐\n", + " ing status, it can be retrieved via \"{annotations.sched.reason_pend‐\n", + " ing}\".\n", + "\n", + " As a convenience, the field names sched and user can be used as substi‐\n", + " tutions for annotations.sched and annotations.user. For example, a\n", + " reason pending status can be retrieved via \"{sched.reason_pending}\".\n", + "\n", + " The field names that can be specified are:\n", + "\n", + " id job ID\n", + "\n", + " id.f58 job ID in RFC 19 F58 (base58) encoding\n", + "\n", + " id.f58plain\n", + " job ID in RFC 19 F58 encoding with ascii f\n", + "\n", + " id.dec job ID in decimal representation\n", + "\n", + " id.hex job ID in 0x prefix hexadecimal representation\n", + "\n", + " id.dothex\n", + " job ID in dotted hexadecimal representation (xx.xx.xx.xx)\n", + "\n", + " id.words\n", + " job ID in mnemonic encoding\n", + "\n", + " id.emoji\n", + " job ID in emoji encoding\n", + "\n", + " userid job submitter's userid\n", + "\n", + " username\n", + " job submitter's username\n", + "\n", + " urgency\n", + " job urgency\n", + "\n", + " priority\n", + " job priority\n", + "\n", + " dependencies\n", + " list of any currently outstanding job dependencies\n", + "\n", + " status job status (DEPEND, SCHED, RUN, CLEANUP, COMPLETED, FAILED, CAN‐\n", + " CELED, or TIMEOUT)\n", + "\n", + " status_abbrev\n", + " status but in a max 2 character abbreviation\n", + "\n", + " status_emoji\n", + " status but an appropriate emoji instead of job state / result\n", + "\n", + " name job name\n", + "\n", + " cwd job current working directory\n", + "\n", + " queue job queue\n", + "\n", + " project\n", + " job accounting project\n", + "\n", + " bank job accounting bank\n", + "\n", + " ntasks job task count\n", + "\n", + " ncores job core count\n", + "\n", + " duration\n", + " job duration in seconds\n", + "\n", + " nnodes job node count (if job ran / is running), empty string otherwise\n", + "\n", + " ranks job ranks (if job ran / is running), empty string otherwise\n", + "\n", + " nodelist\n", + " job nodelist (if job ran / is running), empty string otherwise\n", + "\n", + " state job state (DEPEND, SCHED, RUN, CLEANUP, INACTIVE)\n", + "\n", + " state_single\n", + " job state as a single character\n", + "\n", + " state_emoji\n", + " job state but an appropriate emoji instead of DEPEND, SCHED,\n", + " RUN, CLEANUP, or INACTIVE\n", + "\n", + " result job result if job is inactive (COMPLETED, FAILED, CANCELED,\n", + " TIMEOUT), empty string otherwise\n", + "\n", + " result_abbrev\n", + " result but in a max 2 character abbreviation\n", + "\n", + " result_emoji\n", + " result but an appropriate emoji instead of COMPLETED, FAILED,\n", + " CANCELED, or TIMEOUT\n", + "\n", + " success\n", + " True of False if job completed successfully, empty string other‐\n", + " wise\n", + "\n", + " waitstatus\n", + " The raw status of the job as returned by waitpid(2) if the job\n", + " exited, otherwise an empty string. Note: waitstatus is the maxi‐\n", + " mum wait status returned by all job shells in a job, which may\n", + " not necessarily indicate the highest task wait status. 
(The job\n", + " shell exits with the maximum task exit status, unless a task\n", + " died due to a signal, in which case the shell exits with\n", + " 128+signo)\n", + "\n", + " returncode\n", + " The job return code if the job has exited, or an empty string if\n", + " the job is still active. The return code of a job is the highest\n", + " job shell exit code, or negative signal number if the job shell\n", + " was terminated by a signal. If the job was canceled before it\n", + " started, then the returncode is set to the special value -128.\n", + "\n", + " exception.occurred\n", + " True of False if job had an exception, empty string otherwise\n", + "\n", + " exception.severity\n", + " If exception.occurred True, the highest severity, empty string\n", + " otherwise\n", + "\n", + " exception.type\n", + " If exception.occurred True, the highest severity exception type,\n", + " empty string otherwise\n", + "\n", + " exception.note\n", + " If exception.occurred True, the highest severity exception note,\n", + " empty string otherwise\n", + "\n", + " t_submit\n", + " time job was submitted\n", + "\n", + " t_depend\n", + " time job entered depend state\n", + "\n", + " t_run time job entered run state\n", + "\n", + " t_cleanup\n", + " time job entered cleanup state\n", + "\n", + " t_inactive\n", + " time job entered inactive state\n", + "\n", + " runtime\n", + " job runtime\n", + "\n", + " expiration\n", + " time at which job allocation was marked to expire\n", + "\n", + " t_remaining\n", + " If job is running, amount of time remaining before expiration\n", + "\n", + " annotations\n", + " annotations metadata, use \".\" to get specific keys\n", + "\n", + " sched short hand for annotations.sched\n", + "\n", + " user short hand for annotations.user\n", + "\n", + " Field names which are specific to jobs which are also instances of Flux\n", + " include:\n", + "\n", + " instance.stats\n", + " a short string describing current job statistics for the in‐\n", + " stance of the form PD:{pending} R:{running} CD:{successful}\n", + " F:{failed}\n", + "\n", + " instance.stats.total\n", + " total number of jobs in any state in the instance.\n", + "\n", + " instance.utilization\n", + " number of cores currently allocated divided by the total number\n", + " of cores. Can be formatted as a percentage with !P, e.g. {in‐\n", + " stance.utilization!P:>4}.\n", + "\n", + " instance.gpu_utilization\n", + " same as instance.utilization but for gpu resources\n", + "\n", + " instance.progress\n", + " number of inactive jobs divided by the total number of jobs.\n", + " Can be formatted as a percentage with {instance.progress!P:>4}\n", + "\n", + " instance.resources..{ncores,ngpus}\n", + " number of cores, gpus in state state, where state can be all,\n", + " up, down, allocated, or free, e.g. {instance.re‐\n", + " sources.all.ncores}\n", + "\n", + " The following fields may return different information depending on the\n", + " state of the job or other context:\n", + "\n", + " contextual_info\n", + " Returns selected information based on the job's current state.\n", + " If the job is in PRIORITY state, then the string priority-wait\n", + " is returned, if the job is in DEPEND state, then a list of out‐\n", + " standing dependencies is returned, if the job is in SCHED state\n", + " then an estimated time the job will run is returned (if the\n", + " scheduler supports it). 
Otherwise, the assigned nodelist is re‐\n", + " turned (if resources were assigned).\n", + "\n", + " contextual_info\n", + " Returns the job runtime for jobs in RUN state or later, other‐\n", + " wise the job duration (if set) is returned.\n", + "\n", + " inactive_reason\n", + " If the job is inactive, returns the reason that the job is no\n", + " longer active. Generally speaking, will output \"Exit\", \"Time‐\n", + " out\", \"Canceled\", or signal. If available, other contextual in‐\n", + " formation will also be provided such as the exit returncode or\n", + " cancellation message.\n", + "\n", + "CONFIGURATION\n", + " The flux jobs command supports registration of named output formats in\n", + " configuration files. The command loads configuration files from\n", + " flux-jobs.EXT from the following paths in order of increasing prece‐\n", + " dence:\n", + "\n", + " • $XDG_CONFIG_DIRS/flux or /etc/xdg/flux if XDG_CONFIG_DIRS is not\n", + " set. Note that XDG_CONFIG_DIRS is traversed in reverse order such\n", + " that entries first in the colon separated path are highest prior‐\n", + " ity.\n", + "\n", + " • $XDG_CONFIG_HOME/flux or $HOME/.config/flux if XDG_CONFIG_HOME is\n", + " not set\n", + "\n", + " where EXT can be one of toml, yaml, or json.\n", + "\n", + " If there are multiple flux-jobs.* files found in a directory, then they\n", + " are loaded in lexical order (i.e. .json first, then .toml, then .yaml)\n", + "\n", + " Named formats are registered in a formats table or dictionary with a\n", + " key per format pointing to a table or dictionary with the keys:\n", + "\n", + " format (required) The format string\n", + "\n", + " description\n", + " (optional) A short description of the named format, displayed\n", + " with flux jobs --format=help\n", + "\n", + " If a format name is specified in more than one config file, then the\n", + " last one loaded is used. Due to the order that flux jobs loads config\n", + " files, this allows user configuration to override system configuration.\n", + " It is an error to override any internally defined formats (such as de‐\n", + " fault).\n", + "\n", + " If a format name or string is not specified on the command line the in‐\n", + " ternally defined format default is used.\n", + "\n", + " Example:\n", + "\n", + " # $HOME/.config/flux/flux-jobs.toml\n", + "\n", + " [formats.myformat]\n", + " description = \"My useful format\"\n", + " format = \"\"\"\\\n", + " {id.f58:>12} {name:>8.8} {t_submit!D:<19} \\\n", + " {t_run!D:<19} {t_remaining!F}\\\n", + " \"\"\"\n", + "\n", + " It may be helpful to start with an existing named format by using the\n", + " --format=get-config=NAME option, e.g.:\n", + "\n", + " $ flux jobs --format=get-config=default >> ~/.config/flux/flux-jobs.toml\n", + "\n", + " Be sure to change the name of the format string from default. It is an\n", + " error to redefine the default format string.\n", + "\n", + "EXAMPLES\n", + " The default output of flux jobs will list the pending and running jobs\n", + " of the current user. 
It is equivalent to:\n", + "\n", + " $ flux jobs --filter=pending,running\n", + "\n", + " To list all pending, running, and inactive jobs, of the current user,\n", + " you can use --filter option or the -a option:\n", + "\n", + " $ flux jobs -a\n", + "\n", + " OR\n", + "\n", + " $ flux jobs --filter=pending,running,inactive\n", + "\n", + " To alter which user's jobs are listed, specify the user with --user:\n", + "\n", + " $ flux jobs --user=flux\n", + "\n", + " Jobs that have finished may be filtered further by specifying if they\n", + " have completed, failed, or were canceled. For example, the following\n", + " will list the jobs that have failed or were canceled:\n", + "\n", + " $ flux jobs --filter=failed,canceled\n", + "\n", + " The --format option can be used to alter the output format or output\n", + " additional information. For example, the following would output all\n", + " jobids for the user in decimal form, and output any annotations the\n", + " scheduler attached to each job:\n", + "\n", + " $ flux jobs -a --format=\"{id} {annotations.sched}\"\n", + "\n", + " The following would output the job id and exception information, so a\n", + " user can learn why a job failed.\n", + "\n", + " $ flux jobs --filter=failed --format=\"{id} {exception.type} {exception.note}\"\n", + "\n", + "RESOURCES\n", + " Flux: http://flux-framework.org\n", + "\n", + " Flux RFC: https://flux-framework.readthedocs.io/projects/flux-rfc\n", + "\n", + "SEE ALSO\n", + " flux-pstree(1)\n", + "\n", + "AUTHOR\n", + " This page is maintained by the Flux community.\n", + "\n", + "COPYRIGHT\n", + " Copyright 2014 Lawrence Livermore National Security, LLC and Flux de‐\n", + " velopers.\n", + "\n", + " SPDX-License-Identifier: LGPL-3.0\n", + "\n", + " Jun 06, 2024 FLUX-JOBS(1)\n" + ] + } + ], + "source": [ + "!flux help jobs" + ] + }, + { + "cell_type": "markdown", + "id": "17e435d6-0927-4966-a4d7-47a128c94158", + "metadata": { + "tags": [] + }, + "source": [ + "### What does the terminal prompt mean?\n", + "For cases when you need a terminal, we will ! However, you can also select `File -> New -> Terminal` to open one on the fly. Let's next talk about flux instances." + ] + }, + { + "cell_type": "markdown", + "id": "ec052119", + "metadata": {}, + "source": [ + "## Flux Resources\n", + "\n", + "When you are interacting with Flux, you will commonly want to know what resources are available to you. Flux uses [hwloc](https://github.com/open-mpi/hwloc) to detect the resources on each node and then to populate its resource graph.\n", + "\n", + "You can access the topology information that Flux collects with the `flux resource` subcommand. Let's run `flux resource list` to see the resources available to us in this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "scenic-chassis", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " STATE NNODES NCORES NGPUS NODELIST\n", + " free 4 38 0 8660c254a8e[5,5,5,5]\n", + " allocated 1 2 0 8660c254a8e5\n", + " down 0 0 0 \n" + ] + } + ], + "source": [ + "!flux resource list" + ] + }, + { + "cell_type": "markdown", + "id": "0086e47e", + "metadata": {}, + "source": [ + "Flux can also bootstrap its resource graph based on static input files, like in the case of a multi-user system instance setup by site administrators. [More information on Flux's static resource configuration files](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/admin.html#configuration). 
Flux provides a more standard interface to listing available resources that works regardless of the resource input source: `flux resource`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "prime-equilibrium", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " STATE UP NNODES NODELIST\n", + " avail \u001b[01;32m ✔\u001b[0;0m 4 8660c254a8e[5,5,5,5]\n" + ] + } + ], + "source": [ + "# To view status of resources\n", + "!flux resource status" + ] + }, + { + "cell_type": "markdown", + "id": "e6603d7f-dd45-4743-9efb-bf65ba7e2f22", + "metadata": {}, + "source": [ + "It might also be the case that you need to see queues. Here is how to do that:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c7fbe877-c0bf-4296-a20b-21809caa72d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " DEFAULTTIME TIMELIMIT NNODES NCORES NGPUS\n", + " inf inf 0-inf 0-inf 0-inf\n" + ] + } + ], + "source": [ + "!flux queue list" + ] + }, + { + "cell_type": "markdown", + "id": "dee2d6af-43fa-490e-88e9-10f13e660125", + "metadata": { + "tags": [] + }, + "source": [ + "
\n", + "\n", + "# Flux Commands \n", + "\n", + "Here are how Flux commands map to a scheduler you are likely familiar with, Slurm. A larger table with similar mappings for LSF, Moab, and Slurm can be [viewed here](https://hpc.llnl.gov/banks-jobs/running-jobs/batch-system-cross-reference-guides). For submitting jobs, you can use the `flux` `submit`, `run`, `bulksubmit`, `batch`, and `alloc` commands.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
| Operation | Slurm | Flux |
| --- | --- | --- |
| One-off run of a single job (blocking) | `srun` | `flux run` |
| One-off run of a single job (interactive) | `srun --pty` | `flux run -o pty.interactive` |
| One-off run of a single job (not blocking) | NA | `flux submit` |
| Bulk submission of jobs (not blocking) | NA | `flux bulksubmit` |
| Watching jobs | NA | `flux watch` |
| Querying the status of jobs | `squeue` / `scontrol show job job_id` | `flux jobs` / `flux job info job_id` |
| Canceling running jobs | `scancel` | `flux cancel` |
| Allocation for an interactive instance | `salloc` | `flux alloc` |
| Submitting batch jobs | `sbatch` | `flux batch` |
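
As a quick sketch of the mapping above (using `hostname` as a stand-in workload, and the `-N`/`-n` short options for nodes and tasks), the same blocking, one-off job looks like this in both systems:

```bash
# Slurm: run 4 tasks across 2 nodes and block until they finish
srun -N2 -n4 hostname

# Flux equivalent: same resources, same blocking behavior
flux run -N2 -n4 hostname
```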
" + ] + }, + { + "cell_type": "markdown", + "id": "ac798095", + "metadata": {}, + "source": [ + "## flux run\n", + "\n", + "
\n", + "Description: Running a single job (blocking)\n", + "
\n", + "\n", + "The `flux run` command submits a job to Flux (similar to `flux submit`) but then attaches to the job with `flux job attach`, printing the job's stdout/stderr to the terminal and exiting with the same exit code as the job. It's basically doing an interactive submit, because you will be able to watch the output in your terminal, and it will block your terminal until the job completes." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "52d26496-dd1f-44f7-bb10-8a9b4b8c9c80", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "749a39b51885\n" + ] + } + ], + "source": [ + "!flux run hostname" + ] + }, + { + "cell_type": "markdown", + "id": "53357a9d-11d8-4c2d-87d8-c30ae38d01ba", + "metadata": {}, + "source": [ + "The output from the previous command is the hostname (a container ID string in this case). If the job exits with a non-zero exit code this will be reported by `flux job attach` (occurs implicitly with `flux run`). For example, execute the following:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "fa40cb98-a138-4771-a7ef-f1860dddf7db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "flux-job: task(s) exited with exit code 1\n" + ] + } + ], + "source": [ + "!flux run /bin/false" + ] + }, + { + "cell_type": "markdown", + "id": "6b2b5c3f-e24a-45a8-a10c-e10bfdbb7b87", + "metadata": {}, + "source": [ + "A job submitted with `run` can be canceled with two rapid `Cltr-C`s in succession, or a user can detach from the job with `Ctrl-C Ctrl-Z`. The user can then re-attach to the job by using `flux job attach JOBID`." + ] + }, + { + "cell_type": "markdown", + "id": "81e5213d", + "metadata": {}, + "source": [ + "`flux submit` and `flux run` also support many other useful flags:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "02032748", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3: 8660c254a8e5\n", + "2: 8660c254a8e5\n", + "1: 8660c254a8e5\n", + "0: 8660c254a8e5\n" + ] + } + ], + "source": [ + "!flux run -n4 --label-io --time-limit=5s --env-remove=LD_LIBRARY_PATH hostname" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f52bb357-a7ce-458d-9c3f-4d664eca4fbd", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment and run this help command if you want to see all the flags for flux run\n", + "# !flux run --help" + ] + }, + { + "cell_type": "markdown", + "id": "7c09708a-74a1-4e61-b678-cb337b7df435", + "metadata": {}, + "source": [ + "## flux submit\n", + "\n", + "
\n", + "Description: Running a single job (not blocking)\n", + "
\n", + "\n", + "\n", + "The `flux submit` command submits a job to Flux and prints out the jobid." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "cc2bddee-f454-4674-80d4-4a39c5f1bee2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: flux submit [OPTIONS...] COMMAND [ARGS...]\n", + "\n", + "enqueue a job\n", + "\n", + "positional arguments:\n", + " command Job command and arguments\n", + "\n", + "options:\n", + " -h, --help show this help message and exit\n", + " -q, --queue=NAME Submit a job to a specific named queue\n", + " -t, --time-limit=MIN|FSD Time limit in minutes when no units provided,\n", + " otherwise in Flux standard duration, e.g. 30s,\n", + " 2d, 1.5h\n", + " --urgency=N Set job urgency (0-31), hold=0, default=16,\n", + " expedite=31\n" + ] + } + ], + "source": [ + "# Let's peek at the help for flux submit!\n", + "!flux submit --help | head -n 15" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8a5e7d41-1d8d-426c-8198-0ad4a57e7d04", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒckWM1ZXM\n" + ] + } + ], + "source": [ + "!flux submit hostname" + ] + }, + { + "cell_type": "markdown", + "id": "809292e5-3f24-4528-916f-8733d065de47", + "metadata": {}, + "source": [ + "But how does one get output? To quickly see output (which will block the terminal if the job is still running) after a submit, you can do:\n", + "\n", + "```bash\n", + "flux job attach $(flux job last)\n", + "```\n", + "\n", + "To provide a custom path to an output or error file, you can provide `--out` and `--err`, respectively. Let's try those both now." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "38a4da7f-2b84-4c67-9da1-02435005d392", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒckWM1ZXM\n", + "749a39b51885\n" + ] + } + ], + "source": [ + "# What was the last job id again?\n", + "! flux job last\n", + "\n", + "# Attach to the last job id that was submitted (will block if still running and stream output)\n", + "! flux job attach $(flux job last)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "89a851d3-0179-4e5e-9e20-93bc11b5056f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒfeTb2bBm\n", + "Did a polar bear with a soft drink write this...?! 🐻‍❄️🥤️😎️ \n" + ] + } + ], + "source": [ + "# Now let's submit another one, and give it the same output and error file\n", + "! flux submit --out /tmp/hola-cola.txt --err /tmp/hola-cola.txt echo \"Did a polar bear with a soft drink write this...?! 🐻‍❄️🥤️😎️ \"\n", + "\n", + "# Take a look!\n", + "! cat /tmp/hola-cola.txt" + ] + }, + { + "cell_type": "markdown", + "id": "a7e4c25e-3ca8-4277-bb70-a0e94bcd223b", + "metadata": {}, + "source": [ + "`submit` supports common options like `--nnodes`, `--ntasks`, and `--cores-per-task`. There are short option equivalents (`-N`, `-n`, and `-c`, respectively) of these options as well. `--cores-per-task=1` is the default." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "571d8c3d-b24a-415e-b9ac-f58b99a7e92c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VqVSHr7q\n" + ] + } + ], + "source": [ + "!flux submit -N1 -n2 sleep inf" + ] + }, + { + "cell_type": "markdown", + "id": "91e9ed6c", + "metadata": {}, + "source": [ + "## flux bulksubmit\n", + "\n", + "
\n", + "Description: Submitting jobs in bulk (not blocking)\n", + "
\n", + "\n", + "The `flux bulksubmit` command enqueues jobs based on a set of inputs which are substituted on the command line, similar to `xargs` and the GNU `parallel` utility, except the jobs have access to the resources of an entire Flux instance instead of only the local system." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f0e82702", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VqabmM3V\n", + "ƒ3VqabmM3W\n", + "ƒ3VqadFLKq\n", + "foo\n", + "bar\n", + "baz\n" + ] + } + ], + "source": [ + "!flux bulksubmit --watch --wait echo {} ::: foo bar baz" + ] + }, + { + "cell_type": "markdown", + "id": "60ba88b4-538d-4eb6-baf9-735581b4d717", + "metadata": {}, + "source": [ + "### carbon copy\n", + "\n", + "The `--cc` option (akin to \"carbon copy\") to `submit` makes repeated submission even easier via, `flux submit --cc=IDSET`:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0ea1962b-1831-4bd2-8dab-c61fd710df9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VqhAnAU7\n", + "ƒ3VqhAnAU8\n", + "ƒ3VqhAnAU9\n", + "ƒ3VqhAnAUA\n" + ] + } + ], + "source": [ + "!flux submit --cc=1-4 hostname" + ] + }, + { + "cell_type": "markdown", + "id": "27ca3706-8bb4-4fd6-a37c-e6135fb05604", + "metadata": {}, + "source": [ + "Try it in the with a progress bar and jobs/s rate report: `flux submit --cc=1-100 --watch --progress --jps hostname`\n", + "\n", + "Note that `--wait` is implied by `--watch`, meaning that when you are watching jobs, you are also waiting for them to finish. Here are some other carbon copy commands that are useful to try:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "8e93d8e3-9342-4edd-b262-757355ddfe9d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3Vqogq1L3\n", + "ƒ3Vqogq1L4\n" + ] + } + ], + "source": [ + "# Use flux carbon copy to submit identical jobs with different inputs\n", + "!flux submit --cc=\"1-2\" echo \"Hello I am job {cc}\"" + ] + }, + { + "cell_type": "markdown", + "id": "4c5a18ff-8d6a-47e9-a164-931ed1275ef4", + "metadata": {}, + "source": [ + "Here are some \"carbon copy\" jobs to try in the :\n", + "\n", + "```bash\n", + "# Use flux carbon copy to submit identical jobs with different inputs\n", + "flux submit --cc=\"1-10\" echo \"Hello I am job {cc}\"\n", + "\n", + "# Submits scripts myscript1.sh through myscript10.sh\n", + "flux submit --cc=0-6 flux-workflow-examples/bulksubmit/{cc}.sh\n", + "\n", + "# Bypass the key value store and write output to file with jobid\n", + "flux submit --cc=1-10 --output=job-{{id}}.out echo \"This is job {cc}\"\n", + "\n", + "# Use carbon copy to submit identical jobs with different inputs\n", + "flux bulksubmit --dry-run --cc={1} echo {0} ::: a b c ::: 0-1 0-3 0-7\n", + "```\n", + "\n", + "Of course, Flux can launch more than just single-node, single-core jobs. We can submit multiple heterogeneous jobs and Flux will co-schedule the jobs while also ensuring no oversubscription of resources (e.g., cores). Let's run the second example here, and add a clever trick to ask for output as we submit the jobs. This is a fun one, I promise!" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2f089be5-6d32-40db-b9e9-328e5200b754", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Once upon a time... 
📗️\n", + "There was a little duck 🦆️\n", + "Her name was pizzaquack 🍕️\n", + "She was very fond of cheese 🧀️\n", + "And running Flux 🌀️\n", + "And so she ran Flux, while she ate her cheese 😋️\n", + "And was so happy! The end. 🌈️\n" + ] + } + ], + "source": [ + "! for jobid in $(flux submit --cc=0-6 /bin/bash flux-workflow-examples/bulksubmit/{cc}.sh); do flux job attach ${jobid}; done" + ] + }, + { + "cell_type": "markdown", + "id": "6d3623b2-ca25-4d42-8e43-0c8e038464b4", + "metadata": {}, + "source": [ + "Note: in this tutorial, we cannot assume that the host you are running on has multiple cores, thus the examples below only vary the number of nodes per job. Varying the `cores-per-task` is also possible on Flux when the underlying hardware supports it (e.g., a multi-core node). Let's run the middle example - it's a fun one, I promise!" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "brazilian-former", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VqtrJWFh\n", + "ƒ3VqzNXq8B\n" + ] + } + ], + "source": [ + "!flux submit --nodes=2 --ntasks=2 --cores-per-task=1 --job-name simulation sleep inf\n", + "!flux submit --nodes=1 --ntasks=1 --cores-per-task=1 --job-name analysis sleep inf" + ] + }, + { + "cell_type": "markdown", + "id": "641f446c-b2e8-40d8-b6bd-eb6b9dba3c71", + "metadata": {}, + "source": [ + "## flux watch\n", + "\n", + "
\n", + "Description: 👀️ Watching jobs\n", + "
\n", + "\n", + "Wouldn't it be cool to submit a job and then watch it? Well, yeah! We can do this now with flux watch. Let's run a fun example, and then watch the output. We have sleeps in here interspersed with echos only to show you the live action! 🥞️\n", + "Also note a nice trick - you can always use `flux job last` to get the last JOBID.\n", + "Here is an example (not runnable, as notebooks don't support environment variables) for getting and saving a job id:\n", + "\n", + "```bash\n", + "flux submit hostname\n", + "JOBID=$(flux job last)\n", + "```\n", + "\n", + "And then you could use the variable `$JOBID` in your subsequent script or interactions with Flux! So what makes `flux watch` different from `flux job attach`? Aside from the fact that `flux watch` is read-only, `flux watch` can watch many (or even all (`flux watch --all`) jobs at once!" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5ad231c2-4cdb-4d18-afc2-7cb3a74759c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3Vr6FWywV\n", + "25 chocolate chip pancakes on the table... 25 chocolate chip pancakes! 🥞️\n", + "Eat a stack, for a snack, 15 chocolate chip pancakes on the table! 🥄️\n", + "15 chocolate chip pancakes on the table... 15 chocolate chip pancakes! 🥞️\n", + "Throw a stack... it makes a smack! 15 chocolate chip pancakes on the wall! 🥞️\n", + "You got some cleaning to do 🧽️\n" + ] + } + ], + "source": [ + "!flux submit ./flux-workflow-examples/job-watch/job-watch.sh\n", + "!flux watch $(flux job last)" + ] + }, + { + "cell_type": "markdown", + "id": "3f8c2af2", + "metadata": {}, + "source": [ + "## flux jobs\n", + "\n", + "
\n", + "Description: Querying the status of jobs\n", + "
\n", + "\n", + "We can now list the jobs in the queue with `flux jobs` and we should see both jobs that we just submitted. Jobs that are instances are colored blue in output, red jobs are failed jobs, and green jobs are those that completed successfully. Note that the JupyterLab notebook may not display these colors. You will be able to see them in the terminal." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "institutional-vocabulary", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + " ƒ3VqzNXq8B jovyan analysis R 1 1 10.49s 8660c254a8e5\n", + " ƒ3VqtrJWFh jovyan simulation R 2 2 10.71s 8660c254a8e[5,5]\n", + " ƒ3VqVSHr7q jovyan sleep R 2 1 11.62s 8660c254a8e5\n", + " ƒnyvM4Nb jovyan sleep R 2 1 5.269h 8660c254a8e5\n" + ] + } + ], + "source": [ + "!flux jobs" + ] + }, + { + "cell_type": "markdown", + "id": "f7228e0e-557c-455c-9903-073ef40a56a5", + "metadata": {}, + "source": [ + "You might also want to see \"all\" jobs with `-a`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "70dd1459-e21f-46b5-84a4-bd165cf97f4b", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + " ƒ3VqzNXq8B jovyan analysis R 1 1 10.71s 8660c254a8e5\n", + " ƒ3VqtrJWFh jovyan simulation R 2 2 10.92s 8660c254a8e[5,5]\n", + " ƒ3VqVSHr7q jovyan sleep R 2 1 11.84s 8660c254a8e5\n", + " ƒnyvM4Nb jovyan sleep R 2 1 5.269h 8660c254a8e5\n", + "\u001b[01;32m ƒ3Vr6FWywV jovyan job-watch+ CD 1 1 10.03s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3Vqogq1L3 jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3Vqogq1L4 jovyan echo CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAUA jovyan hostname CD 1 1 0.060s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU9 jovyan hostname CD 1 1 0.050s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU8 jovyan hostname CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU7 jovyan hostname CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqadFLKq jovyan echo CD 1 1 0.025s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqabmM3W jovyan echo CD 1 1 0.025s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqabmM3V jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqNqo3Qs jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqBbUVR9 jovyan hostname CD 4 1 0.017s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3Vq5LFXNf jovyan false F 1 1 0.037s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VpyWEM83 jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VPB8ZEqV jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3V7Tprhqh jovyan echo CD 1 1 0.060s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3V35oKmEo jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mzETcgvB jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mnMLCXPd jovyan echo CD 1 1 0.036s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mfhe5NCX jovyan echo CD 1 1 0.036s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FE jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y2 jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y8 jovyan sleep CD 1 1 0.078s 
8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xx jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y5 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y7 jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y1 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xs jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xt jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y3 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gd jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y4 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FF jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FG jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xv jovyan sleep CD 1 1 0.117s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FH jovyan sleep CD 1 1 0.073s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xu jovyan sleep CD 1 1 0.117s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FD jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xw jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y6 jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xz jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gc jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gb jovyan sleep CD 1 1 0.087s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xy jovyan sleep CD 1 1 0.086s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gX jovyan sleep CD 1 1 0.101s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QK jovyan sleep CD 1 1 0.109s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QJ jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QD jovyan sleep CD 1 1 0.111s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QG jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gY jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266ga jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gZ jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QL jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QH jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QF jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QE jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QC jovyan sleep CD 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887z jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887y jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QB jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887w jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887x jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887v jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZN jovyan sleep CD 1 1 0.091s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887s jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887u jovyan sleep 
CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEwe8qV jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZL jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887r jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887t jovyan sleep CD 1 1 0.073s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZM jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887q jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEwe8qW jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZG jovyan sleep CD 1 1 0.082s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZF jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9Z9 jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZE jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZB jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZA jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZH jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZK jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZJ jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH6 jovyan sleep CD 1 1 0.081s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZD jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZC jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH3 jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH5 jovyan sleep CD 1 1 0.072s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH2 jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH1 jovyan sleep CD 1 1 0.067s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGw jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGx jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH4 jovyan sleep CD 1 1 0.063s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGz jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGv jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzd jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGy jovyan sleep CD 1 1 0.058s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGt jovyan sleep CD 1 1 0.059s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGu jovyan sleep CD 1 1 0.058s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzb jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzc jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAza jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAze jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGq jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGr jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGp jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGs jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzY jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzZ jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m 
ƒ2eEEsCAzf jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzi jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGo jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzg jovyan sleep CD 1 1 0.080s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzj jovyan sleep CD 1 1 0.071s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzW jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzX jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiM jovyan sleep CD 1 1 0.112s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiJ jovyan sleep CD 1 1 0.111s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzh jovyan sleep CD 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiK jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiL jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiH jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzT jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzV jovyan sleep CD 1 1 0.092s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzU jovyan sleep CD 1 1 0.092s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiG jovyan sleep CD 1 1 0.087s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiA jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiF jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiD jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiE jovyan sleep CD 1 1 0.070s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRy jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi9 jovyan sleep CD 1 1 0.068s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi8 jovyan sleep CD 1 1 0.066s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiB jovyan sleep CD 1 1 0.065s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiC jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi7 jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRu jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRw jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRt jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRx jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRr jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRv jovyan sleep CD 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRs jovyan sleep CD 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRq jovyan sleep CD 1 1 0.059s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRp jovyan sleep CD 1 1 0.114s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9d jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9Y jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9e jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9b jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9W jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRm jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9f jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + 
"\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9a jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9Z jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9X jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsC jovyan sleep CD 1 1 0.100s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9c jovyan sleep CD 1 1 0.099s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9U jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9T jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9R jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRn jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9S jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9V jovyan sleep CD 1 1 0.091s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRo jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvN jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs5 jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs8 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvQ jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs9 jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs7 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs6 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvP jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvR jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvS jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsB jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvK jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvJ jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvT jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvM jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvL jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvU jovyan sleep CD 1 1 0.071s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsA jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvH jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe7 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvG jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe4 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe6 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvF jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe5 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe8 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMm jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMg jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMf jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMi jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdy jovyan sleep CD 1 1 0.192s 
8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdv jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMh jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe2 jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe3 jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdw jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdz jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe9 jovyan sleep CD 1 1 0.191s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMp jovyan sleep CD 1 1 0.175s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMj jovyan sleep CD 1 1 0.173s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdx jovyan sleep CD 1 1 0.174s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMq jovyan sleep CD 1 1 0.173s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe1 jovyan sleep CD 1 1 0.174s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMn jovyan sleep CD 1 1 0.171s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMZ jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5G jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5F jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5E jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdu jovyan sleep CD 1 1 0.166s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMo jovyan sleep CD 1 1 0.167s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMk jovyan sleep CD 1 1 0.166s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMd jovyan sleep CD 1 1 0.165s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMb jovyan sleep CD 1 1 0.163s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMe jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMa jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMc jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5D jovyan sleep CD 1 1 0.032s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YnijmLwy jovyan compute.py F 1 1 0.031s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YiqfxNdm jovyan compute.py F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYgVHnyV jovyan compute.py F 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYE7Ja9d jovyan compute.py F 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2Fr5PCm9h jovyan ./sub_job+ CD 1 1 31.58s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒS4xykqnw jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3wSjr2ik jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3wSjr2ij jovyan echo CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rj jovyan hostname CD 1 1 0.030s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Ri jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rh jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rk jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsB jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsA jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgs9 jovyan Hello I a+ F 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsC jovyan Hello I a+ F 1 1 0.012s 8660c254a8e5\n", + 
"\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2U jovyan echo CD 1 1 0.032s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2T jovyan echo CD 1 1 0.027s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2V jovyan echo CD 1 1 0.024s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQkA jovyan echo CD 1 1 0.024s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQkB jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk9 jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk8 jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk7 jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGgyRTn jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGgyRTm jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiB jovyan hostname CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiC jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiD jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiE jovyan hostname CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhg jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhf jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhd jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhe jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5VSGEC jovyan echo CD 1 1 0.051s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5TxGwq jovyan echo CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5VSGEB jovyan echo CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒm4dLyPD jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒioKWajq jovyan hostname CD 4 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒhFVr6U7 jovyan false F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒgmdsbJF jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m" + ] + } + ], + "source": [ + "!flux jobs -a" + ] + }, + { + "cell_type": "markdown", + "id": "77ca4277", + "metadata": {}, + "source": [ + "## flux cancel\n", + "\n", + "
\n", + "Description: Canceling running jobs\n", + "
\n", + "\n", + "Since some of the jobs we see in the table above won't ever exit (and we didn't specify a timelimit), let's cancel them all now and free up the resources." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "46dd8ec8-6c64-4d8d-9a00-949f5f58c07b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "flux-cancel: Canceled 4 jobs (0 errors)\n", + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n" + ] + } + ], + "source": [ + "# This was previously flux cancelall -f\n", + "!flux cancel --all\n", + "!flux jobs" + ] + }, + { + "cell_type": "markdown", + "id": "2d3e314e-98eb-487a-ad8e-1442840e37d8", + "metadata": {}, + "source": [ + "## flux alloc\n", + "\n", + "
\n", + "Description: Allocation for an interactive instance\n", + "
\n", + "\n", + "You might want to request an allocation for a set of resources (an allocation) and then attach to them interactively. This is the goal of flux alloc. Since we can't easily do that in a cell, try opening up the and doing: \n", + "\n", + "```bash\n", + "# Look at the resources you have outside of the allocation\n", + "flux resource list\n", + "\n", + "# Request an allocation with 2 \"nodes\" - a subset of what you have in total\n", + "flux alloc -N 2\n", + "\n", + "# See the resources you are given\n", + "flux resource list\n", + "\n", + "# You can exit from the allocation like this!\n", + "exit\n", + "```\n", + "When you want to automate this, submitting work to an allocation, you would use `flux batch`." + ] + }, + { + "cell_type": "markdown", + "id": "544aa0a9", + "metadata": {}, + "source": [ + "## flux batch\n", + "\n", + "
\n", + "Description: Submitting batch jobs\n", + "
\n", + "\n", + "We can use the `flux batch` command to easily created nested flux instances. When `flux batch` is invoked, Flux will automatically create a nested instance that spans the resources allocated to the job, and then Flux runs the batch script passed to `flux batch` on rank 0 of the nested instance. \"Rank\" refers to the rank of the Tree-Based Overlay Network (TBON) used by the [Flux brokers](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-broker.html).\n", + "\n", + "While a batch script is expected to launch parallel jobs using `flux run` or `flux submit` at this level, nothing prevents the script from further batching other sub-batch-jobs using the `flux batch` interface, if desired." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "blank-carpet", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3Vw1mYfjD\n", + "ƒ3Vw6xW9wD\n" + ] + } + ], + "source": [ + "!flux batch --nslots=2 --cores-per-slot=1 --nodes=2 ./sleep_batch.sh\n", + "!flux batch --nslots=2 --cores-per-slot=1 --nodes=2 ./sleep_batch.sh" + ] + }, + { + "cell_type": "markdown", + "id": "da98bfa1", + "metadata": {}, + "source": [ + "Take a quick look at [sleep_batch.sh](sleep_batch.sh) to see what we are about to run." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "edff8993-3c39-4f46-939d-4c8be5739fbc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VwC9Te9D\n", + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + "\u001b[01;34m ƒ3Vw6xW9wD jovyan ./sleep_b+ R 2 2 0.368s 8660c254a8e[5,5]\n", + "\u001b[0;0m\u001b[01;34m ƒ3Vw1mYfjD jovyan ./sleep_b+ R 2 2 0.572s 8660c254a8e[5,5]\n", + "\u001b[0;0m JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + "\u001b[01;34m ƒ3Vw6xW9wD jovyan ./sleep_b+ R 2 2 0.536s 8660c254a8e[5,5]\n", + "\u001b[0;0m\u001b[01;34m ƒ3Vw1mYfjD jovyan ./sleep_b+ R 2 2 0.741s 8660c254a8e[5,5]\n", + "\u001b[0;0m\n", + "ƒ3Vw6xW9wD:\n", + "\n", + "ƒ3Vw1mYfjD:\n" + ] + } + ], + "source": [ + "# Here we are submitting a job that generates output, and asking to write it to /tmp/cheese.txt\n", + "!flux submit --out /tmp/cheese.txt echo \"Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️\"\n", + "\n", + "# This will show us JOBIDs\n", + "!flux jobs\n", + "\n", + "# We can even see jobs in sub-instances with \"-R\" (for recursive)\n", + "!flux jobs -R" + ] + }, + { + "cell_type": "markdown", + "id": "7f2b135c-ece7-45f7-b25d-dc90ba5f44f7", + "metadata": {}, + "source": [ + "### `flux job`\n", + "\n", + "Let's next inspect the last job we ran with `flux job info` and target the last job identifier with `flux job last`. " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "429eb39d-d19c-4170-9707-ca8c3b2bfe87", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"version\": 1, \"execution\": {\"R_lite\": [{\"rank\": \"2\", \"children\": {\"core\": \"7\"}}], \"nodelist\": [\"8660c254a8e5\"], \"starttime\": 1721520196, \"expiration\": 4875116178}}\n", + "0: stdout redirected to /tmp/cheese.txt\n", + "0: stderr redirected to /tmp/cheese.txt\n" + ] + }, + { + "data": { + "text/html": [ + "
Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️\n",
+       "
\n" + ], + "text/latex": [ + "\\begin{Verbatim}[commandchars=\\\\\\{\\}]\n", + "Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️\n", + "\\end{Verbatim}\n" + ], + "text/plain": [ + "Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Note here we are using flux job last to see the last job id\n", + "# The \"R\" here asks for the resource spec\n", + "!flux job info $(flux job last) R\n", + "\n", + "# When we attach it will direct us to our output file\n", + "!flux job attach $(flux job last)\n", + "\n", + "# And we can look at the output file to see our expected output!\n", + "from IPython.display import Code\n", + "Code(filename='/tmp/cheese.txt', language='text')" + ] + }, + { + "cell_type": "markdown", + "id": "f4e525e2-6c89-4c14-9fae-d87a0d4fc574", + "metadata": {}, + "source": [ + "We can again see a list all completed jobs with `flux jobs -a`:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "df8a8b7c-f475-4a51-8bc6-9983dc9d78ab", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + "\u001b[01;34m ƒ3Vw6xW9wD jovyan ./sleep_b+ R 2 2 0.998s 8660c254a8e[5,5]\n", + "\u001b[0;0m\u001b[01;34m ƒ3Vw1mYfjD jovyan ./sleep_b+ R 2 2 1.202s 8660c254a8e[5,5]\n", + "\u001b[0;0m\u001b[01;32m ƒ3VwC9Te9D jovyan echo CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[37m ƒnyvM4Nb jovyan sleep CA 2 1 5.269h 8660c254a8e5\n", + "\u001b[0;0m\u001b[37m ƒ3VqVSHr7q jovyan sleep CA 2 1 12.04s 8660c254a8e5\n", + "\u001b[0;0m\u001b[37m ƒ3VqzNXq8B jovyan analysis CA 1 1 10.91s 8660c254a8e5\n", + "\u001b[0;0m\u001b[37m ƒ3VqtrJWFh jovyan simulation CA 2 2 11.12s 8660c254a8e[5,5]\n", + "\u001b[0;0m\u001b[01;32m ƒ3Vr6FWywV jovyan job-watch+ CD 1 1 10.03s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3Vqogq1L3 jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3Vqogq1L4 jovyan echo CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAUA jovyan hostname CD 1 1 0.060s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU9 jovyan hostname CD 1 1 0.050s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU8 jovyan hostname CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqhAnAU7 jovyan hostname CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqadFLKq jovyan echo CD 1 1 0.025s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqabmM3W jovyan echo CD 1 1 0.025s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqabmM3V jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqNqo3Qs jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VqBbUVR9 jovyan hostname CD 4 1 0.017s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3Vq5LFXNf jovyan false F 1 1 0.037s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VpyWEM83 jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VPB8ZEqV jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3V7Tprhqh jovyan echo CD 1 1 0.060s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3V35oKmEo jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mzETcgvB jovyan echo CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mnMLCXPd jovyan echo CD 1 1 0.036s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2mfhe5NCX jovyan echo 
CD 1 1 0.036s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FE jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y2 jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y8 jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xx jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y5 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y7 jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y1 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xs jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xt jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y3 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gd jovyan sleep CD 1 1 0.118s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y4 jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FF jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FG jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xv jovyan sleep CD 1 1 0.117s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FH jovyan sleep CD 1 1 0.073s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xu jovyan sleep CD 1 1 0.117s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF545FD jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xw jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5y6 jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xz jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gc jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gb jovyan sleep CD 1 1 0.087s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF3a5xy jovyan sleep CD 1 1 0.086s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gX jovyan sleep CD 1 1 0.101s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QK jovyan sleep CD 1 1 0.109s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QJ jovyan sleep CD 1 1 0.107s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QD jovyan sleep CD 1 1 0.111s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QG jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gY jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266ga jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEF266gZ jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QL jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QH jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QF jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QE jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QC jovyan sleep CD 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887z jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887y jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEzc7QB jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887w jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887x jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m 
ƒ2eEEy887v jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZN jovyan sleep CD 1 1 0.091s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887s jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887u jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEwe8qV jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZL jovyan sleep CD 1 1 0.077s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887r jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887t jovyan sleep CD 1 1 0.073s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZM jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEy887q jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEwe8qW jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZG jovyan sleep CD 1 1 0.082s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZF jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9Z9 jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZE jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZB jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZA jovyan sleep CD 1 1 0.094s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZH jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZK jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZJ jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH6 jovyan sleep CD 1 1 0.081s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZD jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEvA9ZC jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH3 jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH5 jovyan sleep CD 1 1 0.072s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH2 jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH1 jovyan sleep CD 1 1 0.067s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGw jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGx jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAH4 jovyan sleep CD 1 1 0.063s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGz jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGv jovyan sleep CD 1 1 0.083s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzd jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGy jovyan sleep CD 1 1 0.058s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGt jovyan sleep CD 1 1 0.059s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGu jovyan sleep CD 1 1 0.058s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzb jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzc jovyan sleep CD 1 1 0.108s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAza jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAze jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGq jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGr jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGp jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + 
"\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGs jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzY jovyan sleep CD 1 1 0.088s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzZ jovyan sleep CD 1 1 0.085s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzf jovyan sleep CD 1 1 0.084s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzi jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEtgAGo jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzg jovyan sleep CD 1 1 0.080s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzj jovyan sleep CD 1 1 0.071s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzW jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzX jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiM jovyan sleep CD 1 1 0.112s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiJ jovyan sleep CD 1 1 0.111s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzh jovyan sleep CD 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiK jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiL jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiH jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzT jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzV jovyan sleep CD 1 1 0.092s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEsCAzU jovyan sleep CD 1 1 0.092s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiG jovyan sleep CD 1 1 0.087s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiA jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiF jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiD jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiE jovyan sleep CD 1 1 0.070s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRy jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi9 jovyan sleep CD 1 1 0.068s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi8 jovyan sleep CD 1 1 0.066s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiB jovyan sleep CD 1 1 0.065s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBiC jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEqiBi7 jovyan sleep CD 1 1 0.064s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRu jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRw jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRt jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRx jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRr jovyan sleep CD 1 1 0.053s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRv jovyan sleep CD 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRs jovyan sleep CD 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRq jovyan sleep CD 1 1 0.059s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRp jovyan sleep CD 1 1 0.114s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9d jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9Y jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9e jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9b jovyan sleep CD 1 1 0.123s 
8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9W jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRm jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9f jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9a jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9Z jovyan sleep CD 1 1 0.123s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9X jovyan sleep CD 1 1 0.122s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsC jovyan sleep CD 1 1 0.100s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9c jovyan sleep CD 1 1 0.099s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9U jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9T jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9R jovyan sleep CD 1 1 0.095s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRn jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9S jovyan sleep CD 1 1 0.093s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEnkD9V jovyan sleep CD 1 1 0.091s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEpECRo jovyan sleep CD 1 1 0.090s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvN jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs5 jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs8 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvQ jovyan sleep CD 1 1 0.105s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs9 jovyan sleep CD 1 1 0.103s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs7 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDs6 jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvP jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvR jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvS jovyan sleep CD 1 1 0.104s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsB jovyan sleep CD 1 1 0.078s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvK jovyan sleep CD 1 1 0.079s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvJ jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvT jovyan sleep CD 1 1 0.075s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvM jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvL jovyan sleep CD 1 1 0.076s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvU jovyan sleep CD 1 1 0.071s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEmGDsA jovyan sleep CD 1 1 0.069s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvH jovyan sleep CD 1 1 0.074s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe7 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvG jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe4 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe6 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEQWPvF jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe5 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe8 jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMm jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMg jovyan sleep 
CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMf jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMi jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdy jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdv jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMh jovyan sleep CD 1 1 0.193s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe2 jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe3 jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdw jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdz jovyan sleep CD 1 1 0.192s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe9 jovyan sleep CD 1 1 0.191s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMp jovyan sleep CD 1 1 0.175s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMj jovyan sleep CD 1 1 0.173s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdx jovyan sleep CD 1 1 0.174s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMq jovyan sleep CD 1 1 0.173s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qe1 jovyan sleep CD 1 1 0.174s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMn jovyan sleep CD 1 1 0.171s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMZ jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5G jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5F jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5E jovyan sleep CD 1 1 0.168s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEP2Qdu jovyan sleep CD 1 1 0.166s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMo jovyan sleep CD 1 1 0.167s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMk jovyan sleep CD 1 1 0.166s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMd jovyan sleep CD 1 1 0.165s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMb jovyan sleep CD 1 1 0.163s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMe jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMa jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEMYRMc jovyan sleep CD 1 1 0.162s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2eEEL4S5D jovyan sleep CD 1 1 0.032s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YnijmLwy jovyan compute.py F 1 1 0.031s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YiqfxNdm jovyan compute.py F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYgVHnyV jovyan compute.py F 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYE7Ja9d jovyan compute.py F 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2Fr5PCm9h jovyan ./sub_job+ CD 1 1 31.58s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒS4xykqnw jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3wSjr2ik jovyan echo CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3wSjr2ij jovyan echo CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rj jovyan hostname CD 1 1 0.030s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Ri jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rh jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3mYvC1Rk jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsB jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + 
"\u001b[0;0m\u001b[01;31m ƒ3cZKNgsA jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgs9 jovyan Hello I a+ F 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsC jovyan Hello I a+ F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2U jovyan echo CD 1 1 0.032s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2T jovyan echo CD 1 1 0.027s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGjwQ2V jovyan echo CD 1 1 0.024s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQkA jovyan echo CD 1 1 0.024s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQkB jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk9 jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk8 jovyan echo CD 1 1 0.023s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGiTQk7 jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGgyRTn jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ3VGgyRTm jovyan echo CD 1 1 0.018s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiB jovyan hostname CD 1 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiC jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiD jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒ2rpm1UiE jovyan hostname CD 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhg jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhf jovyan hostname CD 1 1 0.016s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhd jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒxah9Lhe jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5VSGEC jovyan echo CD 1 1 0.051s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5TxGwq jovyan echo CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒp5VSGEB jovyan echo CD 1 1 0.047s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒm4dLyPD jovyan hostname CD 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒioKWajq jovyan hostname CD 4 1 0.015s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒhFVr6U7 jovyan false F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;32m ƒgmdsbJF jovyan hostname CD 1 1 0.013s 8660c254a8e5\n", + "\u001b[0;0m" + ] + } + ], + "source": [ + "!flux jobs -a" + ] + }, + { + "cell_type": "markdown", + "id": "3e415ecc-f451-4909-a2bf-351a639cd7fa", + "metadata": {}, + "source": [ + "To restrict the output to failed (i.e., jobs that exit with nonzero exit code, time out, or are canceled or killed) jobs, run:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "032597d2-4b02-47ea-a5e5-915313cdd7f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", + "\u001b[01;31m ƒ3Vq5LFXNf jovyan false F 1 1 0.037s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YnijmLwy jovyan compute.py F 1 1 0.031s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YiqfxNdm jovyan compute.py F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYgVHnyV jovyan compute.py F 1 1 0.062s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ2YYE7Ja9d jovyan compute.py F 1 1 0.048s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsB jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsA jovyan Hello I a+ F 1 1 0.014s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgs9 jovyan Hello I a+ F 1 1 0.015s 
8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒ3cZKNgsC jovyan Hello I a+ F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m\u001b[01;31m ƒhFVr6U7 jovyan false F 1 1 0.012s 8660c254a8e5\n", + "\u001b[0;0m" + ] + } + ], + "source": [ + "!flux jobs -f failed" + ] + }, + { + "cell_type": "markdown", + "id": "6bc17bac-2fc4-4418-8939-e930f9929976", + "metadata": {}, + "source": [ + "### flux submit from within a batch\n", + "\n", + "Next open up [hello-batch.sh](hello-batch.sh) to see an example of using `flux batch` to submit jobs within the instance, and then wait for them to finish. This script is going to:\n", + "\n", + "1. Create a flux instance with the top level resources you specify\n", + "2. Submit jobs to the scheduler controlled by the broker of that sub-instance\n", + "3. Run the four jobs, with `--flags=waitable` and `flux job wait --all` to wait for the output file\n", + "4. Within the batch script, you can add `--wait` or `--flags=waitable` to individual jobs, and use `flux queue drain` to wait for the queue to drain, _or_ `flux job wait --all` to wait for the jobs you flagged to finish. \n", + "\n", + "Note that when you submit a batch job, you'll get a job id back for the _batch job_, and usually when you look at the output of that with `flux job attach $jobid` you will see the output file(s) where the internal contents are written. Since we want to print the output file easily to the terminal, we are waiting for the batch job by adding the `--flags=waitable` and then waiting for it. Let's try to run our batch job now." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "72358a03-6f1f-4c5e-91eb-cab71883a232", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VwkUsydR\n", + "ƒ3VwkUsydR\n", + "Hello job 1 from 8660c254a8e5 💛️\n", + "Hello job 2 from 8660c254a8e5 💚️\n", + "Hello job 3 from 8660c254a8e5 💙️\n", + "Hello job 4 from 8660c254a8e5 💜️\n" + ] + } + ], + "source": [ + "! flux batch --flags=waitable --out /tmp/flux-batch.out -N2 ./hello-batch.sh\n", + "! flux job wait\n", + "! cat /tmp/hello-batch-1.out\n", + "! cat /tmp/hello-batch-2.out\n", + "! cat /tmp/hello-batch-3.out\n", + "! cat /tmp/hello-batch-4.out" + ] + }, + { + "cell_type": "markdown", + "id": "75c0ae3f-2813-4ae8-83be-00be3df92a4b", + "metadata": {}, + "source": [ + "Each of `flux batch` and `flux alloc` hints at creating a Flux instance. How deep can we go into that rabbit hole, perhaps for jobs and workflows with nested logic or more orchestration complexity?" + ] + }, + { + "cell_type": "markdown", + "id": "04b405b1-219f-489c-abfc-e2983e82124a", + "metadata": {}, + "source": [ + "### The Flux Hierarchy 🍇️\n", + "\n", + "One feature of the Flux Framework scheduler that is unique is its ability to submit jobs within instances, where an instance can be thought of as a level in a graph. Let's start with a basic image - this is what it might look like to submit to a scheduler that is not graph-based (left), where all jobs go to a central job queue or database. Note that our maximum job throughput is one job per second. The throughput is limited by the workload manager's ability to process a single job. We can improve upon this by simply adding another level, perhaps with three instances. For example, let's say we create a flux allocation or batch that has control of some number of child nodes. 
We might launch three new instances (each with its own scheduler and queue, right image) at that level two, and all of a sudden, we get a throughput of 1x3, or three jobs per second.\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "All of a sudden, the throughout can increase exponentially because we are essentially submitting to different schedulers. The example above is not impressive, but our [learning guide](https://flux-framework.readthedocs.io/en/latest/guides/learning_guide.html#fully-hierarchical-resource-management-techniques) (Figure 10) has a beautiful example of how it can scale, done via an actual experiment. We were able to submit 500 jobs/second using only three levels, vs. close to 1 job/second with one level. For an interesting detail, you can vary the scheduler algorithm or topology within each sub-instance, meaning that you can do some fairly interesting things with scheduling work, and all without stressing the top level system instance. \n", + "\n", + "Now that we understand nested instances, let's look at another batch example that better uses them. Here we have two job scripts:\n", + "\n", + "- [sub_job1.sh](sub_job1.sh): Is going to be run with `flux batch` and submit sub_job2.sh\n", + "- [sub_job2.sh](sub_job2.sh): Is going to be submitted by sub_job1.sh.\n", + "\n", + "Take a look at each script to see how they work, and then submit it!" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "8640a611-38e4-42b1-a913-89e0c76c8014", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3Vxb9eQBy\n" + ] + } + ], + "source": [ + "!flux batch -N1 ./sub_job1.sh" + ] + }, + { + "cell_type": "markdown", + "id": "b29c3a4a-2b77-4ab9-8e0c-9f5228e61016", + "metadata": {}, + "source": [ + "And now that we've submitted, let's look at the hierarchy for all the jobs we just ran. Here is how to try flux pstree, which normally can show jobs in an instance, but it has limited functionality given we are in a notebook! So instead of just running the single command, let's add \"-a\" to indicate \"show me ALL jobs.\"\n", + "More complex jobs and in a different environment would have deeper nesting. You can [see examples here](https://flux-framework.readthedocs.io/en/latest/jobs/hierarchies.html?h=pstree#flux-pstree-command)." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2d2b1f0b-e6c2-4583-8068-7c76fa341884", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".\n", + "├── ./sub_job1.sh\n", + "├── ./sleep_batch.sh\n", + "│ └── sleep:R\n", + "├── ./sleep_batch.sh\n", + "│ └── sleep:R\n", + "├── ./hello-batch.sh:CD\n", + "├── 28*[echo:CD]\n", + "├── 2*[sleep:CA]\n", + "├── analysis:CA\n", + "├── simulation:CA\n", + "├── job-watch.sh:CD\n", + "├── 22*[hostname:CD]\n", + "├── 2*[false:F]\n", + "├── 200*[sleep:CD]\n", + "├── 4*[compute.py:F]\n", + "├── ./sub_job1.sh:CD\n", + "├── Hello I am job 3:F\n", + "├── Hello I am job 2:F\n", + "├── Hello I am job 1:F\n", + "└── Hello I am job 4:F\n" + ] + } + ], + "source": [ + "!flux pstree -a" + ] + }, + { + "cell_type": "markdown", + "id": "7724130f-b0db-4ccf-a01e-98907b9a27ca", + "metadata": {}, + "source": [ + "You can also try a more detailed view with `flux pstree -a -X`!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "72567af7-aa40-46b7-be43-c9e8124c1c7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "flux-archive: shared-file.txt: write: Attempt to overwrite existing file\n", + "flux-archive: shared-file.txt: write: Attempt to overwrite existing file\n", + "flux-archive: shared-file.txt: write: Attempt to overwrite existing file\n", + "[1-3]: Exit 1\n" + ] + } + ], + "source": [ + "!flux exec -r all -x 0 flux archive extract --name myarchive --directory $(pwd) shared-file.txt" + ] + }, + { + "cell_type": "markdown", + "id": "eda1a33c-9f9e-4ba0-a013-e97601f79e41", + "metadata": {}, + "source": [ + "
\n", + "\n", + "# Process, Monitoring, and Job Utilities ⚙️\n", + "\n", + "## flux exec 👊️\n", + "\n", + "
\n", + "Description: Executing commands across ranks\n", + "
\n", + "\n", + "Have you ever wanted a quick way to execute a command to all of your nodes in a flux instance? It might be to create a directory, or otherwise interact with a file. This can be hugely useful in environments where you don't have a shared filesystem, for example. This is a job for flux exec! Here is a toy example to execute the command to every rank (`-r all`) to print." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "df8e5a5d-76aa-4151-a25f-4d8f3aa4a738", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello from a flux rank!\n", + "Hello from a flux rank!\n", + "Hello from a flux rank!\n", + "Hello from a flux rank!\n" + ] + } + ], + "source": [ + "!flux exec -r all echo \"Hello from a flux rank!\"" + ] + }, + { + "cell_type": "markdown", + "id": "768c05fe-461e-4f88-bb3d-c74f9d8bc217", + "metadata": {}, + "source": [ + "You can also use `-x` to exclude ranks. For example, we often do custom actions on the main or \"leader\" rank, and just want to issue commands to the workers." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "3a9f7e0d-edf4-459e-93ac-463ce0635e2a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello from everyone except the lead (0) rank!\n", + "Hello from everyone except the lead (0) rank!\n", + "Hello from everyone except the lead (0) rank!\n" + ] + } + ], + "source": [ + "! flux exec -r all -x 0 echo \"Hello from everyone except the lead (0) rank!\"" + ] + }, + { + "cell_type": "markdown", + "id": "05404084-55df-4067-9512-e4ef16ca272e", + "metadata": {}, + "source": [ + "Here is a similar example, but asking to execute only on rank 2, and to have it print the rank." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e9507c7b-de5c-4129-9a99-c943614a9ba2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2\n" + ] + } + ], + "source": [ + "!flux exec -r 2 flux getattr rank " + ] + }, + { + "cell_type": "markdown", + "id": "9ccb6f4d-cbff-4f0a-98b1-59d5a99ee58f", + "metadata": {}, + "source": [ + "And of course, we could do the same to print for all ranks! This is a derivative of the first example we showed you." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "6a9de119-abc4-4917-a339-2010ccc7b9b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "3\n", + "2\n", + "1\n" + ] + } + ], + "source": [ + "!flux exec flux getattr rank" + ] + }, + { + "cell_type": "markdown", + "id": "b2676cbc-e883-4d72-a719-67bc46182270", + "metadata": {}, + "source": [ + "You can imagine that `flux exec` is hugely useful in the context of batch jobs, and specific use cases with files, such as using `flux archive`, discussed next." + ] + }, + { + "cell_type": "markdown", + "id": "be923293-6fa1-4a4e-a3b4-8d462d021919", + "metadata": {}, + "source": [ + "## flux archive 📚️\n", + "\n", + "
\n", + "Description: Creating file and content archives to access later and between ranks\n", + "
\n", + "\n", + "As Flux is used more in cloud environments, we might find ourselves in a situation where we have a cluster without a shared filesystem. The `flux archive` command helps with this situation. At a high level, `flux archive` allows us to save named pieces of data (e.g., files) to the Flux KVS for later retrieval.\n", + "\n", + "When using `flux archive`, we first have to create an named archive. In the code below, we will create a text file and then save it into an archive using `flux archive`. Note that, for larger files, you can speed up the creation and extraction of archives by using the `--mmap` flag." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3928d581-9815-4f7b-98cb-72d6a804813d", + "metadata": {}, + "outputs": [], + "source": [ + "!echo \"Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️\" > shared-file.txt\n", + "!flux archive create --name myarchive --directory $(pwd) shared-file.txt" + ] + }, + { + "cell_type": "markdown", + "id": "1341da82-b8f0-445c-b335-6a10271994d9", + "metadata": {}, + "source": [ + "When we run this code, we are creating an archive in the leader broker. Now that the archive is created, we will want to extract its contents onto the other nodes of our cluster. To do this, we first need to ensure that the directory that we will extract into exists on those nodes. This can be done using `flux exec`. The `flux exec` command will execute a command on the nodes associated with specified brokers. Let's use `flux exec` to run `mkdir` on all the nodes of our cluster except the leader broker's node." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "2bf40c7b-3ca3-4e4f-b21c-4e843c7562a6", + "metadata": {}, + "outputs": [], + "source": [ + "!flux exec -r all -x 0 mkdir -p $(pwd)" + ] + }, + { + "cell_type": "markdown", + "id": "9913e925-aefc-400e-9ff3-0f541f9c3ed2", + "metadata": {}, + "source": [ + "The flags provided to `flux exec` do the following:\n", + "* `-r all`: run across all brokers in the Flux instance\n", + "* `-x 0`: don't runn on broker 0 (i.e., the leader broker)\n", + "\n", + "Now that the directory has been created on all our nodes, we can extract the archive onto those nodes by combining `flux exec` and `flux archive extract`." + ] + }, + { + "cell_type": "markdown", + "id": "8b35f8a6-869b-4f4f-874a-074919dfcc51", + "metadata": {}, + "source": [ + "Finally, when we're done with the archive, we can remove it with `flux archive remove`." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "38472bab-c7b9-409b-9058-734527898eb7", + "metadata": {}, + "outputs": [], + "source": [ + "!flux archive remove --name myarchive" + ] + }, + { + "cell_type": "markdown", + "id": "76fa64ca-ebde-4c5f-a505-7ca2b0173f98", + "metadata": {}, + "source": [ + "Finally, note that `flux archive` was named `flux filemap` in earlier versions of Flux." + ] + }, + { + "cell_type": "markdown", + "id": "32e110ce-db7a-4066-81f7-7191c1968496", + "metadata": {}, + "source": [ + "## flux uptime\n", + "\n", + "
\n", + "Description: Showing how long a flux instance has been running\n", + "
\n", + "\n", + "Did someone say... [uptime](https://youtu.be/SYRlTISvjww?si=zDlvpWbBljUmZw_Q)? ☝️🕑️🕺️\n", + "\n", + "Flux provides an `uptime` utility to display properties of the Flux instance such as state of the current instance, how long it has been running, its size and if scheduling is disabled or stopped. The output shows how long the instance has been up, the instance owner, the instance depth (depth in the Flux hierarchy), and the size of the instance (number of brokers)." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "095f2ac3-145b-4cda-8350-7c281f2b2b45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 00:03:20 run 5.3h, owner jovyan, depth 0, size 4\n" + ] + } + ], + "source": [ + "!flux uptime" + ] + }, + { + "cell_type": "markdown", + "id": "03e2ae62-3e3b-4c82-a0c7-4c97ff1376d2", + "metadata": {}, + "source": [ + "## flux top \n", + "\n", + "
\n", + "Description: Showing a table of real-time Flux processes\n", + "
\n", + "\n", + "Flux provides a feature-full version of `top` for nested Flux instances and jobs. In the invoke `flux top` to see the \"sleep\" jobs. If they have already completed you can resubmit them. \n", + "\n", + "We recommend not running `flux top` in the notebook as it is not designed to display output from a command that runs continuously.\n", + "\n", + "## flux pstree \n", + "\n", + "
\n", + "Description: Showing a flux process tree (and seeing nesting in instances)\n", + "
\n", + "\n", + "In analogy to `top`, Flux provides `flux pstree`. Try it out in the or here in the notebook.\n", + "\n", + "## flux proxy\n", + "\n", + "
\n", + "Description: Interacting with a job hierarchy\n", + "
\n", + "\n", + "Flux proxy is used to route messages to and from a Flux instance. We can use `flux proxy` to connect to a running Flux instance and then submit more nested jobs inside it. From the run the commands below!\n", + "\n", + "```bash\n", + "# Outputs the JOBID\n", + "flux batch --nslots=2 --cores-per-slot=1 --nodes=2 ./sleep_batch.sh\n", + "\n", + "# Put the JOBID into an environment variable\n", + "JOBID=$(flux job last)\n", + "\n", + "# See the flux process tree\n", + "flux pstree -a\n", + "\n", + "# Connect to the Flux instance corresponding to JOBID above\n", + "flux proxy ${JOBID}\n", + "\n", + "# Note the depth is now 1 and the size is 2: we're one level deeper in a Flux hierarchy and we have only 2 brokers now.\n", + "flux uptime\n", + "\n", + "# This instance has 2 \"nodes\" and 2 cores allocated to it\n", + "flux resource list\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "523e04b6-7427-4c77-8eb0-b5b8998c6224", + "metadata": {}, + "source": [ + "## flux queue\n", + "\n", + "
\n", + "Description: Interacting with and inspecting Flux queues\n", + "
\n", + "\n", + "Flux has a command for controlling the queue within the `job-manager`: `flux queue`. This includes disabling job submission, re-enabling it, waiting for the queue to become idle or empty, and checking the queue status:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "800de4eb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Job submission is disabled: maintenance outage\n", + "Job submission is enabled\n", + "usage: flux-queue [-h] {status,list,enable,disable,start,stop,drain,idle} ...\n", + "\n", + "options:\n", + " -h, --help show this help message and exit\n", + "\n", + "subcommands:\n", + "\n", + " {status,list,enable,disable,start,stop,drain,idle}\n" + ] + } + ], + "source": [ + "!flux queue disable \"maintenance outage\"\n", + "!flux queue enable\n", + "!flux queue -h" + ] + }, + { + "cell_type": "markdown", + "id": "e958b3ce-9220-48ad-8f3e-f76c8d6a800c", + "metadata": {}, + "source": [ + "## flux getattr\n", + "\n", + "
\n", + "Description: Getting attributes about your system and environment\n", + "
\n", + "\n", + "Each Flux instance has a set of attributes that are set at startup that affect the operation of Flux, such as `rank`, `size`, and `local-uri` (the Unix socket usable for communicating with Flux). Many of these attributes can be modified at runtime, such as `log-stderr-level` (1 logs only critical messages to stderr while 7 logs everything, including debug messages). Here is an example set that you might be interested in looking at:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "biblical-generic", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "4\n", + "local:///tmp/flux-iwjuLe/local-0\n", + "broker.boot-method simple\n", + "broker.critical-ranks 0\n", + "broker.mapping [[0,1,4,1]]\n", + "broker.pid 8\n", + "broker.quorum 4\n", + "broker.quorum-timeout 1m\n", + "broker.rc1_path /etc/flux/rc1\n", + "broker.rc3_path /etc/flux/rc3\n", + "broker.starttime 1721501121.61\n", + "conf.shell_initrc /etc/flux/shell/initrc.lua\n", + "conf.shell_pluginpath /usr/lib/flux/shell/plugins\n", + "config.path -\n", + "content.backing-module content-sqlite\n", + "content.hash sha1\n", + "hostlist 8660c254a8e[5,5,5,5]\n", + "instance-level 0\n", + "jobid -\n", + "local-uri local:///tmp/flux-iwjuLe/local-0\n", + "log-critical-level 2\n", + "log-filename -\n", + "log-forward-level 7\n", + "log-level 7\n", + "log-ring-size 1024\n", + "log-stderr-level 3\n", + "log-stderr-mode leader\n", + "parent-kvs-namespace -\n", + "parent-uri -\n", + "rank 0\n", + "rundir /tmp/flux-iwjuLe\n", + "security.owner 1000\n", + "size 4\n", + "statedir -\n", + "tbon.child_rcvhwm 0\n", + "tbon.connect_timeout 30s\n", + "tbon.descendants 3\n", + "tbon.endpoint ipc:///tmp/flux-iwjuLe/tbon-0\n", + "tbon.level 0\n", + "tbon.maxlevel 1\n", + "tbon.parent-endpoint -\n", + "tbon.prefertcp 0\n", + "tbon.tcp_user_timeout 20s\n", + "tbon.topo kary:32\n", + "tbon.torpid_max 30s\n", + "tbon.torpid_min 5s\n", + "tbon.zmq_io_threads 1\n", + "tbon.zmqdebug 0\n", + "version 0.63.0-5-g0ddc3d9e8\n" + ] + } + ], + "source": [ + "!flux getattr rank\n", + "!flux getattr size\n", + "!flux getattr local-uri\n", + "!flux setattr log-stderr-level 3\n", + "!flux lsattr -v" + ] + }, + { + "cell_type": "markdown", + "id": "e78568c0-7b78-4a1c-aa7f-40d7ea43620f", + "metadata": {}, + "source": [ + "## flux module\n", + "\n", + "
\n", + "Description: Managing Flux extension modules\n", + "
\n", + "\n", + "Services within a Flux instance are implemented by modules. To query and manage broker modules, use `flux module`. Modules that we have already directly interacted with in this tutorial include `resource` (via `flux resource`), `job-ingest` (via `flux` and the Python API) `job-list` (via `flux jobs`) and `job-manager` (via `flux queue`). For the most part, services are implemented by modules of the same name. In some circumstances, where multiple implementations for a service exist, a module of a different name implements a given service (e.g., in this instance, `sched-fluxion-qmanager` provides the `sched` service and thus `sched.alloc`, but in another instance `sched-simple` might provide the `sched` service)." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "spatial-maintenance", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Module Idle S Service\n", + "job-exec 2 R \n", + "heartbeat 0 R \n", + "job-list 2 R \n", + "sched-fluxion-qmanager 2 R sched\n", + "content-sqlite 1 R content-backing\n", + "resource 1 R \n", + "job-ingest 2 R \n", + "content 1 R \n", + "job-info 5 R \n", + "kvs-watch 5 R \n", + "sched-fluxion-resource 2 R \n", + "kvs 1 R \n", + "cron idle R \n", + "job-manager 0 R \n", + "barrier idle R \n", + "connector-local 0 R 1000-shell-f3Vw1mYfjD,1000-shell-f3Vw6xW9wD\n" + ] + } + ], + "source": [ + "!flux module list" + ] + }, + { + "cell_type": "markdown", + "id": "a4c7d50b-50d2-4190-a42e-9b13f1f30380", + "metadata": {}, + "source": [ + "See the [Flux Management Notebook](02_flux_framework.ipynb) for a small tutorial of unloading and reloading the Fluxion (flux scheduler) modules." + ] + }, + { + "cell_type": "markdown", + "id": "5fea958f-e12c-4229-b8a6-e40dcfbd0692", + "metadata": {}, + "source": [ + "## flux dmesg\n", + "\n", + "
\n", + "Description: Viewing Flux system messages\n", + "
\n", + "\n", + "\n", + "If you need some additional help debugging your Flux setup, you might be interested in `flux dmesg`, which is akin to the [Linux dmesg](https://man7.org/linux/man-pages/man1/dmesg.1.html) but delivers messages for Flux." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "c34899ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-07-20T22:56:18.760174Z\u001b[0m \u001b[33mbroker.debug[0]\u001b[0m: \u001b[34mrmmod sched-simple\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.760532Z\u001b[0m \u001b[33mbroker.debug[0]\u001b[0m: \u001b[34mmodule sched-simple exited\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.760597Z\u001b[0m \u001b[33mresource.debug[0]\u001b[0m: \u001b[34maborted 1 resource.acquire(s)\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.760615Z\u001b[0m \u001b[33mjob-manager.debug[0]\u001b[0m: \u001b[34malloc: stop due to disconnect: Success\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.879655Z\u001b[0m \u001b[33mbroker.debug[0]\u001b[0m: \u001b[34minsmod sched-fluxion-resource\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.879925Z\u001b[0m \u001b[33msched-fluxion-resource.info[0]\u001b[0m: version 0.34.0-38-g0fad5268\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.879954Z\u001b[0m \u001b[33msched-fluxion-resource.debug[0]\u001b[0m: \u001b[34mmod_main: resource module starting\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880319Z\u001b[0m \u001b[33msched-fluxion-resource.warning[0]\u001b[0m: \u001b[1mcreate_reader: allowlist unsupported\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880472Z\u001b[0m \u001b[33msched-fluxion-resource.debug[0]\u001b[0m: \u001b[34mresource graph datastore loaded with rv1exec reader\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880477Z\u001b[0m \u001b[33msched-fluxion-resource.info[0]\u001b[0m: populate_resource_db: loaded resources from core's resource.acquire\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880519Z\u001b[0m \u001b[33msched-fluxion-resource.debug[0]\u001b[0m: \u001b[34mresource status changed (rankset=[all] status=DOWN)\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880522Z\u001b[0m \u001b[33msched-fluxion-resource.debug[0]\u001b[0m: \u001b[34mresource status changed (rankset=[0-3] status=UP)\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.880523Z\u001b[0m \u001b[33msched-fluxion-resource.debug[0]\u001b[0m: \u001b[34mmod_main: resource graph database loaded\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998112Z\u001b[0m \u001b[33mbroker.debug[0]\u001b[0m: \u001b[34minsmod sched-fluxion-qmanager\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998336Z\u001b[0m \u001b[33msched-fluxion-qmanager.info[0]\u001b[0m: version 0.34.0-38-g0fad5268\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998452Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34mservice_register\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998472Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34menforced policy (queue=default): fcfs\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998477Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34meffective queue params (queue=default): queue-depth=4\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998478Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34meffective policy params (queue=default): default\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998726Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34mhandshaking with sched-fluxion-resource 
completed\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:18.998919Z\u001b[0m \u001b[33mjob-manager.debug[0]\u001b[0m: \u001b[34mscheduler: hello\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:19.000882Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34mrequeue success (queue=default id=1750450831360)\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:19.001022Z\u001b[0m \u001b[33mjob-manager.debug[0]\u001b[0m: \u001b[34mscheduler: ready unlimited\u001b[0m\n", + "\u001b[32m2024-07-20T22:56:19.001141Z\u001b[0m \u001b[33msched-fluxion-qmanager.debug[0]\u001b[0m: \u001b[34mhandshaking with job-manager completed\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:03.486096Z\u001b[0m \u001b[33mkvs.debug[0]\u001b[0m: \u001b[34maggregated 2 transactions (2 ops)\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:03.487165Z\u001b[0m \u001b[33mkvs.debug[0]\u001b[0m: \u001b[34maggregated 2 transactions (2 ops)\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:03.749285Z\u001b[0m \u001b[33mkvs.debug[0]\u001b[0m: \u001b[34maggregated 3 transactions (3 ops)\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:03.752634Z\u001b[0m \u001b[33mkvs.debug[0]\u001b[0m: \u001b[34maggregated 2 transactions (2 ops)\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:15.324218Z\u001b[0m \u001b[33mjob-exec.debug[0]\u001b[0m: \u001b[34mexec aborted: id=ƒ3VqVSHr7q\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:15.324274Z\u001b[0m \u001b[33mjob-exec.debug[0]\u001b[0m: \u001b[34mexec aborted: id=ƒ3VqtrJWFh\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:15.324296Z\u001b[0m \u001b[33mjob-exec.debug[0]\u001b[0m: \u001b[34mexec aborted: id=ƒ3VqzNXq8B\u001b[0m\n", + "\u001b[32m2024-07-21T00:03:15.324309Z\u001b[0m \u001b[33mjob-exec.debug[0]\u001b[0m: \u001b[34mexec aborted: id=ƒnyvM4Nb\u001b[0m\n" + ] + } + ], + "source": [ + "!flux dmesg" + ] + }, + { + "cell_type": "markdown", + "id": "70e3df1d-32c9-4996-b6f7-2fa85f4c02ad", + "metadata": { + "tags": [] + }, + "source": [ + "### flux start\n", + "\n", + "
\n", + "Description: Interactively starting a set of resources\n", + "
\n", + "\n", + "Sometimes you need to interactively start a set of compute resources. We call this subset a flux instance. You can launch jobs under this instance, akin to how you've done above! In fact, this entire tutorial is started (to give you 4 faux nodes) with a `flux start` command: \n", + "\n", + "```bash\n", + "flux start --test-size=4\n", + "```\n", + "\n", + "A Flux instance may be running as the default resource manager on a cluster, a job in a resource manager such as Slurm, LSF, or Flux itself, or as a test instance launched locally. This is really neat because it means you can launch Flux under other resource managers where it is not installed as the system workload manager. You can also execute \"one off\" commands to it, for example, to see the instance size:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d568de50-f9e0-452f-8364-e52853013d83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4\n" + ] + } + ], + "source": [ + "!flux start --test-size=4 flux getattr size" + ] + }, + { + "cell_type": "markdown", + "id": "e693f2d9-651f-4f58-bf53-62528caa83d9", + "metadata": {}, + "source": [ + "When you run `flux start` without a command, it will give you an interactive shell to the instance. When you provide a command (as we do above) it will run it and exit. This is what happens for the command above! The output indicates the number of brokers started successfully. As soon as we get and print the size, we exit." + ] + }, + { + "cell_type": "markdown", + "id": "997faffc", + "metadata": {}, + "source": [ + "
\n", + "\n", + "# Python Submission API 🐍️\n", + "Flux also provides first-class python bindings which can be used to submit jobs programmatically. \n", + "\n", + "### `flux.job.JobspecV1` to create job specifications\n", + "\n", + "Flux represents work as a standard called the [Jobspec](https://flux-framework.readthedocs.io/projects/flux-rfc/en/latest/spec_25.html). While you could write YAML or JSON, it's much easier to use provided Python functions that take high level metadata (command, resources, etc) to generate them. We can then replicate our previous example of submitting multiple heterogeneous jobs using these Python helpers, and testing that Flux co-schedules them." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "third-comment", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import flux\n", + "from flux.job import JobspecV1\n", + "from flux.job.JobID import JobID" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "selective-uganda", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3Vyv4d99Z\n" + ] + } + ], + "source": [ + "# connect to the running Flux instance\n", + "f = flux.Flux()\n", + "\n", + "# Create the Jobspec from a command to run a python script, and specify resources\n", + "compute_jobreq = JobspecV1.from_command(\n", + " command=[\"./compute.py\", \"120\"], num_tasks=1, num_nodes=1, cores_per_task=1\n", + ")\n", + "\n", + "# This is the \"current working directory\" (cwd)\n", + "compute_jobreq.cwd = os.path.expanduser(\"~/flux-tutorial/flux-workflow-examples/job-submit-api/\")\n", + "\n", + "# When we submit, we get back the job identifier (JobID)\n", + "print(JobID(flux.job.submit(f,compute_jobreq)).f58) # submit and print out the jobid (in f58 format)" + ] + }, + { + "cell_type": "markdown", + "id": "0c4b260f-f08a-46ae-ad66-805911a857a7", + "metadata": {}, + "source": [ + "Once we create the job, when we submit it in Python we get back a job identifier or jobid. 
We can then interact with the Flux handle, a connection to Flux, to get information about that job.\n", + "\n", + "### `flux.job.get_job(handle, jobid)` to get job info" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "ed65cb46-8d8a-41f0-bec1-92b9a89e6db2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🎉️ Hooray, we just submitted ƒ3VyvraktF!\n", + "\n", + "{\"t_depend\": 1721520202.4164655, \"t_run\": 0.0, \"t_cleanup\": 0.0, \"t_inactive\": 0.0, \"duration\": 0.0, \"expiration\": 0.0, \"name\": \"compute.py\", \"cwd\": \"/home/jovyan/flux-tutorial/flux-workflow-examples/job-submit-api/\", \"queue\": \"\", \"project\": \"\", \"bank\": \"\", \"ntasks\": 1, \"ncores\": 1, \"nnodes\": 1, \"priority\": 16, \"ranks\": \"\", \"nodelist\": \"\", \"success\": \"\", \"result\": \"\", \"waitstatus\": \"\", \"id\": 320116914913280, \"t_submit\": 1721520202.4053006, \"t_remaining\": 0.0, \"state\": \"SCHED\", \"username\": \"jovyan\", \"userid\": 1000, \"urgency\": 16, \"runtime\": 0.0, \"status\": \"SCHED\", \"returncode\": \"\", \"dependencies\": [], \"annotations\": {}, \"exception\": {\"occurred\": \"\", \"severity\": \"\", \"type\": \"\", \"note\": \"\"}}\n" + ] + } + ], + "source": [ + "# Let's submit again to retrieve (and save) the job identifier\n", + "fluxjob = flux.job.submit(f, compute_jobreq)\n", + "fluxjobid = JobID(fluxjob.f58)\n", + "print(f\"🎉️ Hooray, we just submitted {fluxjobid}!\\n\")\n", + "\n", + "# Here is how to get your info. The first argument is the flux handle, then the jobid\n", + "jobinfo = flux.job.get_job(f, fluxjobid)\n", + "print(json.dumps(jobinfo))" + ] + }, + { + "cell_type": "markdown", + "id": "197ee252-dfc9-4256-8d45-df40718c5c3f", + "metadata": {}, + "source": [ + "You can now run `flux jobs` to see the jobs that we submit from Python." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5d679897-7054-4f96-b340-7f39245aca89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ƒ3VyvraktF jovyan compute.py F 1 1 0.014s 8660c254a8e5\n", + " ƒ3Vyv4d99Z jovyan compute.py F 1 1 0.020s 8660c254a8e5\n", + " ƒ2YnijmLwy jovyan compute.py F 1 1 0.031s 8660c254a8e5\n", + " ƒ2YiqfxNdm jovyan compute.py F 1 1 0.012s 8660c254a8e5\n", + " ƒ2YYgVHnyV jovyan compute.py F 1 1 0.062s 8660c254a8e5\n", + " ƒ2YYE7Ja9d jovyan compute.py F 1 1 0.048s 8660c254a8e5\n" + ] + } + ], + "source": [ + "!flux jobs -a | grep compute" + ] + }, + { + "cell_type": "markdown", + "id": "d332f9c9", + "metadata": {}, + "source": [ + "Under the hood, the `Jobspec` class is creating a YAML document that ultimately gets serialized as JSON and sent to Flux for ingestion, validation, queueing, scheduling, and eventually execution. We can dump the raw JSON jobspec that is submitted, where we can see the exact resources requested and the task set to be executed on those resources." 
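Before dumping (or submitting) a jobspec, you can also set additional fields on the `Jobspec` object itself, just as we set `cwd` above. Here is a small sketch of a few commonly used attributes (the values are illustrative only, not part of the tutorial's jobs):

```python
import os

# Assumes compute_jobreq was created with JobspecV1.from_command() as in the cells above
compute_jobreq.duration = 60                   # wall-clock limit in seconds (0 means unlimited)
compute_jobreq.environment = dict(os.environ)  # environment to export to the job
compute_jobreq.stdout = "compute.out"          # example filename to capture stdout
```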
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "efa06478", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"resources\": [\n", + " {\n", + " \"type\": \"node\",\n", + " \"count\": 1,\n", + " \"with\": [\n", + " {\n", + " \"type\": \"slot\",\n", + " \"count\": 1,\n", + " \"with\": [\n", + " {\n", + " \"type\": \"core\",\n", + " \"count\": 1\n", + " }\n", + " ],\n", + " \"label\": \"task\"\n", + " }\n", + " ]\n", + " }\n", + " ],\n", + " \"tasks\": [\n", + " {\n", + " \"command\": [\n", + " \"./compute.py\",\n", + " \"120\"\n", + " ],\n", + " \"slot\": \"task\",\n", + " \"count\": {\n", + " \"per_slot\": 1\n", + " }\n", + " }\n", + " ],\n", + " \"attributes\": {\n", + " \"system\": {\n", + " \"duration\": 0,\n", + " \"cwd\": \"/home/jovyan/flux-tutorial/flux-workflow-examples/job-submit-api/\"\n", + " }\n", + " },\n", + " \"version\": 1\n", + "}\n" + ] + } + ], + "source": [ + "print(compute_jobreq.dumps(indent=2))" + ] + }, + { + "cell_type": "markdown", + "id": "a8051640", + "metadata": {}, + "source": [ + "### `FluxExecutor` for bulk submission\n", + "\n", + "We can use the FluxExecutor class to submit large numbers of jobs to Flux. This method uses python's `concurrent.futures` interface. Here is an example snippet from [flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py](flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py)." + ] + }, + { + "cell_type": "markdown", + "id": "binary-trace", + "metadata": {}, + "source": [ + "``` python \n", + "with FluxExecutor() as executor:\n", + " compute_jobspec = JobspecV1.from_command(args.command)\n", + " futures = [executor.submit(compute_jobspec) for _ in range(args.njobs)]\n", + " # wait for the jobid for each job, as a proxy for the job being submitted\n", + " for fut in futures:\n", + " fut.jobid()\n", + " # all jobs submitted - print timings\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "cleared-lawsuit", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bulksubmit_executor: submitted 200 jobs in 0.16s. 1246.61job/s\n", + "bulksubmit_executor: First job finished in about 0.172s\n", + "|██████████████████████████████████████████████████████████| 100.0% (298.2 job/s)\n", + "bulksubmit_executor: Ran 200 jobs in 0.8s. 249.6 job/s\n" + ] + } + ], + "source": [ + "# Submit the FluxExecutor based script.\n", + "%run ./flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py -n200 /bin/sleep 0" + ] + }, + { + "cell_type": "markdown", + "id": "e5e39506-7f89-4be2-880e-fc21cfe33548", + "metadata": {}, + "source": [ + "### `flux.event_watch` to watch events\n", + "\n", + "If you want to get the output of a job (or more generally, stream events) you can do that as follows. 
Let's submit a quick job, and then look at the output.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "b24124f8-0faf-4e99-83a5-bd983300fda6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1721520256.00717: header {'version': 1, 'encoding': {'stdout': 'UTF-8', 'stderr': 'UTF-8'}, 'count': {'stdout': 1, 'stderr': 1}, 'options': {}}\n", + "1721520256.01083: data {'stream': 'stderr', 'rank': '0', 'eof': True}\n", + "1721520256.01085: data {'stream': 'stdout', 'rank': '0', 'data': 'Flux Plumbing 💩️🚽️\\n'}\n", + "1721520256.01087: data {'stream': 'stdout', 'rank': '0', 'eof': True}\n" + ] + } + ], + "source": [ + "# Create the Jobspec from a command to run a python script, and specify resources\n", + "jobspec = JobspecV1.from_command(\n", + " command=[\"echo\", \"Flux Plumbing 💩️🚽️\"], num_tasks=1, num_nodes=1, cores_per_task=1)\n", + "jobid = flux.job.submit(f, jobspec)\n", + "\n", + "# Give some time to run and finish\n", + "import time\n", + "time.sleep(5)\n", + "\n", + "for line in flux.job.event_watch(f, jobid, \"guest.output\"):\n", + " print(line)" + ] + }, + { + "cell_type": "markdown", + "id": "432a6b44-4a37-4b75-9035-ade107def5de", + "metadata": {}, + "source": [ + "### `flux.job.job_list` to list jobs\n", + "\n", + "Finally, it can be really helpful to get an entire listing of jobs. You can do that as follows. Note that the `job_list` is creating a remote procedure call (rpc) and we call `get` to retrieve the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0d109d8-8586-4b91-bbfc-89e523199707", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "flux.job.job_list(f).get()" + ] + }, + { + "cell_type": "markdown", + "id": "c9c3e767-0459-4218-a8cf-0f98bd32d6bf", + "metadata": {}, + "source": [ + "# This concludes Chapter 1! 📗️\n", + "\n", + "In this module, we covered:\n", + "1. Submitting jobs with Flux\n", + "2. The Flux Hierarchy\n", + "3. Flux Process and Job Utilities\n", + "4. Python Submission API\n", + "\n", + "To continue with the tutorial, open [Chapter 2](./02_flux_framework.ipynb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/02_flux_framework.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/02_flux_framework.ipynb new file mode 100644 index 0000000..1ae33bc --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/02_flux_framework.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "
\n", + "
\n", + "\n", + "# Chapter 2: Flux Plumbing 💩️🚽️\n", + "\n", + "> How to get to Porcelain? You start with Plumbing, of course - \"the toilet vs. the pipes\"\n", + "\n", + "Now that we have learned about basic flux commands, and hierarchical scheduling and its benefits, let's dive deeper into the structure of the individual Flux instances that comprise a hierarchy and talk about some additional \"plumbing\" that helps Flux to run. In this module, we cover:\n", + "1. The structure of Flux instances\n", + "2. Flux modules\n", + "3. Examples `flux kvs` that powers a lot of higher level commands\n", + "4. Advanced job specification interaction with flux job\n", + "\n", + "
\n", + "\n", + "## The structure of Flux instances\n", + "\n", + "As mentioned in [Chapter 2](./01_flux_tutorial.ipynb), a Flux instance is comprised of one or more Flux brokers. A high-level depiction of the design of a Flux broker is shown in the figure below.\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Ian Lumsden for the Flux tutorials
\n", + "
\n", + "\n", + "Each broker is a program built on top of the ∅MQ networking library. The broker contains two main components. First, the broker implements Flux-specific networking abstractions over ∅MQ, such as remote-proceedure call (RPC) and publication-subscription (pub-sub). Second, the broker contains several core services, such as PMI (for MPI support), run control support (for enabling automatic startup of other services), and, most importantly, broker module management. The remainder of a Flux broker's functionality comes from broker modules: specially designed services that the broker can deploy in independent OS threads. Some examples of broker modules provided by Flux include:\n", + "* Job scheduling (both traditional and hierarchical)\n", + "* [Fluxion](https://github.com/flux-framework/flux-sched) (Flux's advanced graph-based scheduler)\n", + "* Banks and accounting (for system-wide deployments of Flux)\n", + "* [PMIx](https://github.com/openpmix/openpmix) (for OpenMPI)\n", + "* An in-memory content store (useful for preloading data into pods on cloud)\n", + "\n", + "When Flux starts, it launches one or more brokers across the resources it manages. By default, Flux will launch one broker per node, but this can be configured (e.g., with the `--test-size` flag to `flux start` shown in [Chapter 1](./01_flux_tutorial.ipynb)). After launching the brokers, Flux will designate one broker as the \"leader\" and the rest as \"followers\". The leader serves as entrypoint into the Flux instance, and it serves as the starting point for most Flux commands. The distribution of brokers and the \"leader-follower\" designations are shown in the following figure:\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Vanessa Sochat for Flux Framework Components documentation
\n", + "
\n", + "\n", + "After launching the brokers and designating a leader, Flux uses the brokers' network abstractions to connect the brokers together into what we call the \"tree-based overlay network\" or TBON for short. This network is shown in the figure below. This overlay network connects brokers together in a pre-defined tree-based topology (e.g., *k*-ary and binomial). Whenever brokers or instances of distributed services running on top of the brokers need to communicate, they can send messages up and down this tree-structured network. This tree-structured network is used over alternative designs (e.g., all-to-all networks used by MPI) because it provides better scalability (by minimizing communication), security, and fault tolerance for a service-focused framework. More information about these benefits and Flux's overall design can be found in our [publications](https://flux-framework.org/publications/) (particularly our [2014 paper on Flux](https://ieeexplore.ieee.org/document/7103433) presented at ICPP).\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Vanessa Sochat for Flux Framework Components documentation
\n", + "
\n", + "\n", + "Flux functionality can be extended with modules, which you might think of like services. For Flux instances, additional services are typically implemented as broker modules that can be deployed across one or more brokers. Once deployed, these services can leverage the other components of the broker, including message routing over the TBON and services provided by other broker modules. As a result, broker modules allow for the creation of composable, easily deployable services for Flux instances." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## Flux Modules\n", + "\n", + "To manage and query modules, Flux provides the `flux module` command. The sub-commands provided by `flux module` can be seen by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Usage: flux-module COMMAND [OPTIONS]\n", + " -h, --help Display this message.\n", + "\n", + "flux module subcommands:\n", + " list List loaded modules\n", + " remove Unload module\n", + " load Load module\n", + " reload Reload module\n", + " stats Display stats on module\n", + " debug Get/set module debug flags\n" + ] + } + ], + "source": [ + "!flux module --help" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some examples of Flux modules include:\n", + "* `job-ingest` (used by Flux submission commands like `flux batch` and `flux run`)\n", + "* `job-list` (used by `flux jobs`)\n", + "* `sched-fluxion-qmanager` (used by `flux tree`)\n", + "* `sched-fluxion-resource` (also used by `flux tree`)\n", + "\n", + "We can see that these services are loaded and available by running the cell below." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Module Idle S Service\n", + "job-exec idle R \n", + "heartbeat 1 R \n", + "job-list idle R \n", + "sched-fluxion-resource idle R \n", + "content-sqlite idle R content-backing\n", + "resource idle R \n", + "job-ingest idle R \n", + "content idle R \n", + "job-info idle R \n", + "sched-fluxion-qmanager idle R sched\n", + "kvs-watch idle R \n", + "kvs idle R \n", + "cron idle R \n", + "job-manager idle R \n", + "barrier idle R \n", + "connector-local 0 R \n" + ] + } + ], + "source": [ + "!flux module list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Users and system administrators can easily load and unload modules using the `flux module load` and `flux module remove` commands. To show this, let's unload Fluxion (Flux's graph-based scheduler) and replace it with the built-in simple scheduler." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Module Idle S Service\n", + "job-exec idle R \n", + "heartbeat 0 R \n", + "job-list idle R \n", + "content-sqlite idle R content-backing\n", + "resource 0 R \n", + "job-ingest idle R \n", + "content 0 R \n", + "job-info idle R \n", + "kvs-watch idle R \n", + "kvs 0 R \n", + "cron idle R \n", + "job-manager 0 R \n", + "sched-simple 0 R sched\n", + "barrier idle R \n", + "connector-local 0 R \n" + ] + } + ], + "source": [ + "!flux module remove sched-fluxion-qmanager\n", + "!flux module remove sched-fluxion-resource\n", + "!flux module load sched-simple\n", + "!flux module list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this code block, we unload the 2 services that comprise Fluxion: `sched-fluxion-qmanager` and `sched-fluxion-resource`. Next, we load the simple scheduler (`sched-simple`), and, finally, we look at the running servicees. 
We now see that Fluxion is not available, and the simple scheduler is. Next, let's reload Fluxion, but, this time, let's pass some extra arguments to specialize our Flux instance. In particular, we will limit the scheduling depth to 4 and populate Fluxion's resource graph with:\n", + "* Nodes\n", + "* Sockets\n", + "* Cores" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Module Idle S Service\n", + "job-exec idle R \n", + "heartbeat 1 R \n", + "job-list idle R \n", + "sched-fluxion-qmanager 0 R sched\n", + "content-sqlite idle R content-backing\n", + "resource 0 R \n", + "job-ingest idle R \n", + "content 0 R \n", + "job-info idle R \n", + "kvs-watch idle R \n", + "sched-fluxion-resource 0 R \n", + "kvs 0 R \n", + "cron idle R \n", + "job-manager 0 R \n", + "barrier idle R \n", + "connector-local 0 R \n" + ] + } + ], + "source": [ + "# Run flux dmesg to make sure sched-simple has no more work before unloading\n", + "!flux dmesg -C\n", + "!flux module remove sched-simple\n", + "!flux module load sched-fluxion-resource load-allowlist=node,socket,core\n", + "!flux module load sched-fluxion-qmanager queue-params=queue-depth=4\n", + "!flux module list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### flux kvs\n", + "\n", + "One of the core services built into Flux is the key-value store (KVS). It is used in many other services, including most of Flux's resource management services, the `flux archive` service below, and DYAD (which we will explore in [Supplementary Chapter 1](./supplementary/dyad/dyad_dlio.ipynb)). These services use the KVS to persistantly store information and retrieve it later (potentially after a restart of Flux).\n", + "\n", + "The `flux kvs` command provides a utility to list and manipulate values of the KVS. As a example of using `flux kvs`, let's use the command to examine information saved by the `resource` service." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!flux kvs ls\n", + "!flux kvs ls resource\n", + "!flux kvs get resource.R | jq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The KVS is such an essential component of Flux that we provide C and Python APIs to interact with it. 
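For instance, here is a minimal sketch of the Python interface (the key name `mydata` is just an example, and we assume the notebook is attached to a running instance):

```python
import flux
from flux import kvs

h = flux.Flux()

# Read a key that the resource service already populated (the same one shown above)
print(kvs.get(h, "resource.R"))

# Store a key of our own, commit the transaction, then read it back
kvs.put(h, "mydata", {"tutorial": "radiuss-2024"})
kvs.commit(h)
print(kvs.get(h, "mydata"))
```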
To learn more about interacting with the KVS from these languages, take a look at these documentation pages:\n", + "* C's `flux_kvs_commit` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_commit.html)\n", + "* C's `flux_kvs_copy` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_copy.html)\n", + "* C's `flux_kvs_getroot` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_getroot.html)\n", + "* C's `flux_kvs_lookup` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_lookup.html)\n", + "* C's `flux_kvs_namespace_create` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_namespace_create.html)\n", + "* C's `flux_kvs_txn_create` [family of functions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man3/flux_kvs_txn_create.html)\n", + "* Python's `flux.kvs` [module](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/python/autogenerated/flux.kvs.html#module-flux.kvs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## flux jobspec generation\n", + "\n", + "Underlying much interaction with jobs is the creation of job specifications. When you use the command line or Python SDK and submit from a command or script, under the hood (back to that plumbing reference) we are creating a job specification \"Jobspec\" that is passed further through Flux. The command `flux submit` makes it possible to provide a similar command, but instead of running it, to generate the jobspec. Let's do that now. We will generate and view a Jobspec for a simple \"hello world\" job. We do that by adding `--dry-run`." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"resources\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m[\n", + " \u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"type\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"slot\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"count\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"with\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m[\n", + " \u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"type\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"core\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"count\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m1\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m]\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"label\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"task\"\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m]\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"tasks\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m[\n", + " \u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"command\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m[\n", + " \u001b[0;32m\"echo\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0;32m\"hello\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0;32m\"potato\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0;32m\"🥔️🍠️\"\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m]\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"slot\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"task\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"count\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"per_slot\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m1\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m]\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"attributes\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"system\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"duration\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m0\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"environment\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"SHELL\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/bin/bash\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_MODULE_PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/lib/flux/modules\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"HOSTNAME\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"8660c254a8e5\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_START_URI\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"local:///tmp/flux-iwjuLe/start\"\u001b[0m\u001b[1;39m,\n", + " 
\u001b[0m\u001b[34;1m\"NB_UID\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"1000\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"PWD\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/home/jovyan\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"JPY_SESSION_NAME\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/home/jovyan/02_flux_framework.ipynb\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"MANPATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/share/man\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_CONNECTOR_PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/lib/flux/connectors\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"_\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/bin/flux\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"HOME\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/home/jovyan\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"LANG\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"C.UTF-8\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FORCE_COLOR\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"1\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"PYDEVD_USE_FRAME_EVAL\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"NO\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"JUPYTER_APP_LAUNCHER_PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/local/share/jupyter/lab/jupyter_app_launcher/\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"CLICOLOR\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"1\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"CLICOLOR_FORCE\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"1\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"JPY_PARENT_PID\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"159\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"PYTHONPATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/lib/flux/python3.10\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"TERM\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"xterm-color\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"GIT_PAGER\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"cat\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"SHLVL\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"2\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"PAGER\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"cat\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_URI\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"local:///tmp/flux-iwjuLe/local-0\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"MPLBACKEND\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"module://matplotlib_inline.backend_inline\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"NB_USER\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"jovyan\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"LUA_CPATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/lib/lua/5.2/?.so;;;\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_EXEC_PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/libexec/flux/cmd\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"FLUX_URI_RESOLVE_LOCAL\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"t\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"LUA_PATH\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/usr/share/lua/5.2/?.lua;;;\"\u001b[0m\u001b[1;39m,\n", + " 
\u001b[0m\u001b[34;1m\"BASE_IMAGE\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"jammy\"\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"cwd\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;32m\"/home/jovyan\"\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"shell\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"options\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"rlimit\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[1;39m{\n", + " \u001b[0m\u001b[34;1m\"cpu\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"fsize\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"data\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"stack\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m8388608\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"core\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"nofile\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m1048576\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"as\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"rss\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"nproc\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m-1\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m\n", + " \u001b[1;39m}\u001b[0m\u001b[1;39m,\n", + " \u001b[0m\u001b[34;1m\"version\"\u001b[0m\u001b[1;39m: \u001b[0m\u001b[0;39m1\u001b[0m\u001b[1;39m\n", + "\u001b[1;39m}\u001b[0m\n" + ] + } + ], + "source": [ + "! flux submit --dry-run echo hello potato 🥔️🍠️ > potato-job.txt\n", + "! cat potato-job.txt | jq" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You'll notice there is a lot of content in there! At this point you could write this to file (as we did, saving to `potato-job.txt`, edit it, and provide it directly to `flux job submit` to run. Let's try that now." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒ3VPB8ZEqV\n", + "hello potato 🥔️🍠️\n" + ] + } + ], + "source": [ + "! flux job submit ./potato-job.txt\n", + "! flux job attach $(flux job last)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# This concludes Chapter 2.\n", + "\n", + "In this module, we covered:\n", + "1. The structure of Flux instances \n", + "2. How to load and unload modules in Flux\n", + "3. An example flux module `flux kvs`\n", + "4. Interacting with job specifications `Jobspec`s\n", + "\n", + "To finish the tutorial, open [Chapter 3](./03_flux_tutorial_conclusions.ipynb)." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/03_flux_tutorial_conclusions.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/03_flux_tutorial_conclusions.ipynb new file mode 100644 index 0000000..e89ec3d --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/03_flux_tutorial_conclusions.ipynb @@ -0,0 +1,93 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "
\n", + "
\n", + "\n", + "# Chapter 3: You Finished!\n", + "# This concludes the Flux tutorial! 😄️\n", + "\n", + "In this tutorial, we:\n", + "* Introduced Flux, and showed you how to get started\n", + "* Showed how to perform traditional batch scheduling with Flux\n", + "* Showed how to perform hierarchical scheduling with Flux\n", + "* Described the structure of Flux instances and Flux modules\n", + "\n", + "If you are ready for advanced content, you can do the [DYAD and DLIO tutorial](./supplementary/dyad/dyad_dlio.ipynb) and learn about:\n", + "* Describing the design of DYAD, a Flux service for runtime data movement\n", + "* Introducing distributed Deep Learning (DL) training\n", + "* Introducing Argonne National Laboratory's Deep Learning I/O (DLIO) benchmark\n", + "* Using DLIO to show how DYAD accelerates distributed DL training\n", + "\n", + "And don't worry, you'll have more opportunities for using Flux! We hope you reach out to us on any of our [project repositories](https://flux-framework.org) and ask any questions that you have. We'd love your contribution to code, documentation, or just saying hello! 👋️ If you have feedback on the tutorial, please let us know so we can improve it for next year. \n", + "\n", + "> But what do I do now?\n", + "\n", + "Feel free to experiment more with Flux here, or (for more freedom) in the terminal. You can try more of the examples in the `flux-workflow-examples` directory in the window to the left. If you're using a shared system like the one on the RADIUSS AWS tutorial please be mindful of other users and don't run compute intensive workloads. If you're running the tutorial in a job on an HPC cluster... compute away! ⚾️\n", + "\n", + "> Where can I learn to set this up on my own?\n", + "\n", + "If you're interested in installing Flux on your cluster, take a look at the [system instance instructions](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/guide/admin.html). If you are interested in running Flux on Kubernetes, check out the [Flux Operator](https://github.com/flux-framework/flux-operator). \n", + "\n", + "> How can I run this tutorial on my own?\n", + "\n", + "All materials for this tutorial (including other versions of the tutorial) can be found in our [Tutorials repo](https://github.com/flux-framework/Tutorials). To run this tutorial on your own, you can clone this repo, enter the directory for the version of the tutorial you want to run, and follow the instructions in that directory's README. 
All versions of this tutorial are designed to either be deployed to cloud (e.g., AWS) or be run locally using Docker.\n", + "\n", + "\n", + "## How can I learn more about Flux?\n", + "\n", + "We've got lots of resources for learning about Flux!\n", + "- [https://flux-framework.org/](https://flux-framework.org/) Flux Framework portal for projects, releases, and publication.\n", + " - [Flux Documentation](https://flux-framework.readthedocs.io/en/latest/).\n", + " - [Flux Framework Cheat Sheet](https://flux-framework.org/cheat-sheet/)\n", + " - [Flux Glossary of Terms](https://flux-framework.readthedocs.io/en/latest/glossary.html)\n", + " - [Flux Comics](https://flux-framework.readthedocs.io/en/latest/comics/fluxonomicon.html) come and meet FluxBird - the pink bird who knows things!\n", + " - [Flux Learning Guide](https://flux-framework.readthedocs.io/en/latest/guides/learning_guide.html) learn about what Flux does, how it works, and real research applications \n", + " - [Getting Started with Flux and Go](https://converged-computing.github.io/flux-go/)\n", + " - [Getting Started with Flux in C](https://converged-computing.github.io/flux-c-examples/) *looking for contributors*\n", + "\n", + "We also have talks and recent publications or work related to Flux in the cloud:\n", + "\n", + " - [Flux Alongside User-Space Kubernetes](https://arxiv.org/abs/2406.06995): A possible future for running Kubernetes in user space on a traditional HPC cluster (with Flux)!\n", + " - [The Flux Operator](https://flux-framework.org/flux-operator/getting_started/user-guide.html): For deploying an entire Flux cluster in seconds in Kubernetes.\n", + " - [Fluence, a scheduler-plugin for Kubernetes](https://github.com/flux-framework/flux-k8s): to schedule pods with Fluxion.\n", + "\n", + "We've also got resources for learning about DYAD!\n", + "* [DYAD's ReadTheDocs page](https://dyad.readthedocs.io/en/latest/)\n", + "* [DYAD's GitHub repository](https://github.com/flux-framework/dyad)\n", + "* [eScience 2022 Short Paper](https://dyad.readthedocs.io/en/latest/_downloads/27090817b034a89b76e5538e148fea9e/ShortPaper_2022_eScience_LLNL.pdf)\n", + "\n", + "And, of course, you can always reach out to us on any of our [project repositories](https://flux-framework.org) and ask any questions that you have. We'd love your contribution to code, documentation, or just saying hello!\n", + "\n", + "![https://flux-framework.org/flux-operator/_static/images/flux-operator.png](https://flux-framework.org/flux-operator/_static/images/flux-operator.png)\n", + "\n", + ">> See you next year! 
👋️😎️" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/Flux-logo.svg b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/Flux-logo.svg new file mode 100644 index 0000000..f2d126b --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/Flux-logo.svg @@ -0,0 +1 @@ +Flux-logo-3 \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-icon.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-icon.png new file mode 100644 index 0000000..d50aa52 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-icon.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.github/workflows/main.yml b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.github/workflows/main.yml new file mode 100644 index 0000000..5d301e8 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.github/workflows/main.yml @@ -0,0 +1,33 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +on: [pull_request] +jobs: + check-pr: + name: check formatting + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 + - run: git fetch origin master + - uses: flux-framework/pr-validator@master + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Lint with flake8 + run: | + pip install flake8 + pip install black + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + black . 
diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.mergify.yml b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.mergify.yml new file mode 100644 index 0000000..65c6341 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/.mergify.yml @@ -0,0 +1,18 @@ +pull_request_rules: + - name: rebase and merge when passing all checks + conditions: + - base=master + - status-success="check formatting (3.6)" + - status-success="check formatting (3.7)" + - status-success="check formatting (3.8)" + - label="merge-when-passing" + - label!="work-in-progress" + - "approved-reviews-by=@flux-framework/core" + - "#approved-reviews-by>0" + - "#changes-requested-reviews-by=0" + - -title~=^\[*[Ww][Ii][Pp] + actions: + merge: + method: merge + strict: smart + strict_method: rebase diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/Makefile b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/Makefile new file mode 100644 index 0000000..f219905 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/Makefile @@ -0,0 +1,25 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile check spelling $(SCHEMA_DIRS) + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +check: spelling + +spelling: + @$(SPHINXBUILD) -W -b spelling "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/README.md new file mode 100644 index 0000000..892bcb0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/README.md @@ -0,0 +1,72 @@ +# Flux Workflow Examples + +This contents used to be hosted at [flux-framework/flux-workflow-examples](https://github.com/flux-framework/flux-workflow-examples) and has been moved here for annual updates paired with the Flux Tutorials. + +The examples contained here demonstrate and explain some simple use-cases with Flux, +and make use of Flux's command-line interface (CLI), Flux's C library, +and the Python and Lua bindings to the C library. + +## Requirements + +The examples assume that you have installed: + +1. A recent version of Flux +2. Python 3.6+ +3. Lua 5.1+ + +You can also use an interactive container locally, binding this directory to the container: + +```bash +docker run -it -v $(pwd):/home/fluxuser/flux-workflow-examples fluxrm/flux-sched:jammy +cd /home/fluxuser/flux-workflow-examples/ +``` + +**_1. [CLI: Job Submission](job-submit-cli)_** + +Launch a flux instance and schedule/launch compute and io-forwarding jobs on +separate nodes using the CLI + +**_2. [Python: Job Submission](job-submit-api)_** + +Schedule/launch compute and io-forwarding jobs on separate nodes using the Python bindings + +**_3. [Python: Job Submit/Wait](job-submit-wait)_** + +Submit jobs and wait for them to complete using the Flux Python bindings + +**_4. 
[Python: Asynchronous Bulk Job Submission](async-bulk-job-submit)_** + +Asynchronously submit jobspec files from a directory and wait for them to complete in any order + +**_5. [Python: Tracking Job Status and Events](job-status-control)_** + +Submit job bundles, get event updates, and wait until all jobs complete + +**_6. [Python: Job Cancellation](job-cancel)_** + +Cancel a running job + +**_7. [Lua: Use Events](synchronize-events)_** + +Use events to synchronize compute and io-forwarding jobs running on separate +nodes + +**_8. [Python: Simple KVS Example](kvs-python-bindings)_** + +Use KVS Python interfaces to store user data into KVS + +**_9. [CLI/Lua: Job Ensemble Submitted with a New Flux Instance](job-ensemble)_** + +Submit job bundles, print live job events, and exit when all jobs are complete + +**_10. [CLI: Hierarchical Launching](hierarchical-launching)_** + +Launch a large number of sleep 0 jobs + +**_11. [C/Lua: Use a Flux Comms Module](comms-module)_** + +Use a Flux Comms Module to communicate with job elements + +**_12. [C/Python: A Data Conduit Strategy](data-conduit)_** + +Attach to a job that receives OS time data from compute jobs diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/README.md new file mode 100644 index 0000000..c612e51 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/README.md @@ -0,0 +1,113 @@ +# Python Asynchronous Bulk Job Submission + +Parts (a) and (b) demonstrate different implementations of the same basic use-case---submitting large numbers of jobs to Flux. For simplicity, in these examples all of the jobs are identical. + +In part (a), we use the `flux.job.submit_async` and `flux.job.wait` functions to submit jobs and wait for them. +In part (b), we use the `FluxExecutor` class, which offers a higher-level interface. It is important to note that +these two different implementations deal with very different kinds of futures. +The executor's futures fulfill in the background and callbacks added to the futures may +be invoked by different threads; the `submit_async` futures do not fulfill in the background, callbacks are always +invoked by the same thread that added them, and sharing the futures among threads is not supported. + +## Setup - Downloading the Files + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/async-bulk-job-submit +``` + +## Part (a) - Using `submit_async` + +### Description: Asynchronously submit jobspec files from a directory and wait for them to complete in any order + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +2. Make a **jobs** directory: + +```bash +mkdir /tmp/jobs +``` + +3. If you are running Slurm, launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +4. Store the jobspec of a `sleep 0` job in the **jobs** directory: + +```bash +flux run --dry-run -n1 sleep 0 > /tmp/jobs/0.json +``` + +5. Copy the jobspec of **job0** 1024 times to create a directory of 1025 `sleep 0` jobs: + +```bash +for i in `seq 1 1024`; do cp /tmp/jobs/0.json /tmp/jobs/${i}.json; done +``` + +6. 
Run the **bulksubmit.py** script and pass all jobspec in the **jobs** directory as an argument with a shell glob `jobs/*.json`: + +```bash +./bulksubmit.py /tmp/jobs/*.json +``` +```console +bulksubmit: Starting... +bulksubmit: submitted 1025 jobs in 0.43s. 2392.93job/s +bulksubmit: First job finished in about 0.521s +|██████████████████████████████████████████████████████████| 100.0% (274.3 job/s) +bulksubmit: Ran 1025 jobs in 3.7s. 274.3 job/s +``` + +### Notes to Part (a) + +- `h = flux.Flux()` creates a new Flux handle which can be used to connect to and interact with a Flux instance. + +- `job_submit_async(h, jobspec.read(), waitable=True).then(submit_cb)` submits a jobspec, returning a future which will be fulfilled when the submission of this job is complete. + +`.then(submit_cb)`, called on the returned future, will cause our callback `submit_cb()` to be invoked when the submission of this job is complete and a jobid is available. To process job submission RPC responses and invoke callbacks, the flux reactor for handle `h` must be run: + +```python +if h.reactor_run() < 0: + h.fatal_error("reactor start failed") +``` + +The reactor will return automatically when there are no more outstanding RPC responses, i.e., all jobs have been submitted. + +- `job.wait(h)` waits for any job submitted with the `FLUX_JOB_WAITABLE` flag to transition to the **INACTIVE** state. + + +### Part (b) - Using FluxExecutor + +#### Description: Asynchronously submit a single command repeatedly + +If continuing from part (a), skip to step 3. + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +2. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. Run the **bulksubmit_executor.py** script and pass the command (`/bin/sleep 0` in this example) and the number of times to run it (default is 100): + +```bash +./bulksubmit_executor.py -n200 /bin/sleep 0 +``` +```console +bulksubmit_executor: submitted 200 jobs in 0.18s. 1087.27job/s +bulksubmit_executor: First job finished in about 0.248s +|██████████████████████████████████████████████████████████| 100.0% (229.8 job/s) +bulksubmit_executor: Ran 200 jobs in 1.0s. 
199.6 job/s +``` diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit.py new file mode 100755 index 0000000..c1a2e9a --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +import time +import sys +import flux + +from flux import job +from flux import constants + +t0 = time.time() +jobs = [] +label = "bulksubmit" + +# open connection to broker +h = flux.Flux() + + +def log(s): + print(label + ": " + s) + + +def progress(fraction, length=72, suffix=""): + fill = int(round(length * fraction)) + bar = "\u2588" * fill + "-" * (length - fill) + s = "\r|{0}| {1:.1f}% {2}".format(bar, 100 * fraction, suffix) + sys.stdout.write(s) + if fraction == 1.0: + sys.stdout.write("\n") + + +def submit_cb(f): + jobs.append(job.submit_get_id(f)) + + +# asynchronously submit jobspec files from a directory +log("Starting...") +for file in sys.argv[1:]: + with open(file) as jobspec: + job.submit_async(h, jobspec.read(), waitable=True).then(submit_cb) + +if h.reactor_run() < 0: + h.fatal_error("reactor start failed") + +total = len(jobs) +dt = time.time() - t0 +jps = len(jobs) / dt +log("submitted {0} jobs in {1:.2f}s. {2:.2f}job/s".format(total, dt, jps)) + +count = 0 +while count < total: + # wait for jobs to complete in any order + job.wait(h) + count = count + 1 + if count == 1: + log("First job finished in about {0:.3f}s".format(time.time() - t0)) + suffix = "({0:.1f} job/s)".format(count / (time.time() - t0)) + progress(count / total, length=58, suffix=suffix) + +dt = time.time() - t0 +log("Ran {0} jobs in {1:.1f}s. 
{2:.1f} job/s".format(total, dt, total / dt)) + +# vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py new file mode 100755 index 0000000..5280863 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/async-bulk-job-submit/bulksubmit_executor.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +import time +import sys +import argparse +import concurrent.futures as cf + +from flux.job import FluxExecutor, JobspecV1 + + +def log(label, s): + print(label + ": " + s) + + +def progress(fraction, length=72, suffix=""): + fill = int(round(length * fraction)) + bar = "\u2588" * fill + "-" * (length - fill) + s = f"\r|{bar}| {100 * fraction:.1f}% {suffix}" + sys.stdout.write(s) + if fraction == 1.0: + sys.stdout.write("\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Submit a command repeatedly using FluxExecutor" + ) + parser.add_argument( + "-n", + "--njobs", + type=int, + metavar="N", + help="Set the total number of jobs to run", + default=100, + ) + parser.add_argument("command", nargs=argparse.REMAINDER) + args = parser.parse_args() + if not args.command: + args.command = ["true"] + t0 = time.perf_counter() + label = "bulksubmit_executor" + with FluxExecutor() as executor: + compute_jobspec = JobspecV1.from_command(args.command) + futures = [executor.submit(compute_jobspec) for _ in range(args.njobs)] + # wait for the jobid for each job, as a proxy for the job being submitted + for fut in futures: + fut.jobid() + # all jobs submitted - print timings + dt = time.perf_counter() - t0 + jps = args.njobs / dt + log(label, f"submitted {args.njobs} jobs in {dt:.2f}s. {jps:.2f}job/s") + # wait for jobs to complete + for i, _ in enumerate(cf.as_completed(futures)): + if i == 0: + log( + label, + f"First job finished in about {time.perf_counter() - t0:.3f}s", + ) + jps = (i + 1) / (time.perf_counter() - t0) + progress((i + 1) / args.njobs, length=58, suffix=f"({jps:.1f} job/s)") + # print time summary + dt = time.perf_counter() - t0 + log(label, f"Ran {args.njobs} jobs in {dt:.1f}s. {args.njobs / dt:.1f} job/s") + + +if __name__ == "__main__": + main() diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/0.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/0.sh new file mode 100755 index 0000000..a11e231 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/0.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "Once upon a time... 
📗️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/1.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/1.sh new file mode 100755 index 0000000..f958b58 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/1.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "There was a little duck 🦆️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/2.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/2.sh new file mode 100755 index 0000000..fe6930b --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/2.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "Her name was pizzaquack 🍕️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/3.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/3.sh new file mode 100755 index 0000000..7ba6b82 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/3.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "She was very fond of cheese 🧀️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/4.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/4.sh new file mode 100755 index 0000000..089c949 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/4.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "And running Flux 🌀️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/5.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/5.sh new file mode 100755 index 0000000..3c00920 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/5.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "And so she ran Flux, while she ate her cheese 😋️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/6.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/6.sh new file mode 100755 index 0000000..9233634 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/bulksubmit/6.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +echo "And was so happy! The end. 
🌈️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/Makefile b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/Makefile new file mode 100644 index 0000000..ccc018d --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/Makefile @@ -0,0 +1,19 @@ +all: capp.so ioapp.so + +FLUX_CORE_LIBS = $(shell pkg-config --libs flux-core) +FLUX_CORE_INCLUDES = $(shell pkg-config --cflags flux-core) + +ioapp.so: ioapp.o + gcc -Wl,--no-undefined --disable-static -shared -export-dynamic $^ -o $@ $(FLUX_CORE_LIBS) + +ioapp.o: app.c + gcc $(FLUX_CORE_INCLUDES) $^ -DIO_SERVICE=1 -fPIC -c -o $@ + +capp.so: capp.o + gcc -Wl,--no-undefined --disable-static -shared -export-dynamic $^ -o $@ $(FLUX_CORE_LIBS) + +capp.o: app.c + gcc $(FLUX_CORE_INCLUDES) $^ -DCOMP_SERVICE=1 -fPIC -c -o $@ + +clean: + rm *.o *.so diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/README.md new file mode 100644 index 0000000..6f1456c --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/README.md @@ -0,0 +1,93 @@ +# Using a Flux Comms Module + +## Description: Use a Flux comms module to communicate with job elements + +### Setup + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/comms-module +``` + +### Execution + +If you need to get an allocation on Slurm: + +```bash +salloc -N3 -ppdebug +``` + +Point to `flux-core`'s `pkgconfig` directory: + +| Shell | Command | +| ----- | ---------- | +| tcsh | `setenv PKG_CONFIG_PATH /lib/pkgconfig` | +| bash/zsh | `export PKG_CONFIG_PATH='/lib/pkgconfig'` | + +This might look like this in the container: + +```bash +export PKG_CONFIG_PATH=/usr/lib/pkgconfig +``` + +Then build the module (if you don't have permission, copy to /tmp) + +```bash +cp -R ./comms-module /tmp/comms-module +cd /tmp/comms-module +make +``` + +Add the directory of the modules to `FLUX_MODULE_PATH`; if the module was +built in the current dir: + +```bash +flux module load ioapp.so +flux module load capp.so +export FLUX_MODULE_PATH=${FLUX_MODULE_PATH}:$(pwd) +``` + +Now let's try it! If you need to run flux start under Slurm: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +Try running flux with the module on the path. + +```bash +flux run -N 1 -n 2 ./compute.lua 120 +flux run -N 1 -n 2 ./io-forwarding.lua 120 +``` +Notice that the module is loaded (at the bottom): + +```console +Try `flux-module load --help' for more information. 
+Module Idle S Sendq Recvq Service +heartbeat 1 R 0 0 +resource 0 R 0 0 +job-ingest 0 R 0 0 +kvs-watch 0 R 0 0 +sched-fluxion-resource 0 R 0 0 +cron idle R 0 0 +barrier idle R 0 0 +job-exec 0 R 0 0 +job-list idle R 0 0 +kvs 0 R 0 0 +content-sqlite 0 R 0 0 content-backing +job-info 0 R 0 0 +job-manager 0 R 0 0 +sched-fluxion-qmanager 0 R 0 0 sched +content 0 R 0 0 +connector-local 0 R 0 0 1002-shell-f3Lv2Zd3tj,1002-shell-f3N2WmZB5H +ioapp 83 R 0 0 +Block until we hear go message from the an io forwarder +``` + +If you run them together, they work together: + +```bash +flux submit -N 1 -n 2 ./compute.lua 120 +flux run -N 1 -n 2 ./io-forwarding.lua 120 +``` diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/app.c b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/app.c new file mode 100644 index 0000000..336ecfb --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/app.c @@ -0,0 +1,129 @@ +#include +#include +#include + +#if !defined (IO_SERVICE) && !defined (COMP_SERVICE) +# error "Either IO_SERVICE or COMP_SERVICE macro is needed" +#endif + +struct app_ctx { + flux_t *h; + int count; + flux_msg_handler_t **handlers; +}; + +static void freectx (void *arg) +{ + struct app_ctx *ctx = (struct app_ctx *)arg; + flux_msg_handler_delvec (ctx->handlers); + free (ctx); +} + +static struct app_ctx *getctx (flux_t *h) +{ +#if IO_SERVICE + struct app_ctx *ctx = flux_aux_get (h, "ioapp"); +#elif COMP_SERVICE + struct app_ctx *ctx = flux_aux_get (h, "capp"); +#endif + if (!ctx) { + ctx = malloc (sizeof (*ctx)); + ctx->count = 0; + ctx->handlers = NULL; +#if IO_SERVICE + flux_aux_set (h, "ioapp", ctx, freectx); +#elif COMP_SERVICE + flux_aux_set (h, "capp", ctx, freectx); +#endif + } + return ctx; +} + +#if IO_SERVICE +static void io_request_cb (flux_t *h, flux_msg_handler_t *w, + const flux_msg_t *msg, void *arg) +{ + const char *topic = NULL; + struct app_ctx *ctx = getctx (h); + int data = 0; + + if (flux_request_unpack (msg, &topic, "{s:i}", "data", &data)) + goto error; + ctx->count++; + if (flux_respond_pack (h, msg, "{s:i}", "count", ctx->count) < 0) + flux_log_error (h, "%s", __FUNCTION__); + flux_log (h, LOG_DEBUG, "count: %d", ctx->count); + return; + +error: + flux_log_error (h, "%s", __FUNCTION__); + if (flux_respond (h, msg, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); +} +#endif + +#if COMP_SERVICE +static void comp_request_cb (flux_t *h, flux_msg_handler_t *w, + const flux_msg_t *msg, void *arg) +{ + const char *topic = NULL; + struct app_ctx *ctx = getctx (h); + int data = 0; + + flux_log (h, LOG_INFO, "comp_request_cb:"); + if (flux_request_unpack (msg, &topic, "{s:i}", "data", &data)) + goto error; + + ctx->count++; + if (flux_respond_pack (h, msg, "{s:i}", "count", ctx->count) < 0) + flux_log_error (h, "%s", __FUNCTION__); + return; + +error: + flux_log_error (h, "%s", __FUNCTION__); + if (flux_respond (h, msg, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); +} +#endif + +static struct flux_msg_handler_spec htab[] = { +#if IO_SERVICE + { FLUX_MSGTYPE_REQUEST, "ioapp.io", io_request_cb, 0 }, +#endif + +#if COMP_SERVICE + { FLUX_MSGTYPE_REQUEST, "capp.comp", comp_request_cb, 0 }, +#endif + + FLUX_MSGHANDLER_TABLE_END +}; + + +int mod_main (flux_t *h, int argc, char **argv) +{ + + struct app_ctx *ctx = getctx (h); + if (flux_msg_handler_addvec (h, htab, (void *)h, + &ctx->handlers) < 0) { + flux_log (ctx->h, LOG_ERR, 
"flux_msg_handler_addvec: %s", strerror (errno)); + goto done; + } + + if (flux_reactor_run (flux_get_reactor (h), 0) < 0) { + flux_log (h, LOG_ERR, "flux_reactor_run: %s", strerror (errno)); + goto done; + } + +done: + return 0; +} + +#if IO_SERVICE +MOD_NAME ("ioapp"); +#elif COMP_SERVICE +MOD_NAME ("capp"); +#endif + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/compute.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/compute.lua new file mode 100755 index 0000000..f505f54 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/compute.lua @@ -0,0 +1,62 @@ +#!/usr/bin/env lua + +local f, err = require 'flux' .new () +local amount = tonumber (arg[1]) or 120 +local rank = tonumber (os.getenv('FLUX_TASK_RANK')) or 0 +local frank = tonumber (os.getenv('FLUX_LOCAL_RANKS')) or 0 +io.stdout:setvbuf ("no") + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: compute.lua seconds") + print (" Compute for seconds") + os.exit (1) +end + +-- subscribe app.io.go event +local rc, err = f:subscribe ("app.io.go") +if not rc then + print ("Failed to subscribe an event, %s", err) + os.exit (1) +end + +-- the leader rank of compute job installs app module +if rank == 0 then + os.execute ("flux module load -r " .. 0 .. " capp") + os.execute ("flux module list") +end + +-- wait for an event sent from the leader of io-forwarding job to sync +-- between io job's installing the app module and sending a request later +print ("Block until we hear go message from the an io forwarder") +local rc, err = f:recv_event () +if not rc then + print ("Failed to receive an event, %s", err) + os.exit (1) +end + +if rank == 0 then + local rc, err = f:sendevent ({ data = "please proceed" }, "app.comp.go") + if not rc then error (err) end + print ("Sent a go event") +end + +local resp, err = f:rpc ("ioapp.io", { data = rank }) +if not resp then + if err == "Function not implemented" then + print ("ioapp.io request handler isn't loaded") + else + print (err) + end +else + print ("Count so far: " .. resp.count) +end + +print ("Will compute for " .. amount .. " seconds") +sleep (amount) +f:unsubscribe ("app.io.go") + +-- vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/io-forwarding.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/io-forwarding.lua new file mode 100755 index 0000000..0f9f78f --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/comms-module/io-forwarding.lua @@ -0,0 +1,57 @@ +#!/usr/bin/env lua + +local flux = require 'flux' +local f = flux.new () +local amount = tonumber (arg[1]) or 120 +local rank = tonumber (os.getenv('FLUX_TASK_RANK')) or 0 +local frank = tonumber (os.getenv('FLUX_LOCAL_RANKS')) or 0 +io.stdout:setvbuf ("no") + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: io-forward.lua seconds") + print (" Forward I/O requests for seconds") + os.exit (1) +end + +-- subscribe app.comp.go event +local rc, err = f:subscribe ("app.comp.go") +if not rc then + print ("Failed to subscribe an event, %s", err) + os.exit (1) +end + +if rank == 0 then + os.execute ("flux module load -r " .. 0 .. 
" ioapp") + os.execute ("flux module list") + local rc, err = f:sendevent ({ data = "please proceed" }, "app.io.go") + if not rc then error (err) end + print ("Sent a go event") +end + +-- Wait for an event sent from the leader of compute job to sync +-- between compute job's installing the app module and sending a request later +print ("Block until we hear go message from the a leader compute process") +local rc, err = f:recv_event () +if not rc then + print ("Failed to receive an, %s", err) + os.exit (1) +end + +local resp, err = f:rpc ("capp.comp", { data = rank }) +if not resp then + if err == "Function not implemented" then + print ("capp.comp request handler isn't loaded") + else + print (err) + end +end + +print ("Will forward IO requests for " .. amount .. " seconds") +sleep (amount) +f:unsubscribe ("app.comp.go") + +-- vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/conf.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/conf.py new file mode 100644 index 0000000..75f3e5c --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/conf.py @@ -0,0 +1,83 @@ +############################################################### +# Copyright 2020 Lawrence Livermore National Security, LLC +# (c.f. AUTHORS, NOTICE.LLNS, COPYING) +# +# This file is part of the Flux resource manager framework. +# For details, see https://github.com/flux-framework. +# +# SPDX-License-Identifier: LGPL-3.0 +############################################################### + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Flux' +copyright = '''Copyright 2020 Lawrence Livermore National Security, LLC and Flux developers. + +SPDX-License-Identifier: LGPL-3.0''' +author = 'This page is maintained by the Flux community.' + +# The full version, including alpha/beta/rc tags +release = '0.1.0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.intersphinx', + 'sphinxcontrib.spelling', + 'recommonmark', +] + +# sphinxcontrib.spelling settings +spelling_word_list_filename = [ + 'spell.en.pws' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'README.md'] + +master_doc = 'index' +source_suffix = ['.rst', '.md'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [ +] + +# -- Options for man output ------------------------------------------------- + +man_pages = [ +] diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/Makefile b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/Makefile new file mode 100644 index 0000000..56abc33 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/Makefile @@ -0,0 +1,13 @@ +all: conduit.so + +FLUX_CORE_LIBS = $(shell pkg-config --libs flux-core) +FLUX_CORE_INCLUDES = $(shell pkg-config --cflags flux-core) + +conduit.so: conduit.o + gcc -Wl,--no-undefined --disable-static -shared -export-dynamic $^ -o $@ $(FLUX_CORE_LIBS) + +conduit.o: conduit.c + gcc $(FLUX_CORE_INCLUDES) $^ -fPIC -c -o $@ + +clean: + rm *.o *.so diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/README.md new file mode 100644 index 0000000..3a9a927 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/README.md @@ -0,0 +1,102 @@ +# A Data Conduit Strategy + +**Note that this module script does not compile and needs an update** + + +## Description: Use a data stream to send packets through + +### Setup + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/data-conduit +``` + +### Execution + +If you are using Slurm, allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +Point to `flux-core`'s `pkgconfig` directory: + +| Shell | Command | +| ----- | ---------- | +| tcsh | `setenv PKG_CONFIG_PATH /lib/pkgconfig` | +| bash/zsh | `export PKG_CONFIG_PATH='/lib/pkgconfig'` | + +This might look like this in the container: + +```bash +export PKG_CONFIG_PATH=/usr/lib/pkgconfig +``` + +Then build the module (if you don't have permission, copy to /tmp) + +```bash +cp -R ./data-conduit /tmp/data-conduit +cd /tmp/data-conduit +make +``` + +3. `make` + +4. Add the directory of the modules to `FLUX_MODULE_PATH`, if the module was built in the current directory: + +`export FLUX_MODULE_PATH=${FLUX_MODULE_PATH}:$(pwd)` + +5. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +`srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out` + +6. Submit the **datastore** script: + +`flux submit -N 1 -n 1 ./datastore.py` + +7. Submit and resubmit five **compute** scripts to send time data to **datastore**: + +`flux submit -N 1 -n 1 ./compute.py 1` + +`flux submit -N 1 -n 1 ./compute.py 1` + +`flux submit -N 1 -n 1 ./compute.py 1` + +`flux submit -N 1 -n 1 ./compute.py 1` + +`flux submit -N 1 -n 1 ./compute.py 1` + +8. 
Attach to the **datastore** job to see the data sent by the **compute.py** scripts: + +`flux job attach 1900070043648` + +``` +Starting.... +Module was loaded successfully... +finished initialize... +starting run() +Waiting for a packet +{u'test': 101} +Waiting for a packet +{u'test': 101, u'1578431137': u'os.time'} +Waiting for a packet +{u'test': 101, u'1578431137': u'os.time', u'1578431139': u'os.time'} +Waiting for a packet +{u'test': 101, u'1578431140': u'os.time', u'1578431137': u'os.time', u'1578431139': u'os.time'} +Waiting for a packet +{u'test': 101, u'1578431140': u'os.time', u'1578431137': u'os.time', u'1578431139': u'os.time', u'1578431141': u'os.time'} +Bye bye! +run finished... +``` + +--- + +### Notes + +- `f = flux.Flux()` creates a new Flux handle which can be used to connect to and interact with a Flux instance. + +- `kvs.put()` places the value of _udata_ under the key **"conduit"**. Once the key-value pair is put, the change must be committed with `kvs.commit()`. The value can then be retrieved with `kvs.get()`. + +- `f.rpc()` creates a new RPC object consisting of a specified topic and payload (along with additional flags) that are exchanged with a Flux service. diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/compute.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/compute.py new file mode 100644 index 0000000..d03f871 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/compute.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import argparse +import time +import os +import re +import flux +import json +from flux import kvs +from flux.message import Message + +parser = argparse.ArgumentParser(description="compute for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +f = flux.Flux() +udata = "conduit" +kvs.put(f, "conduit", udata) +kvs.commit(f) + +cr = kvs.get(f, "conduit") +print(cr) + +os_time = int(time.time()) +payload = {str(os_time): "os.time"} +new_payload = {"data": json.dumps(payload)} +print("Sending ", json.dumps(new_payload)) + +# this data is ultimately flowed into the data store +f.rpc("conduit.put", new_payload, 0) + + +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/conduit.c b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/conduit.c new file mode 100644 index 0000000..9e6f446 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/conduit.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct conduit_ctx { + flux_t *h; + struct sockaddr_un server_sockaddr; + struct sockaddr_un client_sockaddr; + int client_sock; + bool connected; + char *sockname; + char *csockname; + flux_msg_handler_t **handlers; +}; + +static void freectx (void *arg) +{ + struct conduit_ctx *ctx = (struct conduit_ctx *)arg; + flux_msg_handler_delvec (ctx->handlers); + free (ctx->sockname); + free (ctx->csockname); + if (ctx->connected) + close (ctx->client_sock); + free (ctx); +} + +static struct conduit_ctx *getctx (flux_t *h) +{ + struct conduit_ctx *ctx = flux_aux_get (h, "conduit"); + if (!ctx) { + char *user = getenv ("USER"); + ctx = malloc (sizeof (*ctx)); + ctx->connected = false; + ctx->handlers = NULL; + asprintf (&(ctx->sockname), 
"/tmp/%s/mysock", user? user : ""); + asprintf (&(ctx->csockname),"/tmp/%s/mycsock", user? user : ""); + flux_aux_set (h, "conduit", ctx, freectx); + } + return ctx; +} + +/* Forward the received JSON string to the datastore.py */ +static int conduit_send (flux_t *h, const char *json_str) +{ + int rc = -1; + int n = 0; + struct conduit_ctx *ctx = getctx (h); + + n = (int) strlen (json_str); + if ((rc = send (ctx->client_sock, (void *)&n, sizeof (n), 0)) == -1) { + flux_log_error (h, "send error %s", __FUNCTION__); + return rc; + } + if ((rc = send (ctx->client_sock, (void *)json_str, n, 0)) == -1) { + flux_log_error (h, "send error %s", __FUNCTION__); + return rc; + } + flux_log (h, LOG_INFO, "conduit_send succeed"); + return 0; +} + +/* request callback called when conduit.put request is invoked */ +static void conduit_put_request_cb (flux_t *h, flux_msg_handler_t *w, + const flux_msg_t *msg, void *arg) +{ + int rc = -1; + const char *topic = NULL; + struct conduit_ctx *ctx = getctx (h); + const char *data = NULL; + + flux_log (h, LOG_INFO, "conduit_put_request_cb:"); + if (ctx->connected == false) { + flux_log (h, LOG_INFO, "conduit not connected"); + errno = ENOTCONN; + goto done; + } + if (flux_request_unpack (msg, &topic, "{s:s}", "data", &data)) { + flux_log_error (h, "%s", __FUNCTION__); + goto done; + } + if (conduit_send (h, data) < 0) + errno = EPROTO; +done: + if (flux_respond (h, msg, errno, NULL) < 0) + flux_log_error (h, "%s: flux_respond", __FUNCTION__); +} + +/* open the Unix domain socket to talk to datastore.py */ +static int conduit_open (flux_t *h) +{ + struct conduit_ctx *ctx = getctx (h); + int rc = -1; + int len = 0; + char buf[256]; + memset(&(ctx->server_sockaddr), 0, sizeof(struct sockaddr_un)); + memset(&(ctx->client_sockaddr), 0, sizeof(struct sockaddr_un)); + + if ((ctx->client_sock = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { + flux_log (h, LOG_ERR, "SOCKET ERROR = %d\n", errno); + goto done; + } + + ctx->client_sockaddr.sun_family = AF_UNIX; + strcpy(ctx->client_sockaddr.sun_path, ctx->csockname); + len = sizeof(ctx->client_sockaddr); + unlink (ctx->csockname); + if ((rc = bind(ctx->client_sock, + (struct sockaddr *)&ctx->client_sockaddr, len)) == -1) { + flux_log (h, LOG_ERR, "BIND ERROR: %d\n", errno); + close(ctx->client_sock); + goto done; + } + flux_log (h, LOG_INFO, "Conduit client socket bound\n"); + + ctx->server_sockaddr.sun_family = AF_UNIX; + strcpy(ctx->server_sockaddr.sun_path, ctx->sockname); + if ((rc = connect(ctx->client_sock, + (struct sockaddr *)&ctx->server_sockaddr, len)) == -1) { + flux_log (h, LOG_ERR, "CONNECT ERROR = %d\n", errno); + close(ctx->client_sock); + goto done; + } + + ctx->connected = true; + flux_log (h, LOG_INFO, "Conduit socket connected\n"); + conduit_send (h, "{\"test\":101}"); + rc = 0; +done: + return rc; +} + + +static struct flux_msg_handler_spec htab[] = { + { FLUX_MSGTYPE_REQUEST, "conduit.put", conduit_put_request_cb, 0 }, + FLUX_MSGHANDLER_TABLE_END +}; + +int mod_main (flux_t *h, int argc, char **argv) +{ + uint32_t rank = 0; + struct conduit_ctx *ctx = getctx (h); + + if (conduit_open (h) < 0) { + flux_log (ctx->h, LOG_ERR, "conduit_open failed"); + goto done; + } + if (flux_get_rank (h, &rank) < 0) { + flux_log (ctx->h, LOG_ERR, "flux_get_rank failed"); + goto done; + } + + /* Put the rank where this module is loaded into conduit key + */ + flux_kvs_txn_t *txn = flux_kvs_txn_create (); + flux_kvs_txn_pack (txn, 0, "conduit", "i", rank); + flux_kvs_commit (h, 0, txn); + flux_kvs_txn_destroy (txn); + if 
(flux_msg_handler_addvec (h, htab, (void *)h, + &ctx->handlers) < 0) { + flux_log (ctx->h, LOG_ERR, "flux_msg_handler_addvec: %s", strerror (errno)); + goto done; + } + if (flux_reactor_run (flux_get_reactor (h), 0) < 0) { + flux_log (h, LOG_ERR, "flux_reactor_run: %s", strerror (errno)); + goto done; + } + +done: + return 0; +} + +MOD_NAME ("conduit"); + +/* + * vi:tabstop=4 shiftwidth=4 expandtab + */ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/datastore.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/datastore.py new file mode 100755 index 0000000..d5fcc48 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/data-conduit/datastore.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import socket +import struct +import json +import sys +import os + +sockdir = os.path.join("/tmp", os.environ["USER"]) +sockname = os.path.join(sockdir, "mysock") + +store = {} +sock = "" + + +def initialize(): + global sock + if not os.path.exists(sockdir): + os.mkdir(sockdir) + if os.path.exists(sockname): + os.remove(sockname) + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.bind(sockname) + sock.listen(1) + cmd = "flux module load ./conduit.so" + os.system(cmd) + + +def run(): + global sock + global store + connection, client_address = sock.accept() + for x in range(5): + print("Waiting for a packet") + mybytes = bytearray(4) + nbytes, address = connection.recvfrom_into(mybytes, 4) + if nbytes == 0: + break + size = ( + mybytes[0] * 1 + + mybytes[1] * 256 + + mybytes[2] * 65536 + + mybytes[3] * 16777216 + ) + data = bytearray(size) + nbytes, address = connection.recvfrom_into(data, size) + dict_blob = json.loads(data.decode("ascii")) + + if dict_blob is not None: + store.update(dict_blob) + print(store) + else: + print("Mallformed data, discarding") + + connection.close() + cmd = "flux module remove conduit" + os.system(cmd) + print("Bye bye!") + + +def main(): + print("Starting....") + initialize() + run() + + +if __name__ == "__main__": + main() diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/README.md new file mode 100644 index 0000000..ff33747 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/README.md @@ -0,0 +1,39 @@ +# Hierarchical Launching + +## Description: Launch an ensemble of sleep 0 tasks + +### Setup + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/hierarchical-launching +``` + +### Execution + +If you need to start flux on a Slurm cluster: + +```bash +salloc -N3 -ppdebug +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +Start the parent instance + +```bash +./parent.sh +``` +```console +Mon Nov 18 15:31:08 PST 2019 +13363018989568 +13365166473216 +13367095853056 +First Level Done +Mon Nov 18 15:34:13 PST 2019 +``` + +### Notes + +- You can increase the number of jobs by increasing `NCORES` in `parent.sh` and +`NJOBS` in `ensemble.sh`. 
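+For reference, the same two-level pattern can be driven from the Python bindings
+instead of the shell loop in `parent.sh`. The sketch below is not part of this
+example; it assumes the tutorial container environment (Flux Python bindings on
+`PYTHONPATH`, `ensemble.sh` in the working directory) and simply submits
+`NCORES` child instances, each running `ensemble.sh`, then waits for them:
+
+```python
+#!/usr/bin/env python3
+# Hypothetical sketch of parent.sh in Python: submit NCORES child Flux
+# instances, each running ensemble.sh, and wait for all of them to finish.
+import os
+from flux.job import JobspecV1, FluxExecutor
+
+NCORES = 3  # mirrors NCORES in parent.sh
+
+with FluxExecutor() as executor:
+    # each submitted job is itself a new Flux instance running ensemble.sh
+    jobspec = JobspecV1.from_command(
+        command=["flux", "start", "./ensemble.sh"],
+        num_tasks=1, num_nodes=1, cores_per_task=1,
+    )
+    jobspec.cwd = os.getcwd()
+    jobspec.environment = dict(os.environ)
+    futures = [executor.submit(jobspec) for _ in range(NCORES)]
+    # leaving the context manager waits for every child instance to complete
+print("First Level Done")
+```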
diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/ensemble.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/ensemble.sh new file mode 100755 index 0000000..efd987b --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/ensemble.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env sh + +NJOBS=750 +MAXTIME=$(expr ${NJOBS} + 2) + +for i in `seq 1 ${NJOBS}`; do + flux submit --nodes=1 --ntasks=1 --cores-per-task=1 sleep 0 +done + +flux jobs +flux queue drain +echo "Final Level Done" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/parent.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/parent.sh new file mode 100755 index 0000000..19d74e3 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/hierarchical-launching/parent.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env sh + +NCORES=3 + +date + +for i in `seq 1 ${NCORES}`; do + flux submit -N 1 -n 1 flux start ./ensemble.sh +done + +flux queue drain +echo "First Level Done" + +date diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/index.rst b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/index.rst new file mode 100644 index 0000000..aa335e9 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/index.rst @@ -0,0 +1,95 @@ +Flux Workflow Examples +---------------------- + +The examples contained here demonstrate and explain some simple use-cases with Flux, +and make use of Flux's command-line interface (CLI), Flux's C library, and the Python and Lua bindings to the C library. +The entire set of examples can be downloaded by cloning the `Github repo `_. + +The examples assume that you have installed: + +#. A recent version of Flux + +#. Python 3.6+ + +#. 
Lua 5.1+ + +:doc:`CLI: Job Submission ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Launch a flux instance and schedule/launch compute and io-forwarding +jobs on separate nodes using the CLI + +:doc:`Python: Job Submission ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Schedule/launch compute and io-forwarding jobs on separate nodes using +the Python bindings + +:doc:`Python: Job Submit/Wait ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Submit jobs and wait for them to complete using the Flux Python bindings + +:doc:`Python: Asynchronous Bulk Job Submission ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Asynchronously submit jobspec files from a directory and wait for them +to complete in any order + +:doc:`Python: Tracking Job Status and Events ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Submit job bundles and wait until all jobs complete + +:doc:`Python: Job Cancellation ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Cancel a running job + +:doc:`Lua: Use Events ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use events to synchronize compute and io-forwarding jobs running on +separate nodes + +:doc:`Python: Simple KVS Example ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use KVS Python interfaces to store user data into KVS + +:doc:`CLI/Lua: Job Ensemble Submitted with a New Flux Instance ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Submit job bundles, print live job events, and exit when all jobs are +complete + +:doc:`CLI: Hierarchical Launching ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Launch a large number of sleep 0 jobs + +:doc:`C/Lua: Use a Flux Comms Module ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use a Flux Comms Module to communicate with job elements + +:doc:`C/Python: A Data Conduit Strategy ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Attach to a job that receives OS time data from compute jobs + +.. 
toctree:: + :hidden: + + job-submit-cli/README + job-submit-api/README + job-submit-wait/README + async-bulk-job-submit/README + job-status-control/README + job-cancel/README + synchronize-events/README + kvs-python-bindings/README + job-ensemble/README + hierarchical-launching/README + comms-module/README + data-conduit/README diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/README.md new file mode 100644 index 0000000..2c9c0cf --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/README.md @@ -0,0 +1,42 @@ +# Job Cancellation + +## Description: Cancel a running job + +### Setup + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/job-cancel +``` + +### Execution + +Launch the submitter script: + +```bash +python3 ./submitter.py $(flux resource list -no {ncores} --state=up) +``` + +```console +Submitted 1st job: 2241905819648 +Submitted 2nd job: 2258951471104 + +First submitted job status (2241905819648) - RUNNING +Second submitted job status (2258951471104) - PENDING + +Canceled first job: 2241905819648 + +First submitted job status (2241905819648) - CANCELED +Second submitted job status (2258951471104) - RUNNING +``` + +### Notes + +- `f = flux.Flux()` creates a new Flux handle which can be used to connect to and interact with a Flux instance. + +- `flux.job.submit(f, sleep_jobspec, waitable=True)` submits a jobspec, returning a job ID that can be used to interact with the submitted job. + +- `flux.job.cancel(f, jobid)` cancels the job. + +- `flux.job.wait_async(f, jobid)` will wait for the job to complete (or in this case, be canceled). It returns a Flux future, which can be used to process the result later. Only jobs submitted with `waitable=True` can be waited for. diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/submitter.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/submitter.py new file mode 100644 index 0000000..b95584f --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-cancel/submitter.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +import time +import argparse + +import flux +from flux.job import JobspecV1 + +f = flux.Flux() + +parser = argparse.ArgumentParser( + description=""" + Description: Submit two 'sleep 60' jobs that take up + all resources on a node. 
+ """ +) +parser.add_argument(dest="cores", help="number of cores on the node") +args = parser.parse_args() + +# submit a sleep job that takes up all resources +sleep_jobspec = JobspecV1.from_command( + ["sleep", "60"], num_tasks=1, cores_per_task=int(args.cores) +) +first_jobid = flux.job.submit(f, sleep_jobspec, waitable=True) +print("Submitted 1st job: %d" % (int(first_jobid))) +time.sleep(1) + +# submit a second sleep job - will be scheduled, but not run +sleep_jobspec = JobspecV1.from_command( + ["sleep", "60"], num_tasks=1, cores_per_task=int(args.cores) +) +second_jobid = flux.job.submit(f, sleep_jobspec, waitable=True) +print("Submitted 2nd job: %d\n" % (int(second_jobid))) +time.sleep(1) + +# get list of JobInfo objects - fetch their ID's and current status +jobs = flux.job.JobList(f, max_entries=2).jobs() +print("First submitted job status (%d) - %s" % (int(jobs[1].id.dec), jobs[1].status)) +print("Second submitted job status (%d) - %s\n" % (int(jobs[0].id.dec), jobs[0].status)) + +# cancel the first job +flux.job.cancel(f, first_jobid) +future = flux.job.wait_async(f, first_jobid).wait_for(5.0) +return_id, success, errmsg = future.get_status() +print("Canceled first job: %d\n" % (int(return_id))) +time.sleep(1) + +# the second job should now run since the first was canceled +jobs = flux.job.JobList(f, max_entries=2).jobs() +print("First submitted job status (%d) - %s" % (int(jobs[1].id.dec), jobs[1].status)) +print("Second submitted job status (%d) - %s" % (int(jobs[0].id.dec), jobs[0].status)) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/README.md new file mode 100644 index 0000000..99361f5 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/README.md @@ -0,0 +1,94 @@ +# Job Ensemble Submitted with a New Flux Instance + +## Description: Launch a flux instance and submit one instance of an io-forwarding job and 50 compute jobs, each spanning the entire set of nodes. + +### Setup + +If you haven't already, download the files and change your working directory: + +``` +$ git clone https://github.com/flux-framework/flux-workflow-examples.git +$ cd flux-workflow-examples/job-ensemble +``` + +### Execution + +If you need a Slurm allocation: + +```bash +salloc -N3 -ppdebug + +# Take a look at the script first +cat ensemble.sh +``` +Here is how to run under Slurm: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out ./ensemble.sh +``` + +Or without: + +```bash +flux start -o,-S,log-filename=out ./ensemble.sh +``` + +``` +JOBID USER NAME STATE NTASKS NNODES RUNTIME +1721426247680 fluxuser compute.lu RUN 4 2 0.122s +1718322462720 fluxuser compute.lu RUN 4 2 0.293s +1715201900544 fluxuser compute.lu RUN 4 2 0.481s +1712299442176 fluxuser compute.lu RUN 4 2 0.626s +1709296320512 fluxuser compute.lu RUN 4 2 0.885s +1706293198848 fluxuser compute.lu RUN 4 2 1.064s +1691378253824 fluxuser io-forward RUN 1 1 1.951s +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Jobid: 1691378253824 +{ + "version": 1, + "execution": { + "R_lite": [ + { + "rank": "0", + "children": { + "core": "0-1" + } + } + ] + } +} +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Jobid: 1694414929920 +{ + "version": 1, + "execution": { + "R_lite": [ + { + "rank": "1-2", + "children": { + "core": "0-3" + } + } + ] + } +} +. +. +. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Jobid: 1721426247680 +{ + "version": 1, + "execution": { + "R_lite": [ + { + "rank": "1-2", + "children": { + "core": "8-11" + } + } + ] + } +} + +``` diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/compute.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/compute.lua new file mode 100755 index 0000000..e5159fd --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/compute.lua @@ -0,0 +1,18 @@ +#!/usr/bin/env lua + +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: compute.lua seconds") + print (" Compute for seconds") + os.exit (1) +end + +print ("Will compute for " .. amount .. " seconds") +sleep (amount) + +-- vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/ensemble.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/ensemble.sh new file mode 100755 index 0000000..9468ec6 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/ensemble.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env sh + +NJOBS=10 +MAXTIME=$(expr ${NJOBS} + 2) +JOBIDS="" + +JOBIDS=$(flux submit --nodes=1 --ntasks=1 --cores-per-task=2 ./io-forwarding.lua ${MAXTIME}) +for i in `seq 1 ${NJOBS}`; do + JOBIDS="${JOBIDS} $(flux submit --nodes=1 --ntasks=1 --cores-per-task=2 ./compute.lua 1)" +done + +flux jobs +flux queue drain + +# print mock-up prevenance data +for i in ${JOBIDS}; do + echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + echo "Jobid: ${i}" + KVSJOBID=$(flux job id --to=kvs ${i}) + flux kvs get ${KVSJOBID}.R | jq +done diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/io-forwarding.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/io-forwarding.lua new file mode 100755 index 0000000..3427b1e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/io-forwarding.lua @@ -0,0 +1,18 @@ +#!/usr/bin/env lua + +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: io-forward.lua seconds") + print (" Forward I/O requests for seconds") + os.exit (1) +end + +print ("Will forward IO requests for " .. amount .. " seconds") +sleep (amount) + +-- vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/kvs-watch-until.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/kvs-watch-until.lua new file mode 100755 index 0000000..16e63ae --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-ensemble/kvs-watch-until.lua @@ -0,0 +1,82 @@ +#!/usr/bin/env lua +-- +-- Exit only if/when all ranks have exited 'unknown' state +-- +local usage = [[ +Usage: kvs-wait-until [OPTIONS] KEY CODE +Watch kvs KEY until Lua code CODE returns true. +(CODE is supplied key value in variable 'v') +If -t, --timeout is provided, and the timeout expires, then +exit with non-zero exit status. 
+ -h, --help Display this message + -v, --verbose Print value on each watch callback + -t, --timeout=T Wait at most T seconds (before exiting +]] + +local getopt = require 'flux.alt_getopt' .get_opts +local timer = require 'flux.timer'.new() +local f = require 'flux' .new() + +local function printf (...) + io.stdout:write (string.format (...)) +end +local function log_err (...) + io.stdout:write (string.format (...)) +end + +local opts, optind = getopt (arg, "hvt:", + { verbose = 'v', + timeout = 't', + help = 'h' + } + ) +if opts.h then print (usage); os.exit (0) end + +local key = arg [optind] +local callback = arg [optind+1] + +if not key or not callback then + log_err ("KVS key and callback code required\n") + print (usage) + os.exit (1) +end + +callback = "return function (v) return "..callback.." end" +local fn, err = loadstring (callback, "callback") +if not fn then + log_err ("code compile error: %s", err) + os.exit (1) +end +local cb = fn () + +local kw, err = f:kvswatcher { + key = key, + handler = function (kw, result) + if opts.v then + printf ("%4.03fs: %s = %s\n", + timer:get0(), + key, tostring (result)) + end + -- Do not pass nil result to callback: + if result == nil then return end + local ok, rv = pcall (cb, result) + if not ok then error (rv) end + if ok and rv then + os.exit (0) + end + end +} + +if opts.t then + local tw, err = f:timer { + timeout = opts.t * 1000, + handler = function (f, to) + log_err ("%4.03fs: Timeout expired!\n", timer:get0()) + os.exit (1) + end + } +end + +timer:set () +f:reactor () +-- vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/README.md new file mode 100644 index 0000000..02ad14c --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/README.md @@ -0,0 +1,71 @@ +# Using Flux Job Status and Control API + +## Description: Submit job bundles, get event updates, and wait until all jobs complete + +### Setup + +If you haven't already, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/job-status-control +``` + +### Execution + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -p pdebug +``` + +2. If needed, launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. 
Run the bookkeeper executable along with the number of jobs to be submitted (if no size is specified, 6 jobs are submitted: 3 instances of **compute.py**, and 3 instances of **io-forwarding,py**): + +```bash +python3 ./bookkeeper.py 2 +``` +```console +bookkeeper: all jobs submitted +bookkeeper: waiting until all jobs complete +job 39040581632 triggered event 'submit' +job 39040581633 triggered event 'submit' +job 39040581632 triggered event 'depend' +job 39040581632 triggered event 'priority' +job 39040581632 triggered event 'alloc' +job 39040581633 triggered event 'depend' +job 39040581633 triggered event 'priority' +job 39040581633 triggered event 'alloc' +job 39040581632 triggered event 'start' +job 39040581633 triggered event 'start' +job 39040581632 triggered event 'finish' +job 39040581633 triggered event 'finish' +job 39040581633 triggered event 'release' +job 39040581633 triggered event 'free' +job 39040581633 triggered event 'clean' +job 39040581632 triggered event 'release' +job 39040581632 triggered event 'free' +job 39040581632 triggered event 'clean' +bookkeeper: all jobs completed +``` + +### Notes + +- The following constructs a job request using the **JobspecV1** class with customizable parameters for how you want to utilize the resources allocated for your job: +```python +compute_jobreq = JobspecV1.from_command( + command=["./compute.py", "10"], num_tasks=4, num_nodes=2, cores_per_task=2 +) +compute_jobreq.cwd = os.getcwd() +compute_jobreq.environment = dict(os.environ) +``` + +- `with FluxExecutor() as executor:` creates a new `FluxExecutor` which can be used to submit jobs, wait for them to complete, and get event updates. Using the executor as a context manager (`with ... as ...:`) ensures it is shut down properly. + +- `executor.submit(compute_jobreq)` returns a `concurrent.futures.Future` subclass which completes when the underlying job is done. The jobid of the underlying job can be fetched with the `.jobid([timeout])` method (which waits until the jobid is ready). + +- Throughout the course of a job, various events will occur to it. `future.add_event_callback(event, event_callback)` adds a callback which will be invoked when the given event occurs. 
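+As a minimal, self-contained illustration of these notes (a sketch only, not part
+of the example; `hostname` is just a stand-in command), the same pattern for a
+single job looks like this:
+
+```python
+from flux.job import JobspecV1, FluxExecutor
+
+def show_event(future, event):
+    # invoked once per event in the job's eventlog (submit, alloc, start, ...)
+    print(f"job {future.jobid()} -> {event.name}")
+
+with FluxExecutor() as executor:
+    future = executor.submit(JobspecV1.from_command(["hostname"]))
+    for event in executor.EVENTS:
+        future.add_event_callback(event, show_event)
+    print("submitted job", future.jobid())  # blocks until the jobid is ready
+# leaving the context manager waits for the job (and its callbacks) to finish
+```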
diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/bookkeeper.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/bookkeeper.py new file mode 100755 index 0000000..a7cef19 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/bookkeeper.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import os +import argparse + +from flux.job import JobspecV1, FluxExecutor + + +def event_callback(future, event): + print(f"job {future.jobid()} triggered event {event.name!r}") + + +# main +def main(): + # set up command-line parser + parser = argparse.ArgumentParser( + description="submit and wait for the completion of " + "N bundles, each consisting of compute " + "and io-forwarding jobs" + ) + parser.add_argument( + "njobs", metavar="N", type=int, help="the number of bundles to submit and wait", + ) + args = parser.parse_args() + # set up jobspecs + compute_jobreq = JobspecV1.from_command( + command=["./compute.py", "10"], num_tasks=6, num_nodes=3, cores_per_task=2 + ) + compute_jobreq.cwd = os.getcwd() + compute_jobreq.environment = dict(os.environ) + io_jobreq = JobspecV1.from_command( + command=["./io-forwarding.py", "10"], num_tasks=3, num_nodes=3, cores_per_task=1 + ) + io_jobreq.cwd = os.getcwd() + io_jobreq.environment = dict(os.environ) + # submit jobs and register event callbacks for all events + with FluxExecutor() as executor: + futures = [executor.submit(compute_jobreq) for _ in range(args.njobs // 2)] + futures.extend( + executor.submit(io_jobreq) for _ in range(args.njobs // 2, args.njobs) + ) + print("bookkeeper: all jobs submitted") + for fut in futures: + # each event can have a different callback + for event in executor.EVENTS: + fut.add_event_callback(event, event_callback) + print("bookkeeper: waiting until all jobs complete") + # exiting the context manager waits for the executor to complete all futures + print("bookkeeper: all jobs completed") + + +main() + +# vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/compute.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/compute.py new file mode 100755 index 0000000..1f860f2 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/compute.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="compute for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will compute for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/io-forwarding.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/io-forwarding.py new file mode 100755 index 0000000..217ed0e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-status-control/io-forwarding.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="forward I/O requests for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will forward I/O requests for " + 
str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/README.md new file mode 100644 index 0000000..cfcd17e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/README.md @@ -0,0 +1,115 @@ +# Job Submit API + +To run the following examples, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/job-submit-api +``` + +## Part(a) - Using a direct job.submit RPC + +### Description: Schedule and launch compute and io-forwarding jobs on separate nodes + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -p pdebug +``` + +2. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. Run the submitter executable: + +```bash +python3 ./submitter.py +``` + +4. List currently running jobs: + +```bash +flux jobs +``` +```console +JOBID USER NAME ST NTASKS NNODES RUNTIME +ƒ5W8gVwm fluxuser io-forward R 1 1 19.15s +ƒ5Vd2kJs fluxuser compute.py R 4 2 19.18s +``` + +5. Information about jobs, such as the submitted job specification, an eventlog, and the resource description format **R** are stored in the KVS. The data can be queried via the `job-info` module via the `flux job info` command. For example, to fetch **R** for a job which has been allocated resources: + +```bash +flux job info ƒ5W8gVwm R +``` +```console +{"version":1,"execution":{"R_lite":[{"rank":"2","children":{"core":"0"}}]}} +``` +```bash +flux job info ƒ5Vd2kJs R +``` +```console +{"version":1,"execution":{"R_lite":[{"rank":"0-1","children":{"core":"0-3"}}]}} +``` + +## Part(b) - Using a direct job.submit RPC + +### Schedule and launch both compute and io-forwarding jobs across all nodes + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -p pdebug +``` + +2. Launch another Flux instance on the current allocation: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. Run the second submitter executable: + +```bash +python3 ./submitter2.py +``` + +4. List currently running jobs: + +```bash +flux jobs +``` +```console +JOBID USER NAME ST NTASKS NNODES RUNTIME +ƒctYadhh fluxuser io-forward R 3 3 3.058s +ƒct1StnT fluxuser compute.py R 6 3 3.086s +``` + +5. Fetch **R** for the jobs that have been allocated resources: + +```bash +flux job info $(flux job last) jobspec +``` +```console +{"version":1,"execution":{"R_lite":[{"rank":"0-2","children":{"core":"0-3"}}]}} +``` +```console +{"resources": [{"type": "node", "count": 3, "with": [{"type": "slot", "count": 1, "with": [{"type": "core", "count": 1}], "label": "task"}]}], "tasks": [{"command": ["./io-forwarding.py", "120"], "slot": "task", "count": {"per_slot": 1}}], "attributes": {"system": {"duration": 0, "cwd": "/home/fluxuser/flux-workflow-examples/job-submit-api"}}, "version": 1} +``` + +### Notes + +- `f = flux.Flux()` creates a new Flux handle which can be used to connect to and interact with a Flux instance. 
+ +- The following constructs a job request using the **JobspecV1** class with customizable parameters for how you want to utilize the resources allocated for your job: +```python +compute_jobreq = JobspecV1.from_command( + command=["./compute.py", "120"], num_tasks=4, num_nodes=2, cores_per_task=2 +) +compute_jobreq.cwd = os.getcwd() +compute_jobreq.environment = dict(os.environ) +``` + +- `flux.job.submit(f, compute_jobreq)` submits the job to be run, and returns a job ID once it begins running. diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/compute.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/compute.py new file mode 100755 index 0000000..1f860f2 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/compute.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="compute for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will compute for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/io-forwarding.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/io-forwarding.py new file mode 100755 index 0000000..217ed0e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/io-forwarding.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="forward I/O requests for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will forward I/O requests for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter.py new file mode 100755 index 0000000..51f2408 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import json +import os +import re +import flux +from flux.job import JobspecV1 + +f = flux.Flux() + +compute_jobreq = JobspecV1.from_command( + command=["./compute.py", "120"], num_tasks=4, num_nodes=2, cores_per_task=2 +) +compute_jobreq.cwd = os.getcwd() +compute_jobreq.environment = dict(os.environ) +print(flux.job.submit(f, compute_jobreq)) + +io_jobreq = JobspecV1.from_command( + command=["./io-forwarding.py", "120"], num_tasks=1, num_nodes=1, cores_per_task=1 +) +io_jobreq.cwd = os.getcwd() +io_jobreq.environment = dict(os.environ) +print(flux.job.submit(f, io_jobreq)) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter2.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter2.py new file mode 100755 index 0000000..670acff --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-api/submitter2.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import json +import os +import re +import flux +from flux.job import JobspecV1 + +f = flux.Flux() + +compute_jobreq = 
JobspecV1.from_command( + command=["./compute.py", "120"], num_tasks=6, num_nodes=3, cores_per_task=2 +) +compute_jobreq.cwd = os.getcwd() +compute_jobreq.environment = dict(os.environ) +print(flux.job.submit(f, compute_jobreq)) + +io_jobreq = JobspecV1.from_command( + command=["./io-forwarding.py", "120"], num_tasks=3, num_nodes=3, cores_per_task=1 +) +io_jobreq.cwd = os.getcwd() +io_jobreq.environment = dict(os.environ) +print(flux.job.submit(f, io_jobreq)) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/README.md new file mode 100644 index 0000000..0a34979 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/README.md @@ -0,0 +1,61 @@ +# Job Submit CLI + +To run the following examples, download the files and change your working directory: + +```console +$ cd flux-workflow-examples/job-submit-cli +``` + +## Example + +### Launch a flux instance and submit compute and io-forwarding jobs + +If you need an allocation: + +```bash +salloc -N3 -ppdebug +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +To submit + +```bash +# if you have more than one node... +flux submit --nodes=2 --ntasks=4 --cores-per-task=2 ./compute.lua 120 + +# and if not! +flux submit --nodes=1 --ntasks=1 --cores-per-task=2 ./io-forwarding.lua 120 +``` + +Attach to watch output: + +```bash +# Control +C then Control+Z to detach +flux job attach $(flux job last) +``` + +List running jobs: + +```bash +flux jobs +``` +```console +JOBID USER NAME ST NTASKS NNODES RUNTIME +ƒ3ETxsR9H fluxuser io-forward R 1 1 2.858s +ƒ38rBqEWT fluxuser compute.lu R 4 2 15.6s +``` + +Get information about job: + +```bash +flux job info $(flux job last) R +flux job info $(flux job last) jobspec +flux job info $(flux job last) eventlog +flux job info $(flux job last) guest.output + +# Example with flux job id +flux job info ƒ3ETxsR9H R +``` +```console +{"version": 1, "execution": {"R_lite": [{"rank": "0", "children": {"core": "5-7"}}], "nodelist": ["674f16a501e5"], "starttime": 1723225494, "expiration": 4876808372}} +``` diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.lua new file mode 100755 index 0000000..4fbccc8 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.lua @@ -0,0 +1,17 @@ +#!/usr/bin/env lua + +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: compute.lua seconds") + print (" Compute for seconds") + os.exit (1) +end + +print ("Will compute for " .. amount .. 
" seconds") +sleep (amount) + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.py new file mode 100755 index 0000000..1f860f2 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/compute.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="compute for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will compute for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.lua new file mode 100755 index 0000000..46ccda0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.lua @@ -0,0 +1,17 @@ +#!/usr/bin/env lua + +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: io-forward.lua seconds") + print (" Forward I/O requests for seconds") + os.exit (1) +end + +print ("Will forward IO requests for " .. amount .. " seconds") +sleep (amount) + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.py new file mode 100755 index 0000000..217ed0e --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-cli/io-forwarding.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="forward I/O requests for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will forward I/O requests for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/README.md new file mode 100644 index 0000000..c4fbd5d --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/README.md @@ -0,0 +1,159 @@ +# Python Job Submit/Wait + +To run the following examples, download the files and change your working directory: + +```bash +$ cd flux-workflow-examples/job-submit-wait +``` + +## Part(a) - Python Job Submit/Wait + +### Description: Submit jobs asynchronously and wait for them to complete in any order + +1. If needed, allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +2. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. 
Submit the **submitter_wait_any.py** script, along with the number of jobs you want to run (if no argument is passed, 10 jobs are submitted): + +```bash +python3 ./submitter_wait_any.py 10 +``` +```console +submit: 46912591450240 compute_jobspec +submit: 46912591450912 compute_jobspec +submit: 46912591451080 compute_jobspec +submit: 46912591363152 compute_jobspec +submit: 46912591362984 compute_jobspec +submit: 46912591451360 bad_jobspec +submit: 46912591451528 bad_jobspec +submit: 46912591451696 bad_jobspec +submit: 46912591451864 bad_jobspec +submit: 46912591452032 bad_jobspec +wait: 46912591451528 Error: job returned exit code 1 +wait: 46912591451864 Error: job returned exit code 1 +wait: 46912591451360 Error: job returned exit code 1 +wait: 46912591451696 Error: job returned exit code 1 +wait: 46912591452032 Error: job returned exit code 1 +wait: 46912591450240 Success +wait: 46912591363152 Success +wait: 46912591450912 Success +wait: 46912591451080 Success +wait: 46912591362984 Success +``` + +## Part(b) - Python Job Submit/Wait (Sliding Window) + +### Description: Asynchronously submit jobs and keep at most a number of those jobs active + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +2. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. Submit the **submitter_sliding_window.py** script, along with the number of jobs you want to run and the size of the window (if no argument is passed, 10 jobs are submitted and the window size is 2 jobs): + +```bash +python3 ./submitter_sliding_window.py 10 3 +``` +```console +submit: 5624175788032 +submit: 5624611995648 +submit: 5625014648832 +wait: 5624175788032 Success +submit: 5804329533440 +wait: 5624611995648 Success +submit: 5804648300544 +wait: 5625014648832 Success +submit: 5805084508160 +wait: 5804329533440 Success +submit: 5986144223232 +wait: 5804648300544 Success +submit: 5986462990336 +wait: 5805084508160 Success +submit: 5986882420736 +wait: 5986144223232 Success +submit: 6164435697664 +wait: 5986462990336 Success +wait: 5986882420736 Success +wait: 6164435697664 Success +``` + + +## Part(c) - Python Job Submit/Wait (Specific Job ID) + +### Description: Asynchronously submit jobs, block/wait for specific jobs to complete + +1. Allocate three nodes from a resource manager: + +```bash +salloc -N3 -ppdebug +``` + +2. Launch a Flux instance on the current allocation by running `flux start` once per node, redirecting log messages to the file `out` in the current directory: + +```bash +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +``` + +3. 
Submit the **submitter_wait_in_order.py** script, along with the number of jobs you want to run (if no argument is passed, 10 jobs are submitted): + +```bash +python3 ./submitter_wait_in_order.py 10 +``` +```console +submit: 46912593818008 compute_jobspec +submit: 46912593818176 compute_jobspec +submit: 46912593818344 compute_jobspec +submit: 46912593818512 compute_jobspec +submit: 46912593738048 compute_jobspec +submit: 46912519873816 bad_jobspec +submit: 46912593818792 bad_jobspec +submit: 46912593818960 bad_jobspec +submit: 46912593819128 bad_jobspec +submit: 46912593819296 bad_jobspec +wait: 46912593818008 Success +wait: 46912593818176 Success +wait: 46912593818344 Success +wait: 46912593818512 Success +wait: 46912593738048 Success +wait: 46912519873816 Error: job returned exit code 1 +wait: 46912593818792 Error: job returned exit code 1 +wait: 46912593818960 Error: job returned exit code 1 +wait: 46912593819128 Error: job returned exit code 1 +wait: 46912593819296 Error: job returned exit code 1 +``` + +### Notes + +- The following constructs a job request using the **JobspecV1** class with customizable parameters for how you want to utilize the resources allocated for your job: + +```python +# create jobspec for compute.py +compute_jobspec = JobspecV1.from_command(command=["./compute.py", "15"], num_tasks=4, num_nodes=2, cores_per_task=2) +compute_jobspec.cwd = os.getcwd() +compute_jobspec.environment = dict(os.environ) +``` + +- Using the executor as a context manager (`with FluxExecutor() as executor`) ensures it shuts down properly. + +- `executor.submit(jobspec)` returns a future which completes when the job is done. + +- `future.exception()` blocks until the future is complete and returns (not raises) an exception if the job was canceled or was otherwise prevented from execution. Otherwise the method returns ``None``. + +- `future.result()` blocks until the future is complete and returns the return code of the job. If the job succeeded, the return code will be 0. 
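+
+Putting those notes together, here is a minimal sketch (not one of the scripts in this directory) that submits a single job with `FluxExecutor` and reports its outcome. It assumes `./compute.py` from this directory is present in the current working directory and that it is run under a Flux instance:
+
+```python
+import os
+
+from flux.job import JobspecV1, FluxExecutor
+
+jobspec = JobspecV1.from_command(
+    command=["./compute.py", "5"], num_tasks=1, num_nodes=1, cores_per_task=1
+)
+jobspec.cwd = os.getcwd()
+jobspec.environment = dict(os.environ)
+
+# The context manager guarantees the executor shuts down cleanly.
+with FluxExecutor() as executor:
+    future = executor.submit(jobspec)
+    # exception() blocks until the job is done and returns (not raises) an
+    # exception if the job could not run; result() returns the job's exit code.
+    if future.exception() is not None:
+        print(f"error: {future.exception()}")
+    elif future.result() == 0:
+        print("success")
+    else:
+        print(f"job exited with code {future.result()}")
+```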
diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/compute.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/compute.py new file mode 100755 index 0000000..1f860f2 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/compute.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + +import argparse +import time + +parser = argparse.ArgumentParser(description="compute for seconds") +parser.add_argument( + "integer", + metavar="S", + type=int, + help="an integer for the number of seconds to compute", +) + +args = parser.parse_args() + +print("Will compute for " + str(args.integer) + " seconds.") +time.sleep(args.integer) diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_sliding_window.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_sliding_window.py new file mode 100755 index 0000000..cfec311 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_sliding_window.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import os +import argparse +import collections +import concurrent.futures as cf + +from flux.job import JobspecV1, FluxExecutor + + +def main(): + # parse command line + parser = argparse.ArgumentParser() + parser.add_argument("njobs", nargs="?", type=int, default=10) + parser.add_argument("window_size", nargs="?", type=int, default=2) + args = parser.parse_args() + print(args) + # create jobspec for compute.py + compute_jobspec = JobspecV1.from_command( + command=["./compute.py", "5"], num_tasks=4, num_nodes=2, cores_per_task=2 + ) + compute_jobspec.cwd = os.getcwd() + compute_jobspec.environment = dict(os.environ) + # create a queue of the jobspecs to submit + jobspec_queue = collections.deque(compute_jobspec for _ in range(args.njobs)) + futures = [] # holds incomplete futures + with FluxExecutor() as executor: + while jobspec_queue or futures: + if len(futures) < args.window_size and jobspec_queue: + fut = executor.submit(jobspec_queue.popleft()) + print(f"submit: {id(fut)}") + futures.append(fut) + else: + done, not_done = cf.wait(futures, return_when=cf.FIRST_COMPLETED) + futures = list(not_done) + for fut in done: + if fut.exception() is not None: + print( + f"wait: {id(fut)} Error: job raised error " + f"{fut.exception()}" + ) + elif fut.result() == 0: + print(f"wait: {id(fut)} Success") + else: + print( + f"wait: {id(fut)} Error: job returned " + f"exit code {fut.result()}" + ) + + +if __name__ == "__main__": + main() + +# vim: tabstop=4 shiftwidth=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_any.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_any.py new file mode 100755 index 0000000..890a1f0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_any.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 + +import os +import argparse +import concurrent.futures + +from flux.job import JobspecV1, FluxExecutor + + +def main(): + # parse command line + parser = argparse.ArgumentParser() + parser.add_argument("njobs", nargs="?", type=int, default=10) + args = parser.parse_args() + # create jobspec for compute.py + compute_jobspec = JobspecV1.from_command( + command=["./compute.py", "10"], num_tasks=4, num_nodes=2, cores_per_task=2 + ) + 
compute_jobspec.cwd = os.getcwd() + compute_jobspec.environment = dict(os.environ) + # create bad jobspec that will fail + bad_jobspec = JobspecV1.from_command(["/bin/false"]) + # create an executor to submit jobs + with FluxExecutor() as executor: + futures = [] + # submit half successful jobs and half failures + for _ in range(args.njobs // 2): + futures.append(executor.submit(compute_jobspec)) + print(f"submit: {id(futures[-1])} compute_jobspec") + for _ in range(args.njobs // 2, args.njobs): + futures.append(executor.submit(bad_jobspec)) + print(f"submit: {id(futures[-1])} bad_jobspec") + for fut in concurrent.futures.as_completed(futures): + if fut.exception() is not None: + print(f"wait: {id(fut)} Error: job raised error {fut.exception()}") + elif fut.result() == 0: + print(f"wait: {id(fut)} Success") + else: + print(f"wait: {id(fut)} Error: job returned exit code {fut.result()}") + + +if __name__ == "__main__": + main() + +# vim: tabstop=4 shiftwidth=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_in_order.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_in_order.py new file mode 100755 index 0000000..cad6491 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-submit-wait/submitter_wait_in_order.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 + +import argparse +import os + +from flux.job import JobspecV1, FluxExecutor + + +def main(): + # parse command line + parser = argparse.ArgumentParser() + parser.add_argument("njobs", nargs="?", type=int, default=10) + args = parser.parse_args() + # create jobspec for compute.py + compute_jobspec = JobspecV1.from_command( + command=["./compute.py", "10"], num_tasks=4, num_nodes=2, cores_per_task=2 + ) + compute_jobspec.cwd = os.getcwd() + compute_jobspec.environment = dict(os.environ) + bad_jobspec = JobspecV1.from_command(["/bin/false"]) + # create an executor to submit jobs + with FluxExecutor() as executor: + futures = [] + # submit half successful jobs and half failures + for _ in range(args.njobs // 2): + futures.append(executor.submit(compute_jobspec)) + print(f"submit: {id(futures[-1])} compute_jobspec") + for _ in range(args.njobs // 2, args.njobs): + futures.append(executor.submit(bad_jobspec)) + print(f"submit: {id(futures[-1])} bad_jobspec") + # wait for each future in turn + for fut in futures: + if fut.exception() is not None: + print(f"wait: {id(fut)} Error: job raised error {fut.exception()}") + elif fut.result() == 0: + print(f"wait: {id(fut)} Success") + else: + print(f"wait: {id(fut)} Error: job returned exit code {fut.result()}") + + +if __name__ == "__main__": + main() + +# vim: tabstop=4 shiftwidth=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-watch/job-watch.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-watch/job-watch.sh new file mode 100755 index 0000000..df379bc --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/job-watch/job-watch.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "25 chocolate chip pancakes on the table... 25 chocolate chip pancakes! 🥞️" +sleep 3 +echo "Eat a stack, for a snack, 15 chocolate chip pancakes on the table! 🥄️" +sleep 3 +echo "15 chocolate chip pancakes on the table... 15 chocolate chip pancakes! 🥞️" +sleep 2 +echo "Throw a stack... it makes a smack! 15 chocolate chip pancakes on the wall! 
🥞️" +sleep 2 +echo "You got some cleaning to do 🧽️" diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/README.md new file mode 100644 index 0000000..5c3aa22 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/README.md @@ -0,0 +1,69 @@ +# KVS Python Binding Example + +## Description: Use the KVS Python interface to store user data into KVS + +If you haven't already, download the files and change your working directory: + +```console +$ cd flux-workflow-examples/kvs-python-bindings +``` + +1. Launch a Flux instance by running `flux start`, redirecting log messages to the file `out` in the current directory: + +```bash +flux start -s 1 -o,-S,log-filename=out +``` + +2. Submit the Python script: + +```bash +flux submit -N 1 -n 1 ./kvsput-usrdata.py +``` +```console +6705031151616 +``` + +3. Attach to the job and view output: + +```bash +flux job attach $(flux job last) +``` +```console +hello world +hello world again +``` + +4. Each job is run within a KVS namespace. `FLUX_KVS_NAMESPACE` is set, which is automatically read and used by the KVS operations in the handle. To take a look at the job's KVS, convert its job ID to KVS: + +```bash +flux job id --to=kvs $(flux job last) +``` +```console +job.0000.0619.2300.0000 +``` + +5. The keys for this job will be put at the root of the namespace, which is mounted under "guest". To get the value stored under the first key "usrdata": + +```bash +flux kvs get job.0000.0619.2300.0000.guest.usrdata +``` +```bash +"hello world" +``` + +6. Get the value stored under the second key "usrdata2": + +```bash +flux kvs get job.0000.0619.2300.0000.guest.usrdata2 +``` +```console +"hello world again" +``` + +### Notes + +- `f = flux.Flux()` creates a new Flux handle which can be used to connect to and interact with a Flux instance. + +- `kvs.put()` places the value of _udata_ under the key **"usrdata"**. Once the key-value pair is put, the change must be committed with `kvs.commit()`. The value can then be retrieved with `kvs.get()` + +- `kvs.get()` on a directory will return a KVSDir object which supports the `with` compound statement. `with` guarantees a commit is called on the directory. diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/kvsput-usrdata.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/kvsput-usrdata.py new file mode 100755 index 0000000..0a5cb77 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/kvs-python-bindings/kvsput-usrdata.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +import sys +import flux +import os +from flux import kvs + +f = flux.Flux() +udata = "hello world" +# using function interface +kvs.put(f, "usrdata", udata) +# commit is required to effect the above put op to the server +kvs.commit(f) +print(kvs.get(f, "usrdata")) + +# get() on a directory will return a KVSDir object which supports +# the "with" compound statement. "with" guarantees a commit is called +# on the directory. 
+with kvs.get(f, ".") as kd: + kd["usrdata2"] = "hello world again" + +print(kvs.get(f, "usrdata2")) + +# vi: ts=4 sw=4 expandtab diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/requirements.txt b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/requirements.txt new file mode 100644 index 0000000..1463f8f --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/requirements.txt @@ -0,0 +1,3 @@ +sphinx-rtd-theme +sphinxcontrib-spelling +recommonmark diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/README.md b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/README.md new file mode 100644 index 0000000..641be09 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/README.md @@ -0,0 +1,54 @@ +# Using Events with Separate Nodes + +## Description: Using events to synchronize compute and io-forwarding jobs running on separate nodes + +If you haven't already, download the files and change your working directory: + +```console +$ cd flux-workflow-examples/synchronize-events +``` + +Ask for a Slurm allocation, if relevant: + +```bash +salloc -N3 -ppdebug +srun --pty --mpi=none -N3 flux start -o,-S,log-filename=out +flux submit --nodes=1 --ntasks=4 --cores-per-task=2 ./compute.lua 120 +``` + +And: + +```bash +flux submit --nodes=1 --ntasks=1 --cores-per-task=2 ./io-forwarding.lua 120 +``` + +5. List running jobs: + +```bash +flux jobs +``` +``` +JOBID USER NAME ST NTASKS NNODES RUNTIME RANKS +ƒA4TgT7d fluxuser io-forward R 1 1 4.376s 2 +ƒ6vEcj7M fluxuser compute.lu R 4 2 11.51s [0-1] +``` + +6. Attach to running or completed job output: + +```bash +flux job attach ƒ6vEcj7M +``` +```console +Block until we hear go message from the an io forwarder +Block until we hear go message from the an io forwarder +Recv an event: please proceed +Recv an event: please proceed +Will compute for 120 seconds +Will compute for 120 seconds +Block until we hear go message from the an io forwarder +Block until we hear go message from the an io forwarder +Recv an event: please proceed +Recv an event: please proceed +Will compute for 120 seconds +Will compute for 120 seconds +``` diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/compute.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/compute.lua new file mode 100755 index 0000000..925be4c --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/compute.lua @@ -0,0 +1,23 @@ +#!/usr/bin/env lua + +local f, err = require 'flux' .new () + +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: compute.lua seconds") + print (" Compute for seconds") + os.exit (1) +end + +print ("Block until we hear go message from the an io forwarder") +f:subscribe ("app.iof.go") +local t, tag = f:recv_event () +print ("Recv an event: " .. t.data ) +print ("Will compute for " .. amount .. 
" seconds") +sleep (amount) + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/io-forwarding.lua b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/io-forwarding.lua new file mode 100755 index 0000000..bad77f8 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/flux-workflow-examples/synchronize-events/io-forwarding.lua @@ -0,0 +1,23 @@ +#!/usr/bin/env lua + +local flux = require 'flux' +local f = flux.new () +local amount = tonumber (arg[1]) or 120 + +local function sleep (n) + os.execute ("sleep " .. n) +end + +if #arg ~= 1 then + print ("Usage: io-forward.lua seconds") + print (" Forward I/O requests for seconds") + os.exit (1) +end + +local rc, err = f:sendevent ({ data = "please proceed" }, "app.iof.go") +if not rc then error (err) end +print ("Sent a go event") + +print ("Will forward IO requests for " .. amount .. " seconds") +sleep (amount) + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/hello-batch.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/hello-batch.sh new file mode 100755 index 0000000..3c39dc0 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/hello-batch.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-1.out echo "Hello job 1 from $(hostname) 💛️" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-2.out echo "Hello job 2 from $(hostname) 💚️" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-3.out echo "Hello job 3 from $(hostname) 💙️" +flux submit --flags=waitable -N1 --output=/tmp/hello-batch-4.out echo "Hello job 4 from $(hostname) 💜️" +# Wait for the jobs to finish +flux job wait --all \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/dl-training-io.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/dl-training-io.png new file mode 100644 index 0000000..6129d0e Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/dl-training-io.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-batch.jpg b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-batch.jpg new file mode 100644 index 0000000..f7282bb Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-batch.jpg differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-broker-design.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-broker-design.png new file mode 100644 index 0000000..267f1a6 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-broker-design.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-pre-tbon.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-pre-tbon.png new file mode 100644 index 0000000..bc40a7e Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-pre-tbon.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-w-tbon.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-w-tbon.png new file mode 100644 index 0000000..93a276e Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-instance-w-tbon.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-tree.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-tree.png new file mode 100644 index 0000000..a0dba82 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/flux-tree.png differ diff --git 
a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/instance-submit.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/instance-submit.png new file mode 100644 index 0000000..84ce558 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/instance-submit.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/scaled-submit.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/scaled-submit.png new file mode 100644 index 0000000..a5dc346 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/scaled-submit.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/single-submit.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/single-submit.png new file mode 100644 index 0000000..0592def Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/img/single-submit.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sleep_batch.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sleep_batch.sh new file mode 100644 index 0000000..58496da --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sleep_batch.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#FLUX: --nodes=2 +#FLUX: --nslots=2 +#FLUX: --cores-per-slot=1 + +echo "Starting my batch job" +echo "Print the resources allocated to this batch job" +flux resource list + +echo "Use sleep to emulate a parallel program" +echo "Run the program at a total of 2 processes each requiring" +echo "1 core. These processes are equally spread across 2 nodes." +flux run -N 2 -n 2 sleep 30 +flux run -N 2 -n 2 sleep 30 + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job1.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job1.sh new file mode 100755 index 0000000..6ec9cf8 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job1.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +flux batch -N1 ./sub_job2.sh +flux queue drain + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job2.sh b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job2.sh new file mode 100755 index 0000000..d947f19 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/sub_job2.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +flux run -N1 sleep 30 + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/configs/workload/dyad_unet3d_demo.yaml b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/configs/workload/dyad_unet3d_demo.yaml new file mode 100644 index 0000000..27641a3 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/configs/workload/dyad_unet3d_demo.yaml @@ -0,0 +1,35 @@ +model: unet3d + +framework: pytorch + +workflow: + generate_data: False + train: True + checkpoint: False + +dataset: + data_folder: data/unet3d/ + format: npz + num_files_train: 16 + num_samples_per_file: 1 + record_length: 4096 + +reader: + data_loader: pytorch + batch_size: 1 + read_threads: 1 + file_shuffle: seed + sample_shuffle: seed + multiprocessing_context: spawn + data_loader_classname: dyad_torch_data_loader.DyadTorchDataLoader + data_loader_sampler: index + +train: + epochs: 1 + computation_time: 1 + +checkpoint: + checkpoint_folder: checkpoints/unet3d + checkpoint_after_epoch: 5 + epochs_between_checkpoints: 2 + model_size: 499153191 \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py new file mode 100644 index 
0000000..21f652a --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dlio_extensions/dyad_torch_data_loader.py @@ -0,0 +1,184 @@ +""" + Copyright (c) 2022, UChicago Argonne, LLC + All Rights Reserved + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" +from time import time +import math +import pickle +import torch +from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler + +from dlio_benchmark.common.constants import MODULE_DATA_LOADER +from dlio_benchmark.common.enumerations import Shuffle, DatasetType, DataLoaderType +from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader +from dlio_benchmark.reader.reader_factory import ReaderFactory +from dlio_benchmark.utils.utility import utcnow, DLIOMPI +from dlio_benchmark.utils.config import ConfigArguments + +from pydyad import Dyad, dyad_open +from pydyad.bindings import DTLMode, DTLCommMode +import numpy as np +import flux +import os + + + +class DYADTorchDataset(Dataset): + """ + Currently, we only support loading one sample per file + TODO: support multiple samples per file + """ + def __init__(self, format_type, dataset_type, epoch, num_samples, num_workers, batch_size): + self.format_type = format_type + self.dataset_type = dataset_type + self.epoch_number = epoch + self.num_samples = num_samples + self.reader = None + self.num_images_read = 0 + self.batch_size = batch_size + args = ConfigArguments.get_instance() + self.serial_args = pickle.dumps(args) + if num_workers == 0: + self.worker_init(-1) + self.broker_per_node = 1 + + def worker_init(self, worker_id): + # Configure PyTorch components + pickle.loads(self.serial_args) + self._args = ConfigArguments.get_instance() + self._args.configure_dlio_logging(is_child=True) + self.reader = ReaderFactory.get_reader(type=self.format_type, + dataset_type=self.dataset_type, + thread_index=worker_id, + epoch_number=self.epoch_number) + # Start initializing DYAD + # Create Dyad object to interact with DYAD's C internals + self.dyad_io = Dyad() + # Create a handle to Flux and get the rank of the broker that this process is running on + self.f = flux.Flux() + self.broker_rank = self.f.get_rank() + # Obtain DYAD's managed directory from the DYAD_PATH environment variable + self.dyad_managed_directory = os.getenv("DYAD_PATH", "") + # Get the DTL mode (UCX or FLUX_RPC) from the DYAD_DTL_MODE environment variable + dtl_str = os.getenv("DYAD_DTL_MODE", "FLUX_RPC") + mode = DTLMode.DYAD_DTL_FLUX_RPC + if dtl_str == "UCX": + mode = DTLMode.DYAD_DTL_UCX + # Initialize DYAD + self.dyad_io.init(debug=self._args.debug, check=False, shared_storage=False, reinit=False, + async_publish=True, fsync_write=False, key_depth=3, + service_mux=self.broker_per_node, + key_bins=1024, kvs_namespace=os.getenv("DYAD_KVS_NAMESPACE"), + prod_managed_path=self.dyad_managed_directory, cons_managed_path=self.dyad_managed_directory, + dtl_mode=mode, dtl_comm_mode=DTLCommMode.DYAD_COMM_RECV) + + def __len__(self): + return self.num_samples + + def __getitem__(self, image_idx): 
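+        # Called by a PyTorch DataLoader worker to fetch one sample for a batch.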
+        # For the requested sample (indicated by image_idx), determine the file
+        # containing the sample and the index of the sample within that file
+        self.num_images_read += 1
+        step = int(math.ceil(self.num_images_read / self.batch_size))
+        filename, sample_index = self._args.global_index_map[image_idx]
+        is_present = False
+        file_obj = None
+        base_fname = filename
+        # Use DYAD's `get_metadata` function to check if the file has already been cached
+        # into DYAD
+        if self.dyad_managed_directory != "":
+            base_fname = os.path.join(self.dyad_managed_directory, os.path.basename(filename))
+            file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False, raw=True)
+            is_present = True
+        # If the file has already been cached in DYAD, use `dyad_open` to open the file.
+        # Then, pass the Python File object returned from `dyad_open` to NumPy to read data.
+        if file_obj:
+            access_mode = "remote"
+            file_node_index = int(file_obj.contents.owner_rank*1.0 / self.broker_per_node)
+            with dyad_open(base_fname, "rb", dyad_ctx=self.dyad_io, metadata_wrapper=file_obj) as f:
+                try:
+                    data = np.load(f, allow_pickle=True)["x"]
+                except:
+                    data = self._args.resized_image
+            self.dyad_io.free_metadata(file_obj)
+        # If the file has not been cached in DYAD, first read the file using a DLIO reader.
+        # Then, write the data into the DYAD managed directory using `dyad_open`.
+        else:
+            data = self.reader.read_index(image_idx, step)
+            if is_present:
+                with dyad_open(base_fname, "wb", dyad_ctx=self.dyad_io) as f:
+                    np.savez(f, x=data)
+
+        return data
+
+class DyadTorchDataLoader(BaseDataLoader):
+    def __init__(self, format_type, dataset_type, epoch_number):
+        super().__init__(format_type, dataset_type, epoch_number, DataLoaderType.PYTORCH)
+
+    def read(self):
+        do_shuffle = True if self._args.sample_shuffle != Shuffle.OFF else False
+        dataset = DYADTorchDataset(self.format_type, self.dataset_type, self.epoch_number, self.num_samples, self._args.read_threads, self.batch_size)
+        if do_shuffle:
+            sampler = RandomSampler(dataset)
+        else:
+            sampler = SequentialSampler(dataset)
+        if self._args.read_threads >= 1:
+            prefetch_factor = math.ceil(self._args.prefetch_size / self._args.read_threads)
+        else:
+            prefetch_factor = self._args.prefetch_size
+        if prefetch_factor > 0:
+            if self._args.my_rank == 0:
+                logging.debug(f"{utcnow()} Prefetch factor of {prefetch_factor} will be set for the Torch DataLoader")
+        else:
+            prefetch_factor = 2
+        if self._args.my_rank == 0:
+            logging.debug(f"{utcnow()} Setup dataloader with {self._args.read_threads} workers {torch.__version__}")
+        if self._args.read_threads==0:
+            kwargs={}
+        else:
+            kwargs={'multiprocessing_context':self._args.multiprocessing_context,
+                    'prefetch_factor': prefetch_factor}
+            if torch.__version__ != '1.3.1':
+                kwargs['persistent_workers'] = True
+        if torch.__version__ == '1.3.1':
+            if 'prefetch_factor' in kwargs:
+                del kwargs['prefetch_factor']
+            self._dataset = DataLoader(dataset,
+                                       batch_size=self.batch_size,
+                                       sampler=sampler,
+                                       num_workers=self._args.read_threads,
+                                       pin_memory=True,
+                                       drop_last=True,
+                                       worker_init_fn=dataset.worker_init,
+                                       **kwargs)
+        else:
+            self._dataset = DataLoader(dataset,
+                                       batch_size=self.batch_size,
+                                       sampler=sampler,
+                                       num_workers=self._args.read_threads,
+                                       pin_memory=True,
+                                       drop_last=True,
+                                       worker_init_fn=dataset.worker_init,
+                                       **kwargs) # 2 is the default value
+
+        # self._dataset.sampler.set_epoch(epoch_number)
+
+    def next(self):
+        super().next()
+        total = self._args.training_steps if self.dataset_type is DatasetType.TRAIN else self._args.eval_steps
+        for batch in self._dataset:
+            yield batch
+
+    def finalize(self):
+        pass
diff --git 
a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb new file mode 100644 index 0000000..8d5f583 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_dlio.ipynb @@ -0,0 +1,518 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dd3e912b-3428-4bc7-88bd-97686406b75a", + "metadata": { + "tags": [] + }, + "source": [ + "# Using DYAD to accelerate distributed Deep Learning (DL) training\n", + "\n", + "Now that we have seen how Flux enables the management and deployment of services, let's look at an example of using DYAD, an advanced Flux service for runtime data movement, in a real world application. Specifically, we will show how DYAD speeds up distributed Deep Learning (DL) training. In this module, we cover these topics:\n", + "1. Design of DYAD\n", + "2. Distributed DL training\n", + "3. Deep Learning I/O (DLIO) benchmark\n", + "4. Accelerating distributed DL training\n", + "\n", + "## Design of DYAD\n", + "\n", + "DYAD provides transparent, locality-aware, write-once, read-many file caching that runs on top of local NVMe and other burst buffer-style technologies (e.g., El Capitan Rabbit nodes). Figure X shows the components of DYAD, including the DYAD service (implemented as a Flux broker module), the DYAD client, and DYAD's data transport layer. DYAD uses the Flux KVS to store metadata about tracked files, and it uses Flux's remote procedure call capabilities to communicate between client and service. DYAD also uses UCX to perform RDMA-based data transfer to move files.\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Ian Lumsden for a poster at SC'23
\n", + "
\n", + "\n", + "DYAD is designed to accelerate large, distributed workloads, such as distributed Deep Learning (DL) training and scientific computing workflows, on HPC systems. It is also designed be transparent, which allows users to leverage DYAD with little to no code refactoring. Unlike similar tools (e.g., DataSpaces and UnifyFS), which tend to optimize for write performance, DYAD aims to provide good write **and read** performance. To optimize read performance, DYAD uses a locality-aware \"Hierarchical Data Locator,\" which prioritizes node-local metadata and data retrieval to minimize the amount of network communications. When moving data from another node, DYAD also uses a streaming RPC over RDMA protocol, which uses preallocated buffers and connection caching to maximize network bandwidth. This process is shown in the figure below:\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Hari Devarajan for a paper submitted to SC'24
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "d32e7976", + "metadata": {}, + "source": [ + "## Distributed DL Training\n", + "\n", + "Distributed DL training is an approach to speed up the training of large Deep Learning models by performing multiple epochs of training in parallel across multiple GPUs and, oftentimes, multiple nodes. This approach is supported by most major DL libraries, such as PyTorch and Tensorflow. In this module, we focus on PyTorch. When running training across multiple nodes and GPUs, PyTorch starts by spawning one process per GPU, called the worker. Each worker performs three major tasks:\n", + "1. Determining which samples from the dataset will comprise the batch for the next epoch of training (i.e., epoch *N+1*)\n", + "2. Reading these samples from the filesystem\n", + "3. Building a batch from these samples and moving the batch to the GPU\n", + "\n", + "To assist with reading the samples from the filesystem, each worker also spawns additional I/O processes. Each of these processes reads data and, optionally, transforms the data based on the configuration of the training pipeline. Figure X shows this process for a single GPU, a single worker, and a single spawned I/O process. In this figure, \"I/O\" indicates data being read from the filesystem, and \"Map\" indicates the optional transformation of data. \"Batch\" indicates the building of a batch from the read samples.\n", + "\n", + "
\n", + "\n", + "
\n", + "Image created by Ian Lumsden based on an image from this article
\n", + "
\n", + "\n", + "One key difference between distributed DL training and many conventional HPC applications (e.g., MPI-based simulations) is the asynchronous loading of data by workers during training. In many conventional HPC applications, data loading and computation are performed one after the one. On the other hand, as shown in Figure X, the loading of data in distributed DL training is asynchronous. In other words, while the GPU is training the DL model for epoch *N*, the worker reading and creating the batch for epoch *N+1*. This asynchronous loading of data can lead to imbalance between data loading and training. For example, Figure X shows a scenario where the data loading takes longer than training, resulting in idle time on the GPU, wasted resources, and, overall, an I/O bound application.\n", + "\n", + "At the end of each epoch of training, all workers and GPUs are synchronized so that the DL models from each GPU can be merged together. This synchronization and merging usually consists of an allreduce-style operation. This synchronization makes the effects of any imbalance between data loading and training more pronounced because, if even one worker and GPU become imbalanced, the performance of the entire distributed training will suffer." + ] + }, + { + "cell_type": "markdown", + "id": "bf6493c2", + "metadata": {}, + "source": [ + "## Deep Learning I/O Benchmark\n", + "\n", + "Due to limited resources and due to this module being about a data movement service (DYAD), we do not need to actually train a DL model in this tutorial. Instead, we accurately show DYAD's benefit to DL training without performing the training itself by using the Argonne National Laboratory's [Deep Learning I/O benchmark](https://github.com/argonne-lcf/dlio_benchmark), or DLIO for short.\n", + "\n", + "DLIO is a benchmark that aims to emulate the I/O behavior of Deep Learning applications. It has an extensible and modular design that allows it to use or mimic aspects (e.g., data formats, worker configuration, data loading-training balanced) of real-world applications. DLIO also has several useful support features, such as the ability to generate data with certain characteristics for users.\n", + "\n", + "To learn more about DLIO, check out the following links:\n", + "* [DLIO Paper](https://ieeexplore.ieee.org/document/9499416)\n", + "* [DLIO Repo](https://github.com/argonne-lcf/dlio_benchmark)" + ] + }, + { + "cell_type": "markdown", + "id": "be8da082", + "metadata": {}, + "source": [ + "## Accelerating DL training\n", + "\n", + "As mentioned in the [Design of DYAD](#design-of-dyad) section, DYAD provides write-once, read-many file caching. This feature is extremely useful in read-heavy workloads, like distributed DL training.\n", + "\n", + "In this section, we show DYAD's benefits to DL training using DLIO. More specifically, we first show an integration of DYAD into PyTorch through custom `Dataset` and `DataLoader` classes. Then, we run DYAD through a configuration of DLIO that mimics the training of a 3D U-Net model. Due to resource restrictions, we only run a small version of the 3D U-Net training pipeline. 
Finally, we show the I/O performance of DYAD compared against Lustre and [UnifyFS](https://ieeexplore.ieee.org/document/10177390) in training a full version of the 3D U-Net model at various scales on LLNL's [Corona](https://hpc.llnl.gov/hardware/compute-platforms/corona) supercomputer.\n", + "\n", + "### Integrating DYAD into PyTorch\n", + "\n", + "When using custom datasets or custom techniques/tools to read a dataset from storage, PyTorch requires the creation of `Dataset` and `DataLoader` classes. To use DYAD in PyTorch-based distributed DL training, we have implemented several of the `DYADTorchDataset` and `DyadTorchDataLoader` classes, which can both be found [here](../dlio_extensions/dyad_torch_data_loader.py). The `DYADTorchDataset` class is used to read samples from \"remote\" storage (if not previously read) or DYAD (if previously read), and it contains all the DYAD-specific code. The `DyadTorchDataLoader` class is a basic `DataLoader` which configures the \"I/O\" and \"Map\" steps of the data loading pipeline.\n", + "\n", + "In the following code cells, we show the DYAD-specific code in `DYADTorchDataset`. As you will see, this code is very similar to standard Python file I/O. As a result, this code serves as an example of DYAD's transparency.\n", + "\n", + "
\n", + "Note: due to several aspects of PyTorch's design (described below), DYAD cannot be used as transparently as normal. Normally, in Python, users would just have to replace the built-in `open` function for DYAD's `dyad_open`. As a result, this use case should be considered the *worst case* for DYAD's transparency.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c92da400", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import inspect\n", + "from pygments import highlight\n", + "from pygments.lexers import PythonLexer\n", + "from pygments.formatters import HtmlFormatter\n", + "from IPython.display import display, HTML\n", + "\n", + "sys.path.insert(0, os.path.abspath(\"../dlio_extensions/\"))\n", + "\n", + "from dyad_torch_data_loader import DYADTorchDataset" + ] + }, + { + "cell_type": "markdown", + "id": "8007ad75", + "metadata": {}, + "source": [ + "This first block of code shows the `DYADTorchDataset.worker_init` function. This function is called to initialize the I/O processes used to read samples. As a result, this function contains two parts: (1) the initialization of PyTorch internals and utilities and (2) the initialization of DYAD.\n", + "\n", + "Normally, DYAD is configured using environment variables, and, as a result, DYAD's initialization can be hidden from users. However, due to PyTorch's complexity and challenges in correctly propagating environment variables through PyTorch's dynamic process spawning, DYAD's transparent, environment variable-based initialization cannot be used in `DYADTorchDataset`. Instead, we manually initialize and configure DYAD using `Dyad.init()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27e463c0", + "metadata": {}, + "outputs": [], + "source": [ + "display(HTML(highlight(inspect.getsource(DYADTorchDataset.worker_init), PythonLexer(), HtmlFormatter(full=True))))" + ] + }, + { + "cell_type": "markdown", + "id": "32a146e6", + "metadata": {}, + "source": [ + "This second block of code shows the `DYADTorchDataset.__getitem__` function. This function is called by `DyadTorchDataLoader` to read individual samples for a batch from disk. With other `Dataset` classes, this function would simply identify the file containing the requested sample and read that sample from remote storage (e.g., Lustre) using Python's built-in `open` function. On the other hand, `DYADTorchDataset` does four things. First, it identifies the file containing the requested sample. Second, it uses DYAD's `get_metadata` function to check if that file has already been cached into DYAD. Third, if the file has already been cached, it will retrieve the sample using DYAD's `dyad_open` function. This function retrieves the sample from a different node, if needed, and then makes that sample available through an interface equivalent to Python's built-in `open` function. Finally, if the file has **not** been cached, it will read the sample from remote storage (e.g., Lustre) and cache the sample into DYAD for more efficient future reading." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab755b0a", + "metadata": {}, + "outputs": [], + "source": [ + "display(HTML(highlight(inspect.getsource(DYADTorchDataset.__getitem__), PythonLexer(), HtmlFormatter(full=True))))" + ] + }, + { + "cell_type": "markdown", + "id": "fefd9ae3", + "metadata": {}, + "source": [ + "### Running DLIO with DYAD for a 3D U-Net model\n", + "\n", + "Now that we have seen how DYAD is integrated into PyTorch, we configure and run DYAD through a configuration of DLIO that mimics the training of a 3D U-Net model." + ] + }, + { + "cell_type": "markdown", + "id": "731d52a3", + "metadata": {}, + "source": [ + "#### Configuring DLIO and DYAD\n", + "\n", + "First, we configure DYAD. 
DYAD requires three settings for configuration:\n", + "1. A namespace in the Flux key-value store, which DYAD will use for metadata management\n", + "2. A \"managed directory,\" which DYAD will use to determine the files that should be tracked\n", + "3. A data transport layer (DTL) mode, which DYAD will use to select the underlying networking library for data transfer " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21abe5ee", + "metadata": {}, + "outputs": [], + "source": [ + "kvs_namespace = \"dyad\"\n", + "managed_directory = \"./dyad_data\"\n", + "dtl_mode = \"UCX\" # We currently only support UCX, so do not change this" + ] + }, + { + "cell_type": "markdown", + "id": "6e32bc27", + "metadata": {}, + "source": [ + "Next, we configure DLIO. DLIO requires several configuration settings. However, for this tutorial, the only one that should be set is the initial data directory, or the directory where the dataset initially resides at the start of training. When running DLIO, the `DYADTorchDataset` class dynamically copies files from this directory into DYAD's managed directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5e3438f", + "metadata": {}, + "outputs": [], + "source": [ + "initial_data_directory = \"./dlio_data\"" + ] + }, + { + "cell_type": "markdown", + "id": "d979369c", + "metadata": {}, + "source": [ + "Finally, we set the remaining configurations for DLIO. These should not be edited because they depend on the directory structure and configuration of this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "92881a8f", + "metadata": {}, + "outputs": [], + "source": [ + "workers_per_node = 1\n", + "dyad_install_prefix = \"/usr/local\"\n", + "num_nodes = 2\n", + "dlio_extensions_dir = \"/home/jovyan/flux-tutorial-2024/dlio_extensions\"\n", + "workload = \"dyad_unet3d_demo\"" + ] + }, + { + "cell_type": "markdown", + "id": "801719eb", + "metadata": {}, + "source": [ + "To properly set the environment variables needed for running DLIO with DYAD, we create an environment file that is compatible with the `--env-file` flag of `flux submit`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ce527f2", + "metadata": {}, + "outputs": [], + "source": [ + "env_file = f\"\"\"\n", + "DYAD_KVS_NAMESPACE={kvs_namespace}\n", + "DYAD_DTL_MODE={dtl_mode}\n", + "DYAD_PATH={managed_directory}\n", + "LD_LIBRARY_PATH={dyad_install_prefix}/lib\n", + "PYTHONPATH={dlio_extensions_dir}:$PYTHONPATH\n", + "DLIO_PROFILER_ENABLE=0\n", + "\"\"\"\n", + "\n", + "with open(\"dlio_env.txt\", \"w\") as f:\n", + " f.write(env_file)" + ] + }, + { + "cell_type": "markdown", + "id": "398e110f", + "metadata": {}, + "source": [ + "#### Creating a Flux KVS namespace and starting the DYAD service\n", + "\n", + "Next, we start the DYAD service. This involves two steps. First, we need to create a namespace withing the Flux key-value store. This namespace is used by DYAD to store metadata about cached files. This metadata is then used by DYAD's Hierarchical Data Locator to locate files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf132600", + "metadata": {}, + "outputs": [], + "source": [ + "!flux kvs namespace create {kvs_namespace}" + ] + }, + { + "cell_type": "markdown", + "id": "723cbeaf", + "metadata": {}, + "source": [ + "After creating the key-value store namespace, we start the DYAD service itself using the `flux module load` command. 
We run that command through `flux exec -r all` to deploy the service across all Flux brokers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3220ef03", + "metadata": {}, + "outputs": [], + "source": [ + "!flux exec -r all flux module load {dyad_install_prefix}/lib/dyad.so --mode={dtl_mode} {managed_directory}" + ] + }, + { + "cell_type": "markdown", + "id": "f95e0145", + "metadata": {}, + "source": [ + "Finally, we check that the service and key-value store namespace were successfully created with the cells below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4750013c", + "metadata": {}, + "outputs": [], + "source": [ + "!flux module list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3322e350", + "metadata": {}, + "outputs": [], + "source": [ + "!flux kvs namespace list" + ] + }, + { + "cell_type": "markdown", + "id": "c0dfe655", + "metadata": {}, + "source": [ + "#### Generating data for the 3D U-Net\n", + "\n", + "Before running DLIO, we need to obtain data for emulated training of the 3D U-Net. Instead of downloading the full dataset, we use DLIO to generate a smaller, synthetic version of the dataset for this tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2dd03ec1", + "metadata": {}, + "outputs": [], + "source": [ + "!flux run -N {num_nodes} --tasks-per-node=1 mkdir -p {managed_directory} \n", + "!flux run -N {num_nodes} --tasks-per-node=1 rm -r {managed_directory}/* \n", + "!flux run -N {num_nodes} --tasks-per-node=1 mkdir -p {initial_data_directory} \n", + "!flux run -N {num_nodes} --tasks-per-node=1 rm -r {initial_data_directory}/* " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4e5d30e", + "metadata": {}, + "outputs": [], + "source": [ + "!flux run -N {num_nodes} -o cpu-affinity=off --tasks-per-node={workers_per_node} --env-file=dlio_env.txt \\\n", + " dlio_benchmark --config-dir={dlio_extensions_dir}/configs workload={workload} \\\n", + " ++workload.dataset.data_folder={initial_data_directory} ++workload.workflow.generate_data=True \\\n", + " ++workload.workflow.train=False\n", + "!echo \"FINISHED GENERATING DATA\"" + ] + }, + { + "cell_type": "markdown", + "id": "3f14ffdd", + "metadata": {}, + "source": [ + "#### Emulating training of the 3D U-Net with DLIO\n", + "\n", + "Now, we run DLIO using the command below. As DLIO runs, it prints out logging statements showing how long sample reading takes. At the end of the run, DLIO prints out a performance summary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3437a068", + "metadata": {}, + "outputs": [], + "source": [ + "!flux run -N {num_nodes} -o cpu-affinity=on --tasks-per-node={workers_per_node} --env-file=dlio_env.txt \\\n", + " dlio_benchmark --config-dir={dlio_extensions_dir}/configs workload={workload} \\\n", + " ++workload.dataset.data_folder={initial_data_directory} ++workload.workflow.generate_data=False \\\n", + " ++workload.workflow.train=True\n", + "!echo \"FINISHED TRAINING\"" + ] + }, + { + "cell_type": "markdown", + "id": "573ce232", + "metadata": {}, + "source": [ + "#### Shutting down the DYAD service\n", + "\n", + "Now that we are done running DLIO, we need to shutdown the DYAD service and remove the key-value store namespace used by DYAD. This is done with the two Flux commands below." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "755251df", + "metadata": {}, + "outputs": [], + "source": [ + "!flux kvs namespace remove {kvs_namespace}\n", + "!flux exec -r all flux module remove dyad" + ] + }, + { + "cell_type": "markdown", + "id": "cbd52626", + "metadata": {}, + "source": [ + "The following cells show that the DYAD service has been removed and that the namespace has been removed from the key-value store." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bf50c8e", + "metadata": {}, + "outputs": [], + "source": [ + "!flux module list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e50c926e", + "metadata": {}, + "outputs": [], + "source": [ + "!flux kvs namespace list" + ] + }, + { + "cell_type": "markdown", + "id": "607cb1d2", + "metadata": {}, + "source": [ + "Finally, we need to remove all the files we generated while running DLIO. We use `flux run` to ensure that any node-local files are deleted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b99e3b4", + "metadata": {}, + "outputs": [], + "source": [ + "!flux run -N {num_nodes} --tasks-per-node=1 mkdir -p {managed_directory} \n", + "!flux run -N {num_nodes} --tasks-per-node=1 rm -r {managed_directory}/* \n", + "!flux run -N {num_nodes} --tasks-per-node=1 mkdir -p {initial_data_directory} \n", + "!flux run -N {num_nodes} --tasks-per-node=1 rm -r {initial_data_directory}/* " + ] + }, + { + "cell_type": "markdown", + "id": "68ea2fe7", + "metadata": {}, + "source": [ + "### Evaluating DYAD's performance for the 3D U-Net at scale on Corona\n", + "\n", + "
\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "Figure X shows the performance of Lustre, [UnifyFS](https://ieeexplore.ieee.org/document/10177390), and DYAD in terms of runtime and I/O bandwidth for the full version of the 3D U-Net training. As explained on the [webpage for the KiTS19 Challenge](https://kits19.grand-challenge.org/), the dataset for the full version of this application consists of 10,240, NPZ-formatted image files, resulting in a total dataset size of 1.36 TB. Within each epoch of PyTorch-based training, the model processes batches of 4 images using 6 I/O processes per GPU. The model trains for 20 epochs without checkpointing. The model scales from 8 to 64 nodes of LLNL's [Corona](https://hpc.llnl.gov/hardware/compute-platforms/corona) supercomputer, with 8 GPUs per node.\n", + "\n", + "In the leftmost plot of Figure X, we show the runtime of the training for Lustre, UnifyFS, and DYAD at 8, 16, 32, and 64 nodes. This plot shows that DYAD provides significant runtime improvement compared to Lustre and UnifyFS for the 3D U-Net, mainly due to locality optimizations. DYAD runs up to 7.5 times faster than Lustre and 1.88 times faster than UnifyFS, with less performance variability due to DYAD's use of node-local storage.\n", + "\n", + "In the middle plot of Figure X, we show the bandwidth per epoch of training across 512 GPUs (64 nodes). Because DYAD's capabilities allow for on-the-fly caching of data, its performance starts similar to that of Lustre. As more data is cached into DYAD, its bandwidth increases to 140 GB/s due to DYAD's streaming RPC over RDMA protocol. Finally, as even more data is cached, DYAD's bandwidth reaches 1409 GB/s because DYAD's locality-aware caching allows almost all sample reads to be performed directly on node-local NVMe. In comparison, both Lustre and Unify maintain consistent bandwidths well under those of DYAD. By the 20th epoch, DYAD speeds up training by 10.62 times compared to UnifyFS.\n", + "\n", + "Finally, in the rightmost plot of Figure X, we show how often DYAD retrieved data from node-local storage versus retrieving data from storage on a remote node in terms of percentage of data access requests. Initially, DYAD mostly performs remote requests. As training continues, more and more data is replicated with DYAD's locality-aware caching, resulting in a larger percentage of local requests. By epoch 13, almost all data is accessed through local requests. This transition from mostly remote requests to mostly local requests corresponds with the increase in bandwidth shown in the middle plot of Figure X." + ] + }, + { + "cell_type": "markdown", + "id": "81d7d87f-1e09-42c8-b165-8902551f6847", + "metadata": {}, + "source": [ + "# This concludes Module 3.\n", + "\n", + "In this module, we covered:\n", + "1. Design of DYAD\n", + "2. Distributed DL training\n", + "3. Deep Learning I/O (DLIO) benchmark\n", + "4. 
Accelerating distributed DL training\n", + "\n", + "To continue with the tutorial, open [Module 4](./04_flux_tutorial_conclusions.ipynb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example1.svg b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example1.svg new file mode 100644 index 0000000..e24636a --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example1.svg @@ -0,0 +1 @@ +ProducerConsumerNode1Node2Local StorageLocal StorageWI/O intercepting wrapper lib relying on KVS + RPCR \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example2.svg b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example2.svg new file mode 100644 index 0000000..2539695 --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/dyad_example2.svg @@ -0,0 +1 @@ +RWConsumerProducerP1P2Shared FSsync \ No newline at end of file diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-software-stack.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-software-stack.png new file mode 100644 index 0000000..eae3071 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-software-stack.png differ diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-unet3d-results.svg b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-unet3d-results.svg new file mode 100644 index 0000000..d02c33d --- /dev/null +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad-unet3d-results.svg @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad_design.png b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad_design.png new file mode 100644 index 0000000..88bc649 Binary files /dev/null and b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/supplementary/dyad/img/dyad_design.png differ