From 74028b2bd7f16555ea1f4f80d6610d95ec7be2a2 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Sat, 20 Jul 2024 12:45:04 -0600
Subject: [PATCH] flux commands

Improvement on sections and table for flux commands, and addition of
flux accounting to container (likely will not easily work).

Signed-off-by: vsoch
---
 2024-RADIUSS-AWS/JupyterNotebook/README.md    |   35 +-
 .../JupyterNotebook/docker/Dockerfile.spawn   |   14 +
 .../tutorial/01_flux_tutorial.ipynb           | 1156 +++++++++++++----
 3 files changed, 935 insertions(+), 270 deletions(-)

diff --git a/2024-RADIUSS-AWS/JupyterNotebook/README.md b/2024-RADIUSS-AWS/JupyterNotebook/README.md
index cb39c04..d891233 100644
--- a/2024-RADIUSS-AWS/JupyterNotebook/README.md
+++ b/2024-RADIUSS-AWS/JupyterNotebook/README.md
@@ -32,7 +32,40 @@ to build them unless you are developing or changing them.
 If you do build (and use a different name) be sure to push your images to a public registry (or load them locally to your development cluster).
 
-
+### TODO
+
+After we add the flux-accounting:
+- after flux resource list, to see queues available (flux queue list)
+
+- more carbon copy examples
+- move flux batch above hierarchy
+  - transition into "what if I flux batch in my flux batch" (in my interactive allocation)
+  - yooo dawg
+  - check out riken tutorial for example
+  - drop the tree thing
+  - drop the throughput thing
+- better way to render script in the notebook
+- "construct a job submission object, called a jobspec"
+- Python, make handle, create job description (jobspec), submit and info (monitor)
+- collapse json dump
+- add watch / track events for some job
+  - reproduce cheese / pancakes example here
+  - how to list jobs
+  - how to get output for a job
+- figure out way to collapse the last section
+- typo at top of chapter 2
+- do a section for flux exec? (show doing something across our "nodes")
+- move flux archive into main tutorial
+- Plumbing to Porcelain - "the toilet vs. the pipes" 💩️🚽️
+- squash Deep Dive into section above it
+- set up flux-accounting and see if it works
+  - how to specify a bank for a job
+  - list banks (all) - flux account view-bank --tree
+  - specify banks - flux account view user $USER
+
+- Chapter 2: Flux Plumbing 💩️🚽️
+  - add flux job submit, show --dry-run
+
 ## Local Usage
 
 While the tutorial here is intended for deployment on AWS or Google Cloud, you can also give it a try on your local machine with a single container! You will need to [install Docker](https://docs.docker.com/engine/install/).
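For a quick preview, a minimal build-and-run sketch is shown below; the image tag (`flux-tutorial`) and the published port are assumptions for illustration, and the repository's own commands, given next, are the ones to follow.

```bash
# Assumes the working directory is 2024-RADIUSS-AWS/JupyterNotebook, since the
# Dockerfile's COPY paths are relative to it; the image tag is hypothetical.
docker build -f docker/Dockerfile.spawn -t flux-tutorial .

# The image starts JupyterLab inside a test Flux instance
# (CMD is `flux start --test-size=4 jupyter lab`); 8888 is Jupyter's usual port.
docker run --rm -it -p 8888:8888 flux-tutorial
```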
When you have Docker available, you can build and run the tutorial with: diff --git a/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn index 55f51e1..16a6b2e 100644 --- a/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn +++ b/2024-RADIUSS-AWS/JupyterNotebook/docker/Dockerfile.spawn @@ -79,6 +79,13 @@ RUN git clone https://github.com/flux-framework/dyad.git \ COPY ./flux-tree/* /usr/libexec/flux/cmd/ RUN chmod +x /usr/libexec/flux/cmd/flux-tree* +# Flux accounting +RUN git clone https://github.com/flux-framework/flux-accounting && \ + cd flux-accounting && \ + ./autogen.sh && \ + ./configure --prefix=/usr && \ + make && make install + RUN apt-get update && apt-get install -y nodejs && apt-get clean && rm -rf /var/lib/apt/lists/* RUN wget https://nodejs.org/dist/v20.15.0/node-v20.15.0-linux-x64.tar.xz && \ @@ -119,6 +126,13 @@ COPY ./docker/start.sh /start.sh RUN mkdir -p $HOME/.local/share && \ chmod 777 $HOME/.local/share +# Quick setup of flux-accounting (not working) +# RUN flux start /bin/bash -c "nohup flux account create-db && flux account-service & flux account add-bank root 1" && \ +# flux start flux account add-bank --parent-bank=root default 1 && \ +# flux start flux account add-user --username=jovyan --bank=default && \ +# flux start flux jobtap load mf_priority.so && \ +# flux start flux account-update-db + USER ${NB_USER} CMD ["flux", "start", "--test-size=4", "jupyter", "lab"] diff --git a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb index 36370ad..71bf3ea 100644 --- a/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb +++ b/2024-RADIUSS-AWS/JupyterNotebook/tutorial/01_flux_tutorial.ipynb @@ -23,7 +23,7 @@ "\n", "> What is Flux Framework? 🤔️\n", " \n", - "Flux is a flexible framework for resource management, built for your site. The framework consists of a suite of projects, tools, and libraries which may be used to build site-custom resource managers for High Performance Computing centers. Flux is a next-generation resource manager and scheduler with many transformative capabilities like hierarchical scheduling and resource management (you can think of it as \"fractal scheduling\") and directed-graph based resource representations.\n", + "Flux is a flexible framework for resource management, built for your site. The framework consists of a suite of projects, tools, and libraries that may be used to build site-custom resource managers for High Performance Computing centers and cloud environments. Flux is a next-generation resource manager and scheduler with many transformative capabilities like hierarchical scheduling and resource management (you can think of it as \"fractal scheduling\") and directed-graph based resource representations.\n", "\n", "> I'm ready! How do I do this tutorial? 
😁️\n", "\n", @@ -33,7 +33,7 @@ "\n", "This tutorial is split into 3 chapters, each of which has a notebook:\n", "* [Chapter 1: Getting started with Flux](./01_flux_tutorial.ipynb) (you're already here, it's this notebook!)\n", - "* [Chapter 2: Using Flux to manage and deploy distributed services](./02_flux_framework.ipynb)\n", + "* [Chapter 2: Flux Plumbing](./02_flux_framework.ipynb)\n", "* [Chapter 3: Lessons learned, next steps, and discussion](./03_flux_tutorial_conclusions.ipynb)\n", "\n", "And if you have some extra time and interest, we have supplementary chapters to teach you about advanced (often experimental, or under development) features:\n", @@ -45,7 +45,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 1, "id": "d71ecd22-8552-4b4d-9bc4-61d86f8d33fe", "metadata": { "tags": [] @@ -111,6 +111,10 @@ "execution_count": 3, "id": "c7d616de-70cd-4090-bd43-ffacb5ade1f6", "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "tags": [] }, "outputs": [ @@ -180,15 +184,579 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 30, "id": "2e54f640-283a-4523-8dde-9617fd6ef0c5", "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FLUX-JOBS(1) flux-core FLUX-JOBS(1)\n", + "\n", + "NAME\n", + " flux-jobs - list jobs submitted to Flux\n", + "\n", + "SYNOPSIS\n", + " flux jobs [OPTIONS] [JOBID ...]\n", + "\n", + "DESCRIPTION\n", + " flux jobs is used to list jobs run under Flux. By default only pending\n", + " and running jobs for the current user are listed. Additional jobs and\n", + " information can be listed using options listed below. Alternately,\n", + " specific job ids can be listed on the command line to only list those\n", + " job IDs.\n", + "\n", + "OPTIONS\n", + " -a List jobs in all states, including inactive jobs. This is\n", + " shorthand for --filter=pending,running,inactive.\n", + "\n", + " -A List jobs of all users. This is shorthand for --user=all.\n", + "\n", + " -n, --no-header\n", + " For default output, do not output column headers.\n", + "\n", + " -u, --user=[USERNAME|UID]\n", + " List jobs for a specific username or userid. Specify all for all\n", + " users.\n", + "\n", + " --name=[JOB NAME]\n", + " List jobs with a specific job name.\n", + "\n", + " --queue=[QUEUE]\n", + " List jobs in a specific queue.\n", + "\n", + " -c, --count=N\n", + " Limit output to N jobs (default 1000)\n", + "\n", + " --since=WHEN\n", + " Limit output to jobs that have been active since a given time‐\n", + " stamp. In other words, jobs that are currently pending, cur‐\n", + " rently running, or became inactive since the given timestamp.\n", + " This option implies -a if no other --filter options are speci‐\n", + " fied. If WHEN begins with - character, then the remainder is\n", + " considered to be a an offset in Flux standard duration (RFC 23).\n", + " Otherwise, any datetime expression accepted by the Python ‐\n", + " parsedatetime module is accepted. Examples: \"-6h\", \"-1d\", \"yes‐\n", + " terday\", \"2021-06-21 6am\", \"last Monday\", etc. It is assumed to\n", + " be an error if a timestamp in the future is supplied.\n", + "\n", + " -f, --filter=STATE|RESULT\n", + " List jobs with specific job state or result. Multiple states or\n", + " results can be listed separated by comma. See JOB STATUS below\n", + " for additional information. 
Defaults to pending,running.\n", + "\n", + " -o, --format=NAME|FORMAT\n", + " Specify a named output format NAME or a format string using\n", + " Python's format syntax. See OUTPUT FORMAT below for field names.\n", + " Named formats may be listed via --format=help. An alternate de‐\n", + " fault format can be set via the FLUX_JOBS_FORMAT_DEFAULT envi‐\n", + " ronment variable. Additional named formats may be registered\n", + " with flux jobs via configuration. See the CONFIGURATION section\n", + " for more details. A configuration snippet for an existing named\n", + " format may be generated with --format=get-config=NAME.\n", + "\n", + " --json Emit data for selected jobs in JSON format. The data for multi‐\n", + " ple matching jobs is contained in a jobs array in the emitted\n", + " JSON object, unless a single job was selected by jobid on the\n", + " command line, in which case a JSON object representing that job\n", + " is emitted on success. With --recursive, each job which is also\n", + " an instance of Flux will will have any recursively listed jobs\n", + " in a jobs array, and so on for each sub-child.\n", + "\n", + " Only the attributes which are available at the time of the flux\n", + " jobs query will be present in the returned JSON object for a\n", + " job. For instance a pending job will not have runtime, waitsta‐\n", + " tus or result keys, among others. A missing key should be con‐\n", + " sidered unavailable.\n", + "\n", + " The --json option is incompatible with --stats and --stats-only,\n", + " and any --format is ignored.\n", + "\n", + " --color[=WHEN]\n", + " Control output coloring. The optional argument WHEN can be\n", + " auto, never, or always. If WHEN is omitted, it defaults to al‐\n", + " ways. Otherwise the default is auto.\n", + "\n", + " --stats\n", + " Output a summary of job statistics before the header. By de‐\n", + " fault shows global statistics. If --queue is specified, shows\n", + " statistics for the specified queue. May be useful in conjunc‐\n", + " tion with utilities like watch(1), e.g.:\n", + "\n", + " $ watch -n 2 flux jobs --stats -f running -c 25\n", + "\n", + " will display a summary of statistics along with the top 25 run‐\n", + " ning jobs, updated every 2 seconds.\n", + "\n", + " Note that all job failures, including canceled and timeout jobs,\n", + " are collectively counted as \"failed\" in --stats.\n", + "\n", + " --stats-only\n", + " Output a summary of job statistics and exit. By default shows\n", + " global statistics. If --queue is specified, shows statistics\n", + " for the specified queue. flux jobs will exit with non-zero exit\n", + " status with --stats-only if there are no active jobs. This al‐\n", + " lows the following loop to work:\n", + "\n", + " $ while flux jobs --stats-only; do sleep 2; done\n", + "\n", + " All options other than --queue are ignored when --stats-only is\n", + " used.\n", + "\n", + " Note that all job failures, including canceled and timeout jobs,\n", + " are collectively counted as \"failed\" in --stats-only.\n", + "\n", + " -R, --recursive\n", + " List jobs recursively. Each child job which is also an instance\n", + " of Flux is prefixed by its jobid \"path\" followed by the list of\n", + " jobs, recursively up to any defined --level. 
If the --stats op‐\n", + " tion is used, then each child instance in the hierarchy is\n", + " listed with its stats.\n", + "\n", + " --recurse-all\n", + " By default, jobs not owned by the user running flux jobs are\n", + " skipped with --recursive, because normally Flux instances only\n", + " permit the instance owner to connect. This option forces the\n", + " command to attempt to recurse into the jobs of other users. Im‐\n", + " plies --recursive.\n", + "\n", + " -L, --level=N\n", + " With --recursive, stop recursive job listing at level N. Levels\n", + " are counted starting at 0, so flux jobs -R --level=0 is equiva‐\n", + " lent to flux jobs without -R, and --level=1 would limit recur‐\n", + " sive job listing to child jobs of the current instance.\n", + "\n", + " --threads=N\n", + " When flux jobs recursively queries job lists (with --recursive)\n", + " or fetches info for jobs that are also instances (see instance.*\n", + " fields), a pool of threads is used to parallelize the required\n", + " RPCs. Normally, the default number of ThreadPoolExecutor threads\n", + " is used, but by using the --threads, a specific number of\n", + " threads can be chosen.\n", + "\n", + "JOB STATUS\n", + " Jobs may be observed to pass through five job states in Flux: DEPEND,\n", + " PRIORITY, SCHED, RUN, CLEANUP, and INACTIVE (see Flux RFC 21). Under\n", + " the state_single field name, these are abbreviated as D, S, P, R, C,\n", + " and I respectively. For convenience and clarity, the following virtual\n", + " job states also exist: \"pending\", an alias for DEPEND,PRIORITY,SCHED;\n", + " \"running\", an alias for RUN,CLEANUP; \"active\", an alias for \"pend‐\n", + " ing,running\".\n", + "\n", + " After a job has finished and is in the INACTIVE state, it can be marked\n", + " with one of the possible results: COMPLETED, FAILED, CANCELED, TIMEOUT.\n", + " Under the result_abbrev field name, these are abbreviated as CD, F, CA,\n", + " and TO respectively.\n", + "\n", + " The job status is a user friendly mix of both, a job is always in one\n", + " of the following statuses: DEPEND, PRIORITY, SCHED, RUN, CLEANUP, COM‐\n", + " PLETED, FAILED, CANCELED, or TIMEOUT. Under the status_abbrev field\n", + " name, these are abbreviated as D, P, S, R, C, CD, F, CA, and TO respec‐\n", + " tively.\n", + "\n", + "OUTPUT FORMAT\n", + " The --format option can be used to specify an output format to flux\n", + " jobs using Python's string format syntax. For example, the following is\n", + " the format used for the default format:\n", + "\n", + " {id.f58:>12} ?:{queue:<8.8} {username:<8.8} {name:<10.10+} \\\n", + " {status_abbrev:>2.2} {ntasks:>6} {nnodes:>6h} \\\n", + " {contextual_time!F:>8h} {contextual_info}\n", + "\n", + " If a format field is preceded by the special string ?: this will cause\n", + " the field to be removed entirely from output if the result would be an\n", + " empty string or zero value for all jobs in the listing. E.g.:\n", + "\n", + " {id.f58:>12} ?:{exception.type}\n", + "\n", + " would eliminate the EXCEPTION-TYPE column if no jobs in the list re‐\n", + " ceived an exception. (Thus the job queue is only displayed if at least\n", + " one job has a queue assigned in the default format shown above).\n", + "\n", + " As a reminder to the reader, some shells will interpret braces ({ and\n", + " }) in the format string. They may need to be quoted.\n", + "\n", + " The special presentation type h can be used to convert an empty string,\n", + " \"0s\", \"0.0\", \"0:00:00\", or epoch time to a hyphen. 
For example, nor‐\n", + " mally \"{nodelist}\" would output an empty string if the job has not yet\n", + " run. By specifying, \"{nodelist:h}\", a hyphen would be presented in‐\n", + " stead.\n", + "\n", + " The special suffix + can be used to indicate if a string was truncated\n", + " by including a + character when truncation occurs. If both h and + are\n", + " being used, then the + must appear after the h.\n", + "\n", + " Additionally, the custom job formatter supports a set of special con‐\n", + " version flags. Conversion flags follow the format field and are used to\n", + " transform the value before formatting takes place. Currently, the fol‐\n", + " lowing conversion flags are supported by flux jobs:\n", + "\n", + " !D convert a timestamp field to ISO8601 date and time (e.g.\n", + " 2020-01-07T13:31:00). Defaults to empty string if timestamp\n", + " field does not exist or the timestamp is 0 (i.e epoch time).\n", + "\n", + " !d convert a timestamp to a Python datetime object. This allows\n", + " datetime specific format to be used, e.g. {t_inac‐\n", + " tive!d:%H:%M:%S}. Additionally, width and alignment can be spec‐\n", + " ified after the time format by using two colons (::), e.g.\n", + " {t_inactive!d:%H:%M:%S::>20}. Returns an empty string (or \"-\" if\n", + " the h suffix is used) for an unset timestamp.\n", + "\n", + " !F convert a time duration in floating point seconds to Flux Stan‐\n", + " dard Duration (FSD) string (e.g. {runtime!F}). Defaults to\n", + " empty string if field does not exist.\n", + "\n", + " !H convert a time duration in floating point seconds to hours:min‐\n", + " utes:seconds form (e.g. {runtime!H}). Defaults to empty string\n", + " if time duration field does not exist.\n", + "\n", + " !P convert a floating point number into a percentage fitting in 5\n", + " characters including the \"%\" character. E.g. 0.5 becomes \"50%\"\n", + " 0.015 becomes 1.5%, and 0.0005 becomes 0.05% etc.\n", + "\n", + " As a reminder to the reader, some shells will interpret the exclamation\n", + " point (!) when using a conversion flag. The exclamation point may need\n", + " to be escaped (\\!).\n", + "\n", + " Annotations can be retrieved via the annotations field name. Specific\n", + " keys and sub-object keys can be retrieved separated by a period (\".\").\n", + " For example, if the scheduler has annotated the job with a reason pend‐\n", + " ing status, it can be retrieved via \"{annotations.sched.reason_pend‐\n", + " ing}\".\n", + "\n", + " As a convenience, the field names sched and user can be used as substi‐\n", + " tutions for annotations.sched and annotations.user. 
For example, a\n", + " reason pending status can be retrieved via \"{sched.reason_pending}\".\n", + "\n", + " The field names that can be specified are:\n", + "\n", + " id job ID\n", + "\n", + " id.f58 job ID in RFC 19 F58 (base58) encoding\n", + "\n", + " id.f58plain\n", + " job ID in RFC 19 F58 encoding with ascii f\n", + "\n", + " id.dec job ID in decimal representation\n", + "\n", + " id.hex job ID in 0x prefix hexadecimal representation\n", + "\n", + " id.dothex\n", + " job ID in dotted hexadecimal representation (xx.xx.xx.xx)\n", + "\n", + " id.words\n", + " job ID in mnemonic encoding\n", + "\n", + " id.emoji\n", + " job ID in emoji encoding\n", + "\n", + " userid job submitter's userid\n", + "\n", + " username\n", + " job submitter's username\n", + "\n", + " urgency\n", + " job urgency\n", + "\n", + " priority\n", + " job priority\n", + "\n", + " dependencies\n", + " list of any currently outstanding job dependencies\n", + "\n", + " status job status (DEPEND, SCHED, RUN, CLEANUP, COMPLETED, FAILED, CAN‐\n", + " CELED, or TIMEOUT)\n", + "\n", + " status_abbrev\n", + " status but in a max 2 character abbreviation\n", + "\n", + " status_emoji\n", + " status but an appropriate emoji instead of job state / result\n", + "\n", + " name job name\n", + "\n", + " cwd job current working directory\n", + "\n", + " queue job queue\n", + "\n", + " project\n", + " job accounting project\n", + "\n", + " bank job accounting bank\n", + "\n", + " ntasks job task count\n", + "\n", + " ncores job core count\n", + "\n", + " duration\n", + " job duration in seconds\n", + "\n", + " nnodes job node count (if job ran / is running), empty string otherwise\n", + "\n", + " ranks job ranks (if job ran / is running), empty string otherwise\n", + "\n", + " nodelist\n", + " job nodelist (if job ran / is running), empty string otherwise\n", + "\n", + " state job state (DEPEND, SCHED, RUN, CLEANUP, INACTIVE)\n", + "\n", + " state_single\n", + " job state as a single character\n", + "\n", + " state_emoji\n", + " job state but an appropriate emoji instead of DEPEND, SCHED,\n", + " RUN, CLEANUP, or INACTIVE\n", + "\n", + " result job result if job is inactive (COMPLETED, FAILED, CANCELED,\n", + " TIMEOUT), empty string otherwise\n", + "\n", + " result_abbrev\n", + " result but in a max 2 character abbreviation\n", + "\n", + " result_emoji\n", + " result but an appropriate emoji instead of COMPLETED, FAILED,\n", + " CANCELED, or TIMEOUT\n", + "\n", + " success\n", + " True of False if job completed successfully, empty string other‐\n", + " wise\n", + "\n", + " waitstatus\n", + " The raw status of the job as returned by waitpid(2) if the job\n", + " exited, otherwise an empty string. Note: waitstatus is the maxi‐\n", + " mum wait status returned by all job shells in a job, which may\n", + " not necessarily indicate the highest task wait status. (The job\n", + " shell exits with the maximum task exit status, unless a task\n", + " died due to a signal, in which case the shell exits with\n", + " 128+signo)\n", + "\n", + " returncode\n", + " The job return code if the job has exited, or an empty string if\n", + " the job is still active. The return code of a job is the highest\n", + " job shell exit code, or negative signal number if the job shell\n", + " was terminated by a signal. 
If the job was canceled before it\n", + " started, then the returncode is set to the special value -128.\n", + "\n", + " exception.occurred\n", + " True of False if job had an exception, empty string otherwise\n", + "\n", + " exception.severity\n", + " If exception.occurred True, the highest severity, empty string\n", + " otherwise\n", + "\n", + " exception.type\n", + " If exception.occurred True, the highest severity exception type,\n", + " empty string otherwise\n", + "\n", + " exception.note\n", + " If exception.occurred True, the highest severity exception note,\n", + " empty string otherwise\n", + "\n", + " t_submit\n", + " time job was submitted\n", + "\n", + " t_depend\n", + " time job entered depend state\n", + "\n", + " t_run time job entered run state\n", + "\n", + " t_cleanup\n", + " time job entered cleanup state\n", + "\n", + " t_inactive\n", + " time job entered inactive state\n", + "\n", + " runtime\n", + " job runtime\n", + "\n", + " expiration\n", + " time at which job allocation was marked to expire\n", + "\n", + " t_remaining\n", + " If job is running, amount of time remaining before expiration\n", + "\n", + " annotations\n", + " annotations metadata, use \".\" to get specific keys\n", + "\n", + " sched short hand for annotations.sched\n", + "\n", + " user short hand for annotations.user\n", + "\n", + " Field names which are specific to jobs which are also instances of Flux\n", + " include:\n", + "\n", + " instance.stats\n", + " a short string describing current job statistics for the in‐\n", + " stance of the form PD:{pending} R:{running} CD:{successful}\n", + " F:{failed}\n", + "\n", + " instance.stats.total\n", + " total number of jobs in any state in the instance.\n", + "\n", + " instance.utilization\n", + " number of cores currently allocated divided by the total number\n", + " of cores. Can be formatted as a percentage with !P, e.g. {in‐\n", + " stance.utilization!P:>4}.\n", + "\n", + " instance.gpu_utilization\n", + " same as instance.utilization but for gpu resources\n", + "\n", + " instance.progress\n", + " number of inactive jobs divided by the total number of jobs.\n", + " Can be formatted as a percentage with {instance.progress!P:>4}\n", + "\n", + " instance.resources..{ncores,ngpus}\n", + " number of cores, gpus in state state, where state can be all,\n", + " up, down, allocated, or free, e.g. {instance.re‐\n", + " sources.all.ncores}\n", + "\n", + " The following fields may return different information depending on the\n", + " state of the job or other context:\n", + "\n", + " contextual_info\n", + " Returns selected information based on the job's current state.\n", + " If the job is in PRIORITY state, then the string priority-wait\n", + " is returned, if the job is in DEPEND state, then a list of out‐\n", + " standing dependencies is returned, if the job is in SCHED state\n", + " then an estimated time the job will run is returned (if the\n", + " scheduler supports it). Otherwise, the assigned nodelist is re‐\n", + " turned (if resources were assigned).\n", + "\n", + " contextual_info\n", + " Returns the job runtime for jobs in RUN state or later, other‐\n", + " wise the job duration (if set) is returned.\n", + "\n", + " inactive_reason\n", + " If the job is inactive, returns the reason that the job is no\n", + " longer active. Generally speaking, will output \"Exit\", \"Time‐\n", + " out\", \"Canceled\", or signal. 
If available, other contextual in‐\n", + " formation will also be provided such as the exit returncode or\n", + " cancellation message.\n", + "\n", + "CONFIGURATION\n", + " The flux jobs command supports registration of named output formats in\n", + " configuration files. The command loads configuration files from\n", + " flux-jobs.EXT from the following paths in order of increasing prece‐\n", + " dence:\n", + "\n", + " • $XDG_CONFIG_DIRS/flux or /etc/xdg/flux if XDG_CONFIG_DIRS is not\n", + " set. Note that XDG_CONFIG_DIRS is traversed in reverse order such\n", + " that entries first in the colon separated path are highest prior‐\n", + " ity.\n", + "\n", + " • $XDG_CONFIG_HOME/flux or $HOME/.config/flux if XDG_CONFIG_HOME is\n", + " not set\n", + "\n", + " where EXT can be one of toml, yaml, or json.\n", + "\n", + " If there are multiple flux-jobs.* files found in a directory, then they\n", + " are loaded in lexical order (i.e. .json first, then .toml, then .yaml)\n", + "\n", + " Named formats are registered in a formats table or dictionary with a\n", + " key per format pointing to a table or dictionary with the keys:\n", + "\n", + " format (required) The format string\n", + "\n", + " description\n", + " (optional) A short description of the named format, displayed\n", + " with flux jobs --format=help\n", + "\n", + " If a format name is specified in more than one config file, then the\n", + " last one loaded is used. Due to the order that flux jobs loads config\n", + " files, this allows user configuration to override system configuration.\n", + " It is an error to override any internally defined formats (such as de‐\n", + " fault).\n", + "\n", + " If a format name or string is not specified on the command line the in‐\n", + " ternally defined format default is used.\n", + "\n", + " Example:\n", + "\n", + " # $HOME/.config/flux/flux-jobs.toml\n", + "\n", + " [formats.myformat]\n", + " description = \"My useful format\"\n", + " format = \"\"\"\\\n", + " {id.f58:>12} {name:>8.8} {t_submit!D:<19} \\\n", + " {t_run!D:<19} {t_remaining!F}\\\n", + " \"\"\"\n", + "\n", + " It may be helpful to start with an existing named format by using the\n", + " --format=get-config=NAME option, e.g.:\n", + "\n", + " $ flux jobs --format=get-config=default >> ~/.config/flux/flux-jobs.toml\n", + "\n", + " Be sure to change the name of the format string from default. It is an\n", + " error to redefine the default format string.\n", + "\n", + "EXAMPLES\n", + " The default output of flux jobs will list the pending and running jobs\n", + " of the current user. It is equivalent to:\n", + "\n", + " $ flux jobs --filter=pending,running\n", + "\n", + " To list all pending, running, and inactive jobs, of the current user,\n", + " you can use --filter option or the -a option:\n", + "\n", + " $ flux jobs -a\n", + "\n", + " OR\n", + "\n", + " $ flux jobs --filter=pending,running,inactive\n", + "\n", + " To alter which user's jobs are listed, specify the user with --user:\n", + "\n", + " $ flux jobs --user=flux\n", + "\n", + " Jobs that have finished may be filtered further by specifying if they\n", + " have completed, failed, or were canceled. For example, the following\n", + " will list the jobs that have failed or were canceled:\n", + "\n", + " $ flux jobs --filter=failed,canceled\n", + "\n", + " The --format option can be used to alter the output format or output\n", + " additional information. 
For example, the following would output all\n", + " jobids for the user in decimal form, and output any annotations the\n", + " scheduler attached to each job:\n", + "\n", + " $ flux jobs -a --format=\"{id} {annotations.sched}\"\n", + "\n", + " The following would output the job id and exception information, so a\n", + " user can learn why a job failed.\n", + "\n", + " $ flux jobs --filter=failed --format=\"{id} {exception.type} {exception.note}\"\n", + "\n", + "RESOURCES\n", + " Flux: http://flux-framework.org\n", + "\n", + " Flux RFC: https://flux-framework.readthedocs.io/projects/flux-rfc\n", + "\n", + "SEE ALSO\n", + " flux-pstree(1)\n", + "\n", + "AUTHOR\n", + " This page is maintained by the Flux community.\n", + "\n", + "COPYRIGHT\n", + " Copyright 2014 Lawrence Livermore National Security, LLC and Flux de‐\n", + " velopers.\n", + "\n", + " SPDX-License-Identifier: LGPL-3.0\n", + "\n", + " Jun 06, 2024 FLUX-JOBS(1)\n" + ] + } + ], "source": [ - "# We have commented this out because the output is huge! Feel free to uncomment (remove the #) and run the command\n", - "#!flux help jobs" + "!flux help jobs" ] }, { @@ -251,93 +819,90 @@ }, { "cell_type": "markdown", - "id": "eda1a33c-9f9e-4ba0-a013-e97601f79e41", + "id": "ec052119", "metadata": {}, "source": [ - "## Flux uptime\n", + "## Flux Resources\n", "\n", - "Did someone say... [uptime](https://youtu.be/SYRlTISvjww?si=zDlvpWbBljUmZw_Q)? ☝️🕑️\n", + "When you are interacting with Flux, you will commonly want to know what resources are available to you. Flux uses [hwloc](https://github.com/open-mpi/hwloc) to detect the resources on each node and then to populate its resource graph.\n", "\n", - "Don't worry, we are going to insert tidbits of fun throughout the tutorial! Don't be afraid to pause and dance! 🕺️ Flux provides an `uptime` utility to display properties of the Flux instance such as state of the current instance, how long it has been running, its size and if scheduling is disabled or stopped. The output shows how long the instance has been up, the instance owner, the instance depth (depth in the Flux hierarchy), and the size of the instance (number of brokers)." + "You can access the topology information that Flux collects with the `flux resource` subcommand. Let's run `flux resource list` to see the resources available to us in this notebook:" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "1268ed06-e8f4-47a0-af4b-1b93fe3fa1b1", + "execution_count": 1, + "id": "scenic-chassis", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " 04:18:52 run 27m, owner jovyan, depth 0, size 4\n" + " STATE NNODES NCORES NGPUS NODELIST\n", + " free 4 40 0 f5af[12550686,12550686,12550686,12550686]\n", + " allocated 0 0 0 \n", + " down 0 0 0 \n" ] } ], "source": [ - "! flux uptime" + "!flux resource list" ] }, { "cell_type": "markdown", - "id": "ec052119", + "id": "0086e47e", "metadata": {}, "source": [ - "## Flux Resources\n", - "\n", - "When you are interacting with Flux, you will commonly want to know what resources are available to you. Flux uses [hwloc](https://github.com/open-mpi/hwloc) to detect the resources on each node and then to populate its resource graph.\n", - "\n", - "You can access the topology information that Flux collects with the `flux resource` subcommand. 
Let's run `flux resource list` to see the resources available to us in this notebook:" + "Flux can also bootstrap its resource graph based on static input files, like in the case of a multi-user system instance setup by site administrators. [More information on Flux's static resource configuration files](https://flux-framework.readthedocs.io/en/latest/adminguide.html#resource-configuration). Flux provides a more standard interface to listing available resources that works regardless of the resource input source: `flux resource`." ] }, { "cell_type": "code", - "execution_count": 1, - "id": "scenic-chassis", + "execution_count": 2, + "id": "prime-equilibrium", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " STATE NNODES NCORES NGPUS NODELIST\n", - " free 4 40 0 f5af[12550686,12550686,12550686,12550686]\n", - " allocated 0 0 0 \n", - " down 0 0 0 \n" + " STATE UP NNODES NODELIST\n", + " avail \u001b[01;32m ✔\u001b[0;0m 4 f5af[12550686,12550686,12550686,12550686]\n" ] } ], "source": [ - "!flux resource list" + "# To view status of resources\n", + "!flux resource status" ] }, { "cell_type": "markdown", - "id": "0086e47e", + "id": "e6603d7f-dd45-4743-9efb-bf65ba7e2f22", "metadata": {}, "source": [ - "Flux can also bootstrap its resource graph based on static input files, like in the case of a multi-user system instance setup by site administrators. [More information on Flux's static resource configuration files](https://flux-framework.readthedocs.io/en/latest/adminguide.html#resource-configuration). Flux provides a more standard interface to listing available resources that works regardless of the resource input source: `flux resource`." + "It might also be the case that you need to see queues. Here is how to do that:" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "prime-equilibrium", + "execution_count": 32, + "id": "c7fbe877-c0bf-4296-a20b-21809caa72d7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " STATE UP NNODES NODELIST\n", - " avail \u001b[01;32m ✔\u001b[0;0m 4 f5af[12550686,12550686,12550686,12550686]\n" + " DEFAULTTIME TIMELIMIT NNODES NCORES NGPUS\n", + " inf inf 0-inf 0-inf 0-inf\n" ] } ], "source": [ - "# To view status of resources\n", - "!flux resource status" + "!flux queue list" ] }, { @@ -347,9 +912,9 @@ "tags": [] }, "source": [ - "# Submitting Jobs to Flux 💼️\n", + "# Flux Commands \n", "\n", - "How to submit jobs to Flux? Let us count the ways! Here are how Flux commands map to other schedulers you are familiar with. You can use the `flux` `submit`, `run`, `bulksubmit`, `batch`, and `alloc` commands.\n", + "Here are how Flux commands map to a scheduler you are likely familiar with, Slurm. A larger table with similar mappings for LSF, Moab, and Slurm can be [viewed here](https://hpc.llnl.gov/banks-jobs/running-jobs/batch-system-cross-reference-guides). For submitting jobs, you can use the `flux` `submit`, `run`, `bulksubmit`, `batch`, and `alloc` commands.\n", "\n", "\n", " \n", @@ -358,24 +923,29 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -383,194 +953,224 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - "
Flux
One-off submit of a single job (not interactive)One-off run of a single job (blocking)srunflux submitflux run
One-off submit of a single job (interactive)One-off run of a single job (interactive)srun --ptyflux runflux run -o pty.interactive
Submitting batch jobssbatchflux batchOne-off run of a single job (not blocking)NAflux submit
Submiting interactive jobssallocflux allocBulk submission of jobs (not blocking)NAflux bulksubmit
Watching jobsNAflux watch
Querying the status of jobsflux jobs/flux job info job_id
Cancelling running jobsCanceling running jobsscancelflux cancel
\n", + " \n", + " Submitting batch jobs\n", + " sbatch\n", + " flux batch\n", + " \n", + " \n", + " Allocation for an interactive instance\n", + " salloc\n", + " flux alloc\n", + " \n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "ac798095", + "metadata": {}, + "source": [ + "## flux run\n", "\n", - "## flux submit\n", + "
\n", + "Description: One-off run of a single job (blocking)\n", + "
\n", "\n", - "The `flux submit` command submits a job to Flux and prints out the jobid. " + "The `flux run` command submits a job to Flux (similar to `flux submit`) but then attaches to the job with `flux job attach`, printing the job's stdout/stderr to the terminal and exiting with the same exit code as the job. It's basically doing an interactive submit, because you will be able to watch the output in your terminal, and it will block your terminal until the job completes." ] }, { "cell_type": "code", - "execution_count": 10, - "id": "8a5e7d41-1d8d-426c-8198-0ad4a57e7d04", + "execution_count": 5, + "id": "52d26496-dd1f-44f7-bb10-8a9b4b8c9c80", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ƒF91H7Gc7\n" + "399f5da372b0\n" ] } ], "source": [ - "!flux submit hostname" + "!flux run hostname" ] }, { "cell_type": "markdown", - "id": "a7e4c25e-3ca8-4277-bb70-a0e94bcd223b", + "id": "53357a9d-11d8-4c2d-87d8-c30ae38d01ba", "metadata": {}, "source": [ - "`submit` supports common options like `--nnodes`, `--ntasks`, and `--cores-per-task`. There are short option equivalents (`-N`, `-n`, and `-c`, respectively) of these options as well. `--cores-per-task=1` is the default." + "The output from the previous command is the hostname (a container ID string in this case). If the job exits with a non-zero exit code this will be reported by `flux job attach` (occurs implicitly with `flux run`). For example, execute the following:" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "571d8c3d-b24a-415e-b9ac-f58b99a7e92c", + "execution_count": 6, + "id": "fa40cb98-a138-4771-a7ef-f1860dddf7db", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "ƒFCA1pkFH\n" + "flux-job: task(s) exited with exit code 1\n" ] } ], "source": [ - "!flux submit -N1 -n2 sleep inf" + "!flux run /bin/false" ] }, { - "cell_type": "code", - "execution_count": 12, - "id": "cc2bddee-f454-4674-80d4-4a39c5f1bee2", + "cell_type": "markdown", + "id": "6b2b5c3f-e24a-45a8-a10c-e10bfdbb7b87", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: flux submit [OPTIONS...] COMMAND [ARGS...]\n", - "\n", - "enqueue a job\n", - "\n", - "positional arguments:\n", - " command Job command and arguments\n", - "\n", - "options:\n", - " -h, --help show this help message and exit\n", - " -q, --queue=NAME Submit a job to a specific named queue\n", - " -t, --time-limit=MIN|FSD Time limit in minutes when no units provided,\n", - " otherwise in Flux standard duration, e.g. 30s,\n", - " 2d, 1.5h\n", - " --urgency=N Set job urgency (0-31), hold=0, default=16,\n", - " expedite=31\n" - ] - } - ], "source": [ - "# Let's peek at the help for flux submit!\n", - "!flux submit --help | head -n 15" + "A job submitted with `run` can be canceled with two rapid `Cltr-C`s in succession, or a user can detach from the job with `Ctrl-C Ctrl-Z`. The user can then re-attach to the job by using `flux job attach JOBID`." ] }, { "cell_type": "markdown", - "id": "ac798095", + "id": "81e5213d", "metadata": {}, "source": [ - "## flux run\n", - "\n", - "The `flux run` command submits a job to Flux (similar to `flux submit`) but then attaches to the job with `flux job attach`, printing the job's stdout/stderr to the terminal and exiting with the same exit code as the job. It's basically doing an interactive submit, because you will be able to watch the output in your terminal, and it will block your terminal until the job completes." 
+ "`flux submit` and `flux run` also support many other useful flags:" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "52d26496-dd1f-44f7-bb10-8a9b4b8c9c80", + "execution_count": 7, + "id": "02032748", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "7db0bdd6f967\n" + "3: 399f5da372b0\n", + "2: 399f5da372b0\n", + "1: 399f5da372b0\n", + "0: 399f5da372b0\n" ] } ], "source": [ - "!flux run hostname" + "!flux run -n4 --label-io --time-limit=5s --env-remove=LD_LIBRARY_PATH hostname" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f52bb357-a7ce-458d-9c3f-4d664eca4fbd", + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment and run this help command if you want to see all the flags for flux run\n", + "# !flux run --help" ] }, { "cell_type": "markdown", - "id": "53357a9d-11d8-4c2d-87d8-c30ae38d01ba", + "id": "7c09708a-74a1-4e61-b678-cb337b7df435", "metadata": {}, "source": [ - "The output from the previous command is the hostname (a container ID string in this case). If the job exits with a non-zero exit code this will be reported by `flux job attach` (occurs implicitly with `flux run`). For example, execute the following:" + "## flux submit\n", + "\n", + "
\n", + "Description: One-off run of a single job (not blocking)\n", + "
\n", + "\n", + "\n", + "The `flux submit` command submits a job to Flux and prints out the jobid. " ] }, { "cell_type": "code", - "execution_count": 14, - "id": "fa40cb98-a138-4771-a7ef-f1860dddf7db", - "metadata": {}, + "execution_count": 4, + "id": "cc2bddee-f454-4674-80d4-4a39c5f1bee2", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "flux-job: task(s) exited with exit code 1\n" + "usage: flux submit [OPTIONS...] COMMAND [ARGS...]\n", + "\n", + "enqueue a job\n", + "\n", + "positional arguments:\n", + " command Job command and arguments\n", + "\n", + "options:\n", + " -h, --help show this help message and exit\n", + " -q, --queue=NAME Submit a job to a specific named queue\n", + " -t, --time-limit=MIN|FSD Time limit in minutes when no units provided,\n", + " otherwise in Flux standard duration, e.g. 30s,\n", + " 2d, 1.5h\n", + " --urgency=N Set job urgency (0-31), hold=0, default=16,\n", + " expedite=31\n" ] } ], "source": [ - "!flux run /bin/false" + "# Let's peek at the help for flux submit!\n", + "!flux submit --help | head -n 15" ] }, { - "cell_type": "markdown", - "id": "6b2b5c3f-e24a-45a8-a10c-e10bfdbb7b87", + "cell_type": "code", + "execution_count": 2, + "id": "8a5e7d41-1d8d-426c-8198-0ad4a57e7d04", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ƒScZH3DbD\n" + ] + } + ], "source": [ - "A job submitted with `run` can be canceled with two rapid `Cltr-C`s in succession, or a user can detach from the job with `Ctrl-C Ctrl-Z`. The user can then re-attach to the job by using `flux job attach JOBID`." + "!flux submit hostname" ] }, { "cell_type": "markdown", - "id": "81e5213d", + "id": "a7e4c25e-3ca8-4277-bb70-a0e94bcd223b", "metadata": {}, "source": [ - "`flux submit` and `flux run` also support many other useful flags:" + "`submit` supports common options like `--nnodes`, `--ntasks`, and `--cores-per-task`. There are short option equivalents (`-N`, `-n`, and `-c`, respectively) of these options as well. `--cores-per-task=1` is the default." ] }, { "cell_type": "code", - "execution_count": 15, - "id": "02032748", + "execution_count": 3, + "id": "571d8c3d-b24a-415e-b9ac-f58b99a7e92c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "3: 7db0bdd6f967\n", - "2: 7db0bdd6f967\n", - "1: 7db0bdd6f967\n", - "0: 7db0bdd6f967\n" + "ƒSdrJJshH\n" ] } ], "source": [ - "!flux run -n4 --label-io --time-limit=5s --env-remove=LD_LIBRARY_PATH hostname" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f52bb357-a7ce-458d-9c3f-4d664eca4fbd", - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment and run this help command if you want to see all the flags for flux run\n", - "# !flux run --help" + "!flux submit -N1 -n2 sleep inf" ] }, { @@ -580,12 +1180,16 @@ "source": [ "## flux bulksubmit\n", "\n", + "
\n", + "Description: Bulk submission of jobs (not blocking)\n", + "
\n", + "\n", "The `flux bulksubmit` command enqueues jobs based on a set of inputs which are substituted on the command line, similar to `xargs` and the GNU `parallel` utility, except the jobs have access to the resources of an entire Flux instance instead of only the local system." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "id": "f0e82702", "metadata": {}, "outputs": [ @@ -593,11 +1197,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒFSHgbfxs\n", - "ƒFSHgbfxt\n", - "ƒFSHgbfxu\n", - "baz\n", + "ƒSqGSA7dh\n", + "ƒSqGSA7di\n", + "ƒSqGSA7dj\n", "bar\n", + "baz\n", "foo\n" ] } @@ -616,39 +1220,12 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "0ea1962b-1831-4bd2-8dab-c61fd710df9c", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ƒFVMZdW4X\n", - "ƒFVMZdW4Y\n", - "ƒFVMZdW4Z\n", - "ƒFVMZdW4a\n", - "ƒFVMb7VLs\n", - "ƒFVMb7VLt\n", - "ƒFVMb7VLu\n", - "ƒFVMb7VLv\n", - "ƒFVMb7VLw\n", - "ƒFVMb7VLx\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n", - "7db0bdd6f967\n" - ] - } - ], + "outputs": [], "source": [ - "!flux submit --cc=1-10 --watch hostname" + "!flux submit --cc=1-4 --watch hostname" ] }, { @@ -673,7 +1250,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 10, "id": "brazilian-former", "metadata": {}, "outputs": [ @@ -681,8 +1258,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒFZ7C25NP\n", - "ƒFZDMK6HV\n" + "ƒT2VDkrT1\n", + "ƒT2azp48w\n" ] } ], @@ -698,6 +1275,10 @@ "source": [ "## flux watch\n", "\n", + "
\n", + "Description: 👀️ Watching jobs\n", + "
\n", + "\n", "Wouldn't it be cool to submit a job and then watch it? Well, yeah! We can do this now with flux watch. Let's run a fun example, and then watch the output. We have sleeps in here interspersed with echos only to show you the live action! 🥞️\n", "Also note a nice trick - you can always use `flux job last` to get the last JOBID.\n", "Here is an example (not runnable, as notebooks don't support environment variables) for getting and saving a job id:\n", @@ -712,7 +1293,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 11, "id": "5ad231c2-4cdb-4d18-afc2-7cb3a74759c2", "metadata": {}, "outputs": [ @@ -720,11 +1301,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒGA8B8Cf1\n", - "25 blueberry pancakes on the table... 25 blueberry pancakes! 🥞️\n", - "Eat a stack, for a snack, 15 blueberry pancakes on the table! 🥄️\n", - "15 blueberry pancakes on the table... 15 blueberry pancakes! 🥞️\n", - "Throw a stack... it makes a smack! 15 blueberry pancakes on the wall! 🥞️\n", + "ƒTR3HXBfD\n", + "25 chocolate chip pancakes on the table... 25 chocolate chip pancakes! 🥞️\n", + "Eat a stack, for a snack, 15 chocolate chip pancakes on the table! 🥄️\n", + "15 chocolate chip pancakes on the table... 15 chocolate chip pancakes! 🥞️\n", + "Throw a stack... it makes a smack! 15 chocolate chip pancakes on the wall! 🥞️\n", "You got some cleaning to do 🧽️\n" ] } @@ -741,14 +1322,16 @@ "source": [ "## flux jobs\n", "\n", - "> Used for listing job properties\n", + "
\n", + "Description: Querying the status of jobs\n", + "
\n", "\n", "We can now list the jobs in the queue with `flux jobs` and we should see both jobs that we just submitted. Jobs that are instances are colored blue in output, red jobs are failed jobs, and green jobs are those that completed successfully. Note that the JupyterLab notebook may not display these colors. You will be able to see them in the terminal." ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 12, "id": "institutional-vocabulary", "metadata": {}, "outputs": [ @@ -757,9 +1340,9 @@ "output_type": "stream", "text": [ " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", - " ƒFZDMK6HV jovyan analysis R 1 1 1.492m 7db0bdd6f967\n", - " ƒFZ7C25NP jovyan simulation R 2 2 1.496m 7db0bdd6f[967,967]\n", - " ƒFCA1pkFH jovyan sleep R 2 1 2.288m 7db0bdd6f967\n" + " ƒT2azp48w jovyan analysis R 1 1 1.267m 399f5da372b0\n", + " ƒT2VDkrT1 jovyan simulation R 2 2 1.271m 399f5da372b[0,0]\n", + " ƒSdrJJshH jovyan sleep R 2 1 2.127m 399f5da372b0\n" ] } ], @@ -792,12 +1375,16 @@ "source": [ "## flux cancel\n", "\n", + "
\n", + "Description: Canceling running jobs\n", + "
\n", + "\n", "Since some of the jobs we see in the table above won't ever exit (and we didn't specify a timelimit), let's cancel them all now and free up the resources." ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 13, "id": "46dd8ec8-6c64-4d8d-9a00-949f5f58c07b", "metadata": {}, "outputs": [ @@ -823,16 +1410,18 @@ "source": [ "## flux batch\n", "\n", - "We can use the `flux batch` command to easily created nested flux instances. When `flux batch` is invoked, Flux will automatically create a nested instance that spans the resources allocated to the job, and then Flux runs the batch script passed to `flux batch` on rank 0 of the nested instance. \"Rank\" refers to the rank of the Tree-Based Overlay Network (TBON) used by the [Flux brokers](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-broker.html).\n", + "
\n", + "Description: Submitting batch jobs\n", + "
\n", "\n", - "While a batch script is expected to launch parallel jobs using `flux run` or `flux submit` at this level, nothing prevents the script from further batching other sub-batch-jobs using the `flux batch` interface, if desired.\n", + "We can use the `flux batch` command to easily created nested flux instances. When `flux batch` is invoked, Flux will automatically create a nested instance that spans the resources allocated to the job, and then Flux runs the batch script passed to `flux batch` on rank 0 of the nested instance. \"Rank\" refers to the rank of the Tree-Based Overlay Network (TBON) used by the [Flux brokers](https://flux-framework.readthedocs.io/projects/flux-core/en/latest/man1/flux-broker.html).\n", "\n", - "Note: Flux also provides a `flux alloc` which is an interactive version of `flux batch`, but demonstrating that in a Jupyter notebook is difficult due to the lack of pseudo-terminal." + "While a batch script is expected to launch parallel jobs using `flux run` or `flux submit` at this level, nothing prevents the script from further batching other sub-batch-jobs using the `flux batch` interface, if desired." ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 14, "id": "blank-carpet", "metadata": {}, "outputs": [ @@ -840,8 +1429,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒGYBWWT7d\n", - "ƒGYGiwvby\n" + "ƒThKfdhKD\n", + "ƒThRLkwsm\n" ] } ], @@ -860,7 +1449,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 15, "id": "381a3f6c-0da1-4923-801f-486ca5226d3c", "metadata": {}, "outputs": [ @@ -992,7 +1581,7 @@ "flux run -N 2 -n 2 sleep 30\n" ] }, - "execution_count": 24, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1004,7 +1593,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 16, "id": "edff8993-3c39-4f46-939d-4c8be5739fbc", "metadata": {}, "outputs": [ @@ -1012,20 +1601,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒGbBf664w\n", + "ƒU5u1XQcf\n", " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", - "\u001b[01;34m ƒGYGiwvby jovyan ./sleep_b+ R 2 2 6.797s 7db0bdd6f[967,967]\n", - "\u001b[0;0m\u001b[01;34m ƒGYBWWT7d jovyan ./sleep_b+ R 2 2 6.996s 7db0bdd6f[967,967]\n", + "\u001b[01;34m ƒThRLkwsm jovyan ./sleep_b+ R 2 2 51.17s 399f5da372b[0,0]\n", + "\u001b[0;0m\u001b[01;34m ƒThKfdhKD jovyan ./sleep_b+ R 2 2 51.39s 399f5da372b[0,0]\n", "\u001b[0;0m JOBID USER NAME ST NTASKS NNODES TIME INFO\n", - "\u001b[01;34m ƒGYGiwvby jovyan ./sleep_b+ R 2 2 6.977s 7db0bdd6f[967,967]\n", - "\u001b[0;0m\u001b[01;34m ƒGYBWWT7d jovyan ./sleep_b+ R 2 2 7.176s 7db0bdd6f[967,967]\n", + "\u001b[01;34m ƒThRLkwsm jovyan ./sleep_b+ R 2 2 51.34s 399f5da372b[0,0]\n", + "\u001b[0;0m\u001b[01;34m ƒThKfdhKD jovyan ./sleep_b+ R 2 2 51.56s 399f5da372b[0,0]\n", "\u001b[0;0m\n", - "ƒGYGiwvby:\n", - " ƒJZWVbu jovyan sleep R 2 2 6.123s 7db0bdd6f[967,967]\n", + "ƒThRLkwsm:\n", + " ƒEgNEfjm jovyan sleep R 2 2 20.11s 399f5da372b[0,0]\n", "\n", - "ƒGYBWWT7d:\n", - " ƒJnrP91 jovyan sleep R 2 2 6.302s 7db0bdd6f[967,967]\n", - "{\"version\": 1, \"execution\": {\"R_lite\": [{\"rank\": \"3\", \"children\": {\"core\": \"7\"}}], \"nodelist\": [\"7db0bdd6f967\"], \"starttime\": 1720153582, \"expiration\": 4873751530}}\n", + "ƒThKfdhKD:\n", + " ƒEga6ZzX jovyan sleep R 2 2 20.32s 399f5da372b[0,0]\n", + "{\"version\": 1, \"execution\": {\"R_lite\": [{\"rank\": \"3\", \"children\": {\"core\": \"7\"}}], \"nodelist\": [\"399f5da372b0\"], \"starttime\": 1721424338, \"expiration\": 
4875020774}}\n", "0: stdout redirected to /tmp/cheese.txt\n", "0: stderr redirected to /tmp/cheese.txt\n" ] @@ -1119,7 +1708,7 @@ "Sweet dreams 🌚️ are made of cheese, who am I to diss a brie? 🧀️" ] }, - "execution_count": 25, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1160,44 +1749,10 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "df8a8b7c-f475-4a51-8bc6-9983dc9d78ab", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", - "\u001b[01;34m ƒGYGiwvby jovyan ./sleep_b+ R 2 2 13.68s 7db0bdd6f[967,967]\n", - "\u001b[0;0m\u001b[01;34m ƒGYBWWT7d jovyan ./sleep_b+ R 2 2 13.88s 7db0bdd6f[967,967]\n", - "\u001b[0;0m\u001b[01;32m ƒGbBf664w jovyan echo CD 1 1 0.033s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[37m ƒFCA1pkFH jovyan sleep CA 2 1 2.291m 7db0bdd6f967\n", - "\u001b[0;0m\u001b[37m ƒFZ7C25NP jovyan simulation CA 2 2 1.499m 7db0bdd6f[967,967]\n", - "\u001b[0;0m\u001b[37m ƒFZDMK6HV jovyan analysis CA 1 1 1.495m 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒGA8B8Cf1 jovyan job-watch+ CD 1 1 10.05s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFf72VaCo jovyan job-watch+ CD 1 1 10.06s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLx jovyan hostname CD 1 1 0.027s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLv jovyan hostname CD 1 1 0.027s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLu jovyan hostname CD 1 1 0.026s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLw jovyan hostname CD 1 1 0.026s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLs jovyan hostname CD 1 1 0.023s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMb7VLt jovyan hostname CD 1 1 0.022s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMZdW4a jovyan hostname CD 1 1 0.021s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMZdW4Z jovyan hostname CD 1 1 0.020s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMZdW4X jovyan hostname CD 1 1 0.019s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFVMZdW4Y jovyan hostname CD 1 1 0.018s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFSHgbfxs jovyan echo CD 1 1 0.016s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFSHgbfxt jovyan echo CD 1 1 0.013s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFSHgbfxu jovyan echo CD 1 1 0.012s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFQ98A1iX jovyan hostname CD 4 1 0.044s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;31m ƒFKYd97bM jovyan false F 1 1 0.053s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒFH2K3zXZ jovyan hostname CD 1 1 0.034s 7db0bdd6f967\n", - "\u001b[0;0m\u001b[01;32m ƒF91H7Gc7 jovyan hostname CD 1 1 0.045s 7db0bdd6f967\n", - "\u001b[0;0m" - ] - } - ], + "outputs": [], "source": [ "!flux jobs -a" ] @@ -1212,7 +1767,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 18, "id": "032597d2-4b02-47ea-a5e5-915313cdd7f9", "metadata": {}, "outputs": [ @@ -1221,7 +1776,7 @@ "output_type": "stream", "text": [ " JOBID USER NAME ST NTASKS NNODES TIME INFO\n", - "\u001b[01;31m ƒFKYd97bM jovyan false F 1 1 0.053s 7db0bdd6f967\n", + "\u001b[01;31m ƒSixhuHXu jovyan false F 1 1 0.070s 399f5da372b0\n", "\u001b[0;0m" ] } @@ -1230,6 +1785,34 @@ "!flux jobs -f failed" ] }, + { + "cell_type": "markdown", + "id": "2d3e314e-98eb-487a-ad8e-1442840e37d8", + "metadata": {}, + "source": [ + "## flux alloc\n", + "\n", + "
\n", + "Description: Allocation for an interactive instance\n", + "
\n", + "\n", + "You might want to request an allocation for a set of resources (an allocation) and then attach to the interactively. This is the goal of flux alloc. Since we can't easily do that in a cell, try opening up the and doing: \n", + "\n", + "```bash\n", + "# Look at the resources you have outside of the allocation\n", + "flux resource list\n", + "\n", + "# Request an allocation with 2 \"nodes\" - a subset of what you have in total\n", + "flux alloc -N 2\n", + "\n", + "# See the resources you are given\n", + "flux resource list\n", + "\n", + "# You can exit from the allocation like this!\n", + "exit\n", + "```" + ] + }, { "cell_type": "markdown", "id": "04b405b1-219f-489c-abfc-e2983e82124a", @@ -1317,7 +1900,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 19, "id": "e82863e5-b2a1-456b-9ff1-f669b3525fa1", "metadata": {}, "outputs": [ @@ -1431,7 +2014,7 @@ "flux job wait --all" ] }, - "execution_count": 29, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1458,7 +2041,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 20, "id": "72358a03-6f1f-4c5e-91eb-cab71883a232", "metadata": {}, "outputs": [ @@ -1466,12 +2049,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒJVmi8uzX\n", - "ƒJVmi8uzX\n", - "Hello job 1 from 7db0bdd6f967 💛️\n", - "Hello job 2 from 7db0bdd6f967 💚️\n", - "Hello job 3 from 7db0bdd6f967 💙️\n", - "Hello job 4 from 7db0bdd6f967 💜️\n" + "ƒY424FfiX\n", + "ƒY424FfiX\n", + "Hello job 1 from 399f5da372b0 💛️\n", + "Hello job 2 from 399f5da372b0 💚️\n", + "Hello job 3 from 399f5da372b0 💙️\n", + "Hello job 4 from 399f5da372b0 💜️\n" ] } ], @@ -1722,7 +2305,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 22, "id": "8640a611-38e4-42b1-a913-89e0c76c8014", "metadata": {}, "outputs": [ @@ -1730,7 +2313,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒSRXzaub5\n" + "ƒYRzZSFuy\n" ] } ], @@ -1750,7 +2333,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 23, "id": "2d2b1f0b-e6c2-4583-8068-7c76fa341884", "metadata": {}, "outputs": [ @@ -1759,14 +2342,16 @@ "output_type": "stream", "text": [ ".\n", + "├── ./sub_job1.sh\n", + "│ └── ./sub_job2.sh\n", + "│ └── sleep:R\n", "├── ./hello-batch.sh:CD\n", - "├── 2*[flux-tree-Tpb37xfIP23YjqCChAoJjWshyEHYJob1:F]\n", "├── 2*[./sleep_batch.sh:CD]\n", "├── 4*[echo:CD]\n", "├── sleep:CA\n", "├── simulation:CA\n", "├── analysis:CA\n", - "├── 2*[job-watch.sh:CD]\n", + "├── job-watch.sh:CD\n", "├── 13*[hostname:CD]\n", "└── false:F\n" ] @@ -1786,10 +2371,43 @@ }, { "cell_type": "markdown", - "id": "03e2ae62-3e3b-4c82-a0c7-4c97ff1376d2", + "id": "eda1a33c-9f9e-4ba0-a013-e97601f79e41", "metadata": {}, "source": [ "# Process and Job Utilities ⚙️\n", + "\n", + "## Flux uptime\n", + "\n", + "Did someone say... [uptime](https://youtu.be/SYRlTISvjww?si=zDlvpWbBljUmZw_Q)? ☝️🕑️🕺️\n", + "\n", + "Flux provides an `uptime` utility to display properties of the Flux instance such as state of the current instance, how long it has been running, its size and if scheduling is disabled or stopped. The output shows how long the instance has been up, the instance owner, the instance depth (depth in the Flux hierarchy), and the size of the instance (number of brokers)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "095f2ac3-145b-4cda-8350-7c281f2b2b45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 22:16:17 run 1.8h, owner jovyan, depth 0, size 4\n" + ] + } + ], + "source": [ + "!flux uptime" + ] + }, + { + "cell_type": "markdown", + "id": "03e2ae62-3e3b-4c82-a0c7-4c97ff1376d2", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ "## Flux top \n", "Flux provides a feature-full version of `top` for nested Flux instances and jobs. In the JupyterLab terminal, invoke `flux top` to see the \"sleep\" jobs. If they have already completed you can resubmit them. \n", "\n", @@ -1847,13 +2465,13 @@ "id": "997faffc", "metadata": {}, "source": [ - "## Python Submission API 🐍️\n", + "# Python Submission API 🐍️\n", "Flux also provides first-class python bindings which can be used to submit jobs programmatically. The following script shows this with the `flux.job.submit()` call:" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 24, "id": "third-comment", "metadata": {}, "outputs": [], @@ -1867,7 +2485,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 25, "id": "selective-uganda", "metadata": {}, "outputs": [ @@ -1875,7 +2493,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "ƒKKdeYAGo\n" + "ƒZoXw7Pdq\n" ] } ], @@ -1898,7 +2516,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 26, "id": "ed65cb46-8d8a-41f0-bec1-92b9a89e6db2", "metadata": {}, "outputs": [ @@ -1906,9 +2524,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "🎉️ Hooray, we just submitted ƒKLZWG53M!\n", + "🎉️ Hooray, we just submitted ƒZrqPNeNb!\n", "{\n", - " \"t_depend\": 1720153943.7848454,\n", + " \"t_depend\": 1721425098.682836,\n", " \"t_run\": 0.0,\n", " \"t_cleanup\": 0.0,\n", " \"t_inactive\": 0.0,\n", @@ -1928,8 +2546,8 @@ " \"success\": \"\",\n", " \"result\": \"\",\n", " \"waitstatus\": \"\",\n", - " \"id\": 40488354709504,\n", - " \"t_submit\": 1720153943.7735925,\n", + " \"id\": 72552617607168,\n", + " \"t_submit\": 1721425098.6718118,\n", " \"t_remaining\": 0.0,\n", " \"state\": \"SCHED\",\n", " \"username\": \"jovyan\",\n", @@ -1963,7 +2581,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 27, "id": "5d679897-7054-4f96-b340-7f39245aca89", "metadata": {}, "outputs": [ @@ -1971,8 +2589,8 @@ "name": "stdout", "output_type": "stream", "text": [ - " ƒSRkkNjD9 jovyan compute.py F 1 1 0.014s 993a4f746854\n", - " ƒSRjxR7UT jovyan compute.py F 1 1 0.019s 993a4f746854\n" + " ƒZrqPNeNb jovyan compute.py F 1 1 0.009s 399f5da372b0\n", + " ƒZoXw7Pdq jovyan compute.py F 1 1 0.011s 399f5da372b0\n" ] } ], @@ -1990,7 +2608,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 28, "id": "efa06478", "metadata": {}, "outputs": [ @@ -2137,7 +2755,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 29, "id": "cleared-lawsuit", "metadata": {}, "outputs": [ @@ -2145,10 +2763,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "bulksubmit_executor: submitted 200 jobs in 0.28s. 721.87job/s\n", - "bulksubmit_executor: First job finished in about 0.328s\n", - "|██████████████████████████████████████████████████████████| 100.0% (174.4 job/s)\n", - "bulksubmit_executor: Ran 200 jobs in 1.3s. 153.7 job/s\n" + "bulksubmit_executor: submitted 200 jobs in 0.24s. 
831.05job/s\n", + "bulksubmit_executor: First job finished in about 0.254s\n", + "|██████████████████████████████████████████████████████████| 100.0% (278.2 job/s)\n", + "bulksubmit_executor: Ran 200 jobs in 0.9s. 221.8 job/s\n" ] } ],