diff --git a/doc/man7/flux-broker-attributes.rst b/doc/man7/flux-broker-attributes.rst index 8872bc7e6f2f..6d0ea6311a40 100644 --- a/doc/man7/flux-broker-attributes.rst +++ b/doc/man7/flux-broker-attributes.rst @@ -64,11 +64,6 @@ tbon.maxlevel tbon.endpoint The endpoint for the tree based overlay network to communicate over. - Format specifier "%h" can be used to specify the IP address of the - host and is useful when configuring an IP endpoint. Format specifier - "%B" can be used to specify the value of the attribute broker.rundir. - It is useful when configuring an IPC endpoint. Defaults to - "tcp://%h:\*". SOCKET ATTRIBUTES diff --git a/src/broker/boot_config.c b/src/broker/boot_config.c index 763794a185d5..fac7b45004c4 100644 --- a/src/broker/boot_config.c +++ b/src/broker/boot_config.c @@ -24,6 +24,7 @@ #include "src/common/libutil/log.h" #include "src/common/libutil/kary.h" #include "src/common/libutil/errno_safe.h" +#include "src/common/libpmi/clique.h" #include "attr.h" #include "overlay.h" @@ -250,6 +251,8 @@ int boot_config_attr (attr_t *attrs, json_t *hosts) char *s = NULL; size_t index; json_t *value; + char buf[1024]; + char *val; int rv = -1; if (!hosts || json_array_size (hosts) == 0) @@ -280,6 +283,30 @@ int boot_config_attr (attr_t *attrs, json_t *hosts) goto error; } + /* Generate broker.mapping. + * For now, set it to NULL if there are multiple brokers per node. + */ + hostlist_uniq (hl); + if (hostlist_count (hl) < json_array_size (hosts)) + val = NULL; + else { + struct pmi_map_block mapblock = { + .nodeid = 0, + .nodes = json_array_size (hosts), + .procs = 1 + }; + if (pmi_process_mapping_encode (&mapblock, 1, buf, sizeof (buf)) < 0) { + log_msg ("encoding broker.mapping"); + errno = EOVERFLOW; + goto error; + } + val = buf; + } + if (attr_add (attrs, "broker.mapping", val, FLUX_ATTRFLAG_IMMUTABLE) < 0) { + log_err ("setattr broker.mapping"); + goto error; + } + rv = 0; error: hostlist_destroy (hl); @@ -419,15 +446,6 @@ int boot_config (flux_t *h, struct overlay *overlay, attr_t *attrs, int tbon_k) uint32_t size; json_t *hosts = NULL; - /* Throw an error if 'tbon.endpoint' attribute is already set. - * flux-start sets this, and it's not compatible with the - * config boot method as it would be overwritten below. - */ - if (attr_get (attrs, "tbon.endpoint", NULL, NULL) == 0) { - log_msg ("attr tbon.endpoint may not be set with [bootstrap] config"); - return -1; - } - /* Ingest the [bootstrap] stanza. */ if (boot_config_parse (flux_get_conf (h), &conf, &hosts) < 0) diff --git a/src/broker/boot_pmi.c b/src/broker/boot_pmi.c index c474f69a8436..833a036cb055 100644 --- a/src/broker/boot_pmi.c +++ b/src/broker/boot_pmi.c @@ -21,6 +21,7 @@ #include "src/common/libutil/kary.h" #include "src/common/libpmi/pmi.h" #include "src/common/libpmi/pmi_strerror.h" +#include "src/common/libpmi/clique.h" #include "attr.h" #include "overlay.h" @@ -28,90 +29,6 @@ #include "pmiutil.h" -/* Generally accepted max, although some go higher (IE is 2083) */ -#define ENDPOINT_MAX 2048 - -/* Given a string with possible format specifiers, return string that is - * fully expanded. - * - * Possible format specifiers: - * - %h - local IP address by heuristic (see src/libutil/ipaddr.h) - * - %B - value of attribute broker.rundir - * - * Caller is responsible for freeing memory of returned value. - */ -static char * format_endpoint (attr_t *attrs, const char *endpoint) -{ - char ipaddr[HOST_NAME_MAX + 1]; - char *ptr, *buf, *rv = NULL; - bool percent_flag = false; - unsigned int len = 0; - const char *rundir; - char error[200]; - - if (!(buf = calloc (1, ENDPOINT_MAX + 1))) { - errno = ENOMEM; - return NULL; - } - - ptr = (char *)endpoint; - while (*ptr) { - if (percent_flag) { - if (*ptr == 'h') { - if (ipaddr_getprimary (ipaddr, sizeof (ipaddr), - error, sizeof (error)) < 0) { - log_msg ("%s", error); - goto done; - } - if ((len + strlen (ipaddr)) > ENDPOINT_MAX) { - log_msg ("ipaddr overflow max endpoint length"); - goto done; - } - strcat (buf, ipaddr); - len += strlen (ipaddr); - } - else if (*ptr == 'B') { - if (attr_get (attrs, "broker.rundir", &rundir, NULL) < 0) { - log_msg ("broker.rundir attribute is not set"); - goto done; - } - if ((len + strlen (rundir)) > ENDPOINT_MAX) { - log_msg ("broker.rundir overflow max endpoint length"); - goto done; - } - strcat (buf, rundir); - len += strlen (rundir); - } - else if (*ptr == '%') - buf[len++] = '%'; - else { - buf[len++] = '%'; - buf[len++] = *ptr; - } - percent_flag = false; - } - else { - if (*ptr == '%') - percent_flag = true; - else - buf[len++] = *ptr; - } - - if (len >= ENDPOINT_MAX) { - log_msg ("overflow max endpoint length"); - goto done; - } - - ptr++; - } - - rv = buf; -done: - if (!rv) - free (buf); - return (rv); -} - /* If the broker is launched via flux-shell, then the shell may opt * to set a "flux.instance-level" parameter in the PMI kvs to tell * the booting instance at what "level" it will be running, i.e. the @@ -145,6 +62,86 @@ static int set_instance_level_attr (struct pmi_handle *pmi, return 0; } +/* Set broker.mapping attribute from enclosing instance PMI_process_mapping. + */ +static int set_broker_mapping_attr (struct pmi_handle *pmi, + const char *kvsname, + attr_t *attrs) +{ + char buf[1024]; + char *val = NULL; + + if (broker_pmi_kvs_get (pmi, + kvsname, + "PMI_process_mapping", + buf, + sizeof (buf)) == PMI_SUCCESS) + val = buf; + if (attr_add (attrs, "broker.mapping", val, FLUX_ATTRFLAG_IMMUTABLE) < 0) + return -1; + return 0; +} + +/* Check if IPC can be used to communicate. + * Currently this only goes so far as to check if the process mapping of + * brokers has all brokers on the same node. We could check if all peers + * are on the same node, but given how the TBON maps to rank assignments, + * it is fairly unlikely. + */ +static bool use_ipc (attr_t *attrs) +{ + bool result = false; + struct pmi_map_block *blocks = NULL; + int nblocks; + const char *val; + + if (attr_get (attrs, "broker.mapping", &val, NULL) < 0 || !val) + goto done; + if (pmi_process_mapping_parse (val, &blocks, &nblocks) < 0) + goto done; + if (nblocks == 1 && blocks[0].nodes == 1) // one node + result = true; +done: + free (blocks); + return result; +} + +/* Build URI for broker TBON to bind to. + * If IPC, use '/tbon-' which should be unique if there are + * multiple brokers and/or multiple instances per node. + * If using TCP, choose the address to be the one associated with the default + * route (see src/common/libutil/ipaddr.h), and a randomly chosen port. + */ +static int format_bind_uri (char *buf, int bufsz, attr_t *attrs, int rank) +{ + if (use_ipc (attrs)) { + const char *rundir; + + if (attr_get (attrs, "rundir", &rundir, NULL) < 0) { + log_err ("rundir attribute is not set"); + return -1; + } + if (snprintf (buf, bufsz, "ipc://%s/tbon-%d", rundir, rank) >= bufsz) + goto overflow; + } + else { + char ipaddr[HOST_NAME_MAX + 1]; + char error[200]; + + if (ipaddr_getprimary (ipaddr, sizeof (ipaddr), + error, sizeof (error)) < 0) { + log_err ("%s", error); + return -1; + } + if (snprintf (buf, bufsz, "tcp://%s:*", ipaddr) >= bufsz) + goto overflow; + } + return 0; +overflow: + log_msg ("buffer overflow while building bind URI"); + return -1; +} + int boot_pmi (struct overlay *overlay, attr_t *attrs, int tbon_k) { int rank; @@ -175,6 +172,10 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs, int tbon_k) log_err ("set_instance_level_attr"); goto error; } + if (set_broker_mapping_attr (pmi, pmi_params.kvsname, attrs) < 0) { + log_err ("error setting broker.mapping attribute"); + goto error; + } if (overlay_init (overlay, pmi_params.size, pmi_params.rank, tbon_k) < 0) goto error; @@ -192,25 +193,19 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs, int tbon_k) pmi_params.size, pmi_params.rank, 0) != KARY_NONE) { - const char *fmt; - char *tmp; + char buf[1024]; - if (attr_get (attrs, "tbon.endpoint", &fmt, NULL) < 0) - fmt = "tcp://%h:*"; - if (!(tmp = format_endpoint (attrs, fmt))) + if (format_bind_uri (buf, sizeof (buf), attrs, pmi_params.rank) < 0) goto error; - if (overlay_bind (overlay, tmp) < 0) { - log_err ("overlay_bind %s failed", tmp); - free (tmp); + if (overlay_bind (overlay, buf) < 0) { + log_err ("error binding to %s", buf); goto error; } - free (tmp); uri = overlay_get_bind_uri (overlay); } else { uri = NULL; } - (void)attr_delete (attrs, "tbon.endpoint", true); if (attr_add (attrs, "tbon.endpoint", uri, FLUX_ATTRFLAG_IMMUTABLE) < 0) { log_err ("setattr tbon.endpoint"); goto error; diff --git a/src/cmd/flux-start.c b/src/cmd/flux-start.c index 08b2c670d383..3050cbd3b753 100644 --- a/src/cmd/flux-start.c +++ b/src/cmd/flux-start.c @@ -29,6 +29,7 @@ #include "src/common/libutil/cleanup.h" #include "src/common/libutil/setenvf.h" #include "src/common/libpmi/simple_server.h" +#include "src/common/libpmi/clique.h" #include "src/common/libpmi/dgetline.h" #define DEFAULT_KILLER_TIMEOUT 20.0 @@ -94,6 +95,8 @@ static struct optparse_option opts[] = { .usage = "Trace pmi simple server protocol exchange", }, { .name = "scratchdir", .key = 'D', .has_arg = 1, .arginfo = "DIR", .usage = "Use DIR as scratch directory", }, + { .name = "noclique", .key = 'c', .has_arg = 0, .arginfo = NULL, + .usage = "Don't set PMI_process_mapping in PMI KVS", }, /* Option group 1, these options will be listed after those above */ { .group = 1, @@ -209,6 +212,8 @@ int main (int argc, char *argv[]) case BOOTSTRAP_PMI: if (optparse_hasopt (ctx.opts, "scratchdir")) log_msg_exit ("--scratchdir only works with --bootstrap=selfpmi"); + if (optparse_hasopt (ctx.opts, "noclique")) + log_msg_exit ("--noclique only works with --bootstrap=selfpmi"); status = exec_broker (command, len, broker_path); break; case BOOTSTRAP_SELFPMI: @@ -481,7 +486,6 @@ struct client *client_create (const char *broker_path, const char *scratch_dir, argz_add (&argz, &argz_len, broker_path); char *dir_arg = xasprintf ("--setattr=rundir=%s", scratch_dir); argz_add (&argz, &argz_len, dir_arg); - argz_add (&argz, &argz_len, "--setattr=tbon.endpoint=ipc://%B/req"); free (dir_arg); add_args_list (&argz, &argz_len, ctx.opts, "broker-opts"); if (rank == 0 && cmd_argz) @@ -546,8 +550,22 @@ void pmi_server_initialize (int flags) .debug_trace = pmi_debug_trace, }; int appnum = 0; + if (!(ctx.pmi.kvs = zhash_new())) oom (); + + if (!optparse_hasopt (ctx.opts, "noclique")) { + struct pmi_map_block mapblock = { + .nodeid = 0, + .nodes = 1, + .procs = ctx.size, + }; + char buf[256]; + if (pmi_process_mapping_encode (&mapblock, 1, buf, sizeof (buf)) < 0) + log_msg_exit ("error encoding PMI_process_mapping"); + zhash_update (ctx.pmi.kvs, "PMI_process_mapping", xstrdup (buf)); + } + ctx.pmi.srv = pmi_simple_server_create (ops, appnum, ctx.size, ctx.size, "-", flags, NULL); if (!ctx.pmi.srv) diff --git a/src/shell/lua.d/openmpi.lua b/src/shell/lua.d/openmpi.lua index 38a1d438529f..8f24feb7dcfa 100644 --- a/src/shell/lua.d/openmpi.lua +++ b/src/shell/lua.d/openmpi.lua @@ -8,8 +8,5 @@ -- SPDX-License-Identifier: LGPL-3.0 ------------------------------------------------------------- -local f = require 'flux'.new () -local rundir = f:getattr ('broker.rundir') -shell.setenv ("OMPI_MCA_orte_tmpdir_base", rundir) shell.setenv ("OMPI_MCA_pmix", "flux") shell.setenv ("OMPI_MCA_schizo", "flux") diff --git a/src/shell/lua.d/spectrum.lua b/src/shell/lua.d/spectrum.lua index 67545fa32ad6..78b21b524c7a 100644 --- a/src/shell/lua.d/spectrum.lua +++ b/src/shell/lua.d/spectrum.lua @@ -44,19 +44,12 @@ local function strip_env_by_prefix (env, prefix) end end -local f = require 'flux'.new() -local rundir = f:getattr ('broker.rundir') - local env = shell.getenv() -- Clear all existing PMIX_ and OMPI_ values before setting our own strip_env_by_prefix (env, "PMIX_") strip_env_by_prefix (env, "OMPI_") --- Avoid shared memory segment name collisions --- when flux instance runs >1 broker per node. -shell.setenv ('OMPI_MCA_orte_tmpdir_base', rundir) - -- Assumes the installation paths of Spectrum MPI on LLNL's Sierra shell.setenv ('OMPI_MCA_osc', "pt2pt") shell.setenv ('OMPI_MCA_pml', "yalla") diff --git a/src/shell/pmi/pmi.c b/src/shell/pmi/pmi.c index 45455793ecd6..fc5b2e4eb54a 100644 --- a/src/shell/pmi/pmi.c +++ b/src/shell/pmi/pmi.c @@ -362,39 +362,92 @@ static void pmi_fd_cb (flux_shell_task_t *task, } } +/* Query broker to see if instance mapping is known, then use that information + * to select whether process mapping should be "none", "single", or "pershell". + */ +static const char *guess_clique_option (struct shell_pmi *pmi) +{ + const char *val; + struct pmi_map_block *blocks = NULL; + int nblocks; + const char *opt = "none"; + + if (pmi->shell->standalone) + goto done; + if (!(val = flux_attr_get (pmi->shell->h, "broker.mapping"))) + goto done; + if (pmi_process_mapping_parse (val, &blocks, &nblocks) < 0) + goto done; + if (nblocks == 1 && blocks[0].nodes == 1) // one node + opt = "single"; + else if (nblocks == 1 && blocks[0].procs == 1) // one broker per node + opt = "pershell"; +done: + free (blocks); + return opt; +} + /* Generate 'PMI_process_mapping' key (see RFC 13) for MPI clique computation. - * - * Create an array of pmi_map_block structures, sized for worst case mapping - * (no compression possible). Walk through the rcalc info for each shell rank. - * If shell's mapping looks identical to previous one, increment block->nodes; - * otherwise consume another array slot. Finally, encode to string, put it - * in the local KVS hash, and free array. */ -static int init_clique (struct shell_pmi *pmi) +static int init_clique (struct shell_pmi *pmi, const char *opt) { - struct pmi_map_block *blocks; + struct pmi_map_block *blocks = NULL; int nblocks; int i; char val[SIMPLE_KVS_VAL_MAX]; - if (!(blocks = calloc (pmi->shell->info->shell_size, sizeof (*blocks)))) - return -1; - nblocks = 0; - - for (i = 0; i < pmi->shell->info->shell_size; i++) { - struct rcalc_rankinfo ri; - - if (rcalc_get_nth (pmi->shell->info->rcalc, i, &ri) < 0) - goto error; - if (nblocks == 0 || blocks[nblocks - 1].procs != ri.ntasks) { - blocks[nblocks].nodeid = i; - blocks[nblocks].procs = ri.ntasks; - blocks[nblocks].nodes = 1; - nblocks++; + if (!opt) + opt = guess_clique_option (pmi); + + /* pmi.clique=pershell (default): one clique per shell. + * Create an array of pmi_map_block structures, sized for worst case + * mapping (no compression possible). Walk through the rcalc info for + * each shell rank. If shell's mapping looks identical to previous one, + * increment block->nodes; otherwise consume another array slot. + */ + if (!strcmp (opt, "pershell")) { + if (!(blocks = calloc (pmi->shell->info->shell_size, sizeof (*blocks)))) + return -1; + nblocks = 0; + + for (i = 0; i < pmi->shell->info->shell_size; i++) { + struct rcalc_rankinfo ri; + + if (rcalc_get_nth (pmi->shell->info->rcalc, i, &ri) < 0) + goto error; + if (nblocks == 0 || blocks[nblocks - 1].procs != ri.ntasks) { + blocks[nblocks].nodeid = i; + blocks[nblocks].procs = ri.ntasks; + blocks[nblocks].nodes = 1; + nblocks++; + } + else + blocks[nblocks - 1].nodes++; } - else - blocks[nblocks - 1].nodes++; } + /* pmi.clique=single: all procs are on the same node. + */ + else if (!strcmp (opt, "single")) { + if (!(blocks = calloc (1, sizeof (*blocks)))) + return -1; + nblocks = 1; + blocks[0].nodeid = 0; + blocks[0].procs = pmi->shell->info->total_ntasks; + blocks[0].nodes = 1; + } + /* pmi.clique=none: disable PMI_process_mapping generation. + */ + else if (!strcmp (opt, "none")) { + goto out; + } + else { + shell_log_error ("pmi.clique=%s is invalid", opt); + goto error; + } + + /* Encode to string, and store to local KVS hash. + */ + /* If value exceeds SIMPLE_KVS_VAL_MAX, skip setting the key * without generating an error. The client side will not treat * a missing key as an error. It should be unusual though so log it. @@ -462,15 +515,20 @@ static struct pmi_simple_ops shell_pmi_ops = { .abort = shell_pmi_abort, }; -static int parse_args (flux_shell_t *shell, int *exchange_k, const char **kvs) +static int parse_args (flux_shell_t *shell, + int *exchange_k, + const char **kvs, + const char **clique) { if (flux_shell_getopt_unpack (shell, "pmi", - "{s?s s?{s?i}}", + "{s?s s?{s?i} s?s}", "kvs", kvs, "exchange", - "k", exchange_k) < 0) + "k", exchange_k, + "clique", + clique) < 0) return -1; return 0; } @@ -483,12 +541,13 @@ static struct shell_pmi *pmi_create (flux_shell_t *shell) char kvsname[32]; const char *kvs = "exchange"; int exchange_k = 0; // 0=use default tree fanout + const char *clique = NULL; if (!(pmi = calloc (1, sizeof (*pmi)))) return NULL; pmi->shell = shell; - if (parse_args (shell, &exchange_k, &kvs) < 0) + if (parse_args (shell, &exchange_k, &kvs, &clique) < 0) goto error; if (!strcmp (kvs, "native")) { shell_pmi_ops.kvs_put = native_kvs_put; @@ -534,7 +593,7 @@ static struct shell_pmi *pmi_create (flux_shell_t *shell) errno = ENOMEM; goto error; } - if (init_clique (pmi) < 0) + if (init_clique (pmi, clique) < 0) goto error; if (!shell->standalone) { if (set_flux_instance_level (pmi) < 0) diff --git a/t/t0001-basic.t b/t/t0001-basic.t index ce2beaebd986..185216f0196d 100755 --- a/t/t0001-basic.t +++ b/t/t0001-basic.t @@ -90,6 +90,12 @@ test_expect_success 'flux-start --bootstrap=selfpmi fails (no size specified)' " test_expect_success 'flux-start --size=1 --boostrap=pmi fails' " test_must_fail flux start ${ARGS} --size=1 --bootstrap=pmi /bin/true " +test_expect_success 'flux-start --scratchdir --boostrap=pmi fails' " + test_must_fail flux start ${ARGS} --scratchdir=$(pwd) --bootstrap=pmi /bin/true +" +test_expect_success 'flux-start --noclique --boostrap=pmi fails' " + test_must_fail flux start ${ARGS} --noclique --bootstrap=pmi /bin/true +" test_expect_success 'flux-start in exec mode passes through errors from command' " test_must_fail flux start ${ARGS} /bin/false " @@ -165,23 +171,19 @@ test_expect_success 'tbon.endpoint can be read' ' ATTR_VAL=`flux start ${ARGS} -s2 flux getattr tbon.endpoint` && echo $ATTR_VAL | grep "://" ' -test_expect_success 'tbon.endpoint can be set and %h works' ' - flux start ${ARGS} -s2 -o,--setattr=tbon.endpoint=tcp://%h:* \ - flux getattr tbon.endpoint >pct_h.out && - grep "^tcp" pct_h.out && - test_must_fail grep "%h" pct_h.out -' -test_expect_success 'tbon.endpoint with %B works' ' - flux start ${ARGS} -s2 -o,--setattr=tbon.endpoint=ipc://%B/req \ - flux getattr tbon.endpoint >pct_B.out && - grep "^ipc" pct_B.out && - test_must_fail grep "%B" pct_B.out -' -# N.B. rank 1 has to be killed in this test after rank 0 fails gracefully -# so test_must_fail won't work here -test_expect_success 'tbon.endpoint fails on bad endpoint' ' - ! flux start ${ARGS} -s2 --killer-timeout=0.2 \ - -o,--setattr=tbon.endpoint=foo://bar /bin/true +test_expect_success 'tbon.endpoint uses ipc:// in standalone instance' ' + flux start ${ARGS} -s2 \ + flux getattr tbon.endpoint >endpoint.out && + grep "^ipc://" endpoint.out +' +test_expect_success 'tbon.endpoint uses tcp:// if process mapping unavailable' ' + flux start ${ARGS} -s2 --noclique \ + flux getattr tbon.endpoint >endpoint2.out && + grep "^tcp" endpoint2.out +' +test_expect_success 'tbon.endpoint cannot be set' ' + test_must_fail flux start ${ARGS} -s2 \ + -o,--setattr=tbon.endpoint=ipc:///tmp/customflux /bin/true ' test_expect_success 'tbon.parent-endpoint cannot be read on rank 0' ' test_must_fail flux start ${ARGS} -s2 flux getattr tbon.parent-endpoint diff --git a/t/t2601-job-shell-standalone.t b/t/t2601-job-shell-standalone.t index 693e6ddc05c8..795d44596e95 100755 --- a/t/t2601-job-shell-standalone.t +++ b/t/t2601-job-shell-standalone.t @@ -135,7 +135,8 @@ test_expect_success 'flux-shell: shell PMI works' ' >pmi_info.out 2>pmi_info.err ' test_expect_success 'flux-shell: shell PMI exports clique info' ' - flux jobspec srun -N1 -n8 ${PMI_INFO} -c >j8pmi_clique && + flux mini run -opmi.clique=pershell --dry-run -N1 -n8 \ + ${PMI_INFO} -c >j8pmi_clique && ${FLUX_SHELL} -v -s -r 0 -j j8pmi_clique -R R8 51 \ >pmi_clique.out 2>pmi_clique.err && COUNT=$(grep "clique=0,1,2,3,4,5,6,7" pmi_clique.out | wc -l) && diff --git a/t/t2602-job-shell.t b/t/t2602-job-shell.t index 627fa57ed2aa..8f5204d04c6d 100755 --- a/t/t2602-job-shell.t +++ b/t/t2602-job-shell.t @@ -62,9 +62,15 @@ test_expect_success 'job-shell: PMI works' ' flux job attach $id >pmi_info.out 2>pmi_info.err && grep size=4 pmi_info.out ' +test_expect_success 'pmi-shell: bad pmi.clique option fails' ' + test_must_fail flux mini run -opmi.clique=badopt \ + /bin/true 2>badopt.err && + grep "pmi.clique=badopt is invalid" badopt.err +' + test_expect_success 'pmi-shell: PMI cliques are correct for 1 ppn' ' - id=$(flux jobspec srun -N4 -n4 ${PMI_INFO} -c | flux job submit) && - flux job attach $id >pmi_clique1.raw && + flux mini run -opmi.clique=pershell -N4 -n4 \ + ${PMI_INFO} -c >pmi_clique1.raw && sort -snk1 pmi_clique1.out && sort >pmi_clique1.exp <<-EOT && 0: clique=0 @@ -75,8 +81,8 @@ test_expect_success 'pmi-shell: PMI cliques are correct for 1 ppn' ' test_cmp pmi_clique1.exp pmi_clique1.out ' test_expect_success 'pmi-shell: PMI cliques are correct for 2 ppn' ' - id=$(flux jobspec srun -N2 -n4 ${PMI_INFO} -c | flux job submit) && - flux job attach $id >pmi_clique2.raw && + flux mini run -opmi.clique=pershell \ + -N2 -n4 ${PMI_INFO} -c >pmi_clique2.raw && sort -snk1 pmi_clique2.out && sort >pmi_clique2.exp <<-EOT && 0: clique=0,1 @@ -87,8 +93,8 @@ test_expect_success 'pmi-shell: PMI cliques are correct for 2 ppn' ' test_cmp pmi_clique2.exp pmi_clique2.out ' test_expect_success 'pmi-shell: PMI cliques are correct for irregular ppn' ' - id=$(flux jobspec srun -N4 -n5 ${PMI_INFO} -c | flux job submit) && - flux job attach $id >pmi_cliquex.raw && + flux mini run -opmi.clique=pershell -N4 -n5 \ + ${PMI_INFO} -c >pmi_cliquex.raw && sort -snk1 pmi_cliquex.out && sort >pmi_cliquex.exp <<-EOT && 0: clique=0,1 diff --git a/t/t3000-mpi-basic.t b/t/t3000-mpi-basic.t index b9ecaa88c75a..eee76732add4 100755 --- a/t/t3000-mpi-basic.t +++ b/t/t3000-mpi-basic.t @@ -10,37 +10,25 @@ if test -z "$FLUX_TEST_MPI"; then test_done fi -if ! test -x ${FLUX_BUILD_DIR}/t/mpi/hello; then +HELLO=${FLUX_BUILD_DIR}/t/mpi/hello +if ! test -x ${HELLO}; then skip_all='skipping MPI tests, MPI not available/configured' test_done fi -# Size the session to one more than the number of cores, minimum of 4 -SIZE=$(test_size_large) -test_under_flux ${SIZE} -echo "# $0: flux session size will be ${SIZE}" - -# Usage: run_program timeout ntasks nnodes -run_program() { - local timeout=$1 - local ntasks=$2 - local nnodes=$3 - local opts=$4 - shift 3 - run_timeout $timeout flux mini run \ - -n${ntasks} -N${nnodes} $* +# rc1-job ensures there are 2 cores per node +SIZE=2 +MAX_MPI_SIZE=$(($SIZE*2)) +test_under_flux $SIZE job + +hello_world() { + run_timeout 30 flux mini run -n$1 ${HELLO} >hello-$1.out && + grep -q "are $1 tasks" hello-$1.out && + test_debug "cat hello-$1.out" } -test_expect_success "mpi hello singleton" ' - run_program 15 1 1 ${FLUX_BUILD_DIR}/t/mpi/hello >single.out && - test_debug "cat single.out" -' - -test_expect_success "mpi hello all ranks" ' - run_program 15 ${SIZE} ${SIZE} ${FLUX_BUILD_DIR}/t/mpi/hello \ - > allranks.out && - test_debug "cat allranks.out" && - grep -q "There are ${SIZE} tasks" allranks.out -' +for size in $(seq 1 ${MAX_MPI_SIZE}); do + test_expect_success "mpi hello size=${size}" "hello_world ${size}" +done test_done diff --git a/t/t3002-pmi.t b/t/t3002-pmi.t index 017b0e9d72d4..a6dac294fff9 100755 --- a/t/t3002-pmi.t +++ b/t/t3002-pmi.t @@ -16,7 +16,8 @@ test_expect_success 'pmi_info works' ' ' test_expect_success 'pmi_info --clique shows each node with own clique' ' - flux mini run -n${SIZE} -N${SIZE} ${pmi_info} --clique >clique.out && + flux mini run -opmi.clique=pershell -n${SIZE} -N${SIZE} \ + ${pmi_info} --clique >clique.out && count=$(cut -f2 -d: clique.out | sort | uniq | wc -l) && test $count -eq ${SIZE} '