diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 2f5c2053683..060c3790d57 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -53,6 +53,7 @@ Environment variables in this section only apply to the server side. |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.| |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.| |DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.| +|DAOS\_POOL\_RF|Redundancy factor for the pool. The valid range is [1, 4]. The default value is 2.| ## Server and Client environment variables diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index 388a81d8700..36907a2e31f 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -916,6 +916,30 @@ and possibly repair a pmemobj file. As discussed in the previous section, the rebuild status can be consulted via the pool query and will be expanded with more information. +## Pool-Wise Redundancy Factor + +If the DAOS system experiences cascading failures, where the number of failed +fault domains exceeds a pool's redundancy factor, there could be unrecoverable +errors and applications could suffer from data loss. This can happen in cases +of power or network outages and would cause node/engine failures. In most cases +those failures can be recovered and DAOS engines can be restarted and the system +can function again. + +Administrators can set the default pool redundancy factor via the environment variable +"DAOS_POOL_RF" in the server yaml file. If SWIM detects and reports an engine is +dead and the number of failed fault domains exceeds or is going to exceed the pool +redundancy factor, it will not change the pool map immediately. 
Instead, it will give a +critical log message: +intolerable unavailability: engine rank x +In this case, the system administrator should check and try to recover those +failed engines and bring them back with: +dmg system start --ranks=x +one by one. A reintegrate call is not needed. + +For true unrecoverable failures, the administrator can still exclude engines. +However, data loss is expected as the number of unrecoverable failures exceeds +the pool redundancy factor. + ## Recovering Container Ownership Typically users are expected to manage their containers. However, in the event diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 9113ca22531..56e6da3ad9b 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -668,7 +668,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c int rc = 0; bool unknown; - dom = pool_map_find_node_by_rank(map, mbs->cpm_rank); + dom = pool_map_find_dom_by_rank(map, mbs->cpm_rank); if (dom == NULL) { D_ASSERT(mbs->cpm_rank != dss_self_rank()); @@ -777,7 +777,7 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map) int j; bool down; - rank_nr = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + rank_nr = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (rank_nr <= 0) D_GOTO(out, rc = rank_nr); diff --git a/src/common/pool_map.c b/src/common/pool_map.c index 7d7b38adb6c..1712f398dcb 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -1573,7 +1573,7 @@ add_domain_tree_to_pool_buf(struct pool_map *map, struct pool_buf *map_buf, if (map) { struct pool_domain *found_dom; - found_dom = pool_map_find_node_by_rank(map, rank); + found_dom = pool_map_find_dom_by_rank(map, rank); if (found_dom) { if (found_dom->do_comp.co_status == PO_COMP_ST_NEW) found_new_dom = true; @@ -2038,7 +2038,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, } /** - * Find all nodes in the pool map. + * Find all ranks in the pool map. 
* * \param map [IN] pool map to search. * \param id [IN] id to search. @@ -2048,7 +2048,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, * 0 if none. */ int -pool_map_find_nodes(struct pool_map *map, uint32_t id, +pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp) { return pool_map_find_domain(map, PO_COMP_TP_RANK, id, @@ -2102,14 +2102,14 @@ pool_map_find_target(struct pool_map *map, uint32_t id, * \return domain found by rank. */ struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank) +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank) { struct pool_domain *doms; struct pool_domain *found = NULL; int doms_cnt; int i; - doms_cnt = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (doms_cnt <= 0) return NULL; @@ -2150,7 +2150,7 @@ pool_map_find_targets_on_ranks(struct pool_map *map, d_rank_list_t *rank_list, for (i = 0; i < rank_list->rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank_list->rl_ranks[i]); + dom = pool_map_find_dom_by_rank(map, rank_list->rl_ranks[i]); if (dom == NULL) { pool_target_id_list_free(tgts); return 0; @@ -2191,7 +2191,7 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank, { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2867,7 +2867,7 @@ pool_map_find_by_rank_status(struct pool_map *map, *tgt_ppp = NULL; *tgt_cnt = 0; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2902,7 +2902,7 @@ pool_map_get_ranks(uuid_t pool_uuid, struct pool_map *map, bool get_enabled, d_r struct pool_domain *domains = NULL; d_rank_list_t *ranklist = NULL; - nnodes_tot = pool_map_find_nodes(map, PO_COMP_ID_ALL, &domains); + nnodes_tot = pool_map_find_ranks(map, 
PO_COMP_ID_ALL, &domains); for (i = 0; i < nnodes_tot; i++) { if (pool_map_node_status_match(&domains[i], ENABLED)) nnodes_enabled++; diff --git a/src/container/cli.c b/src/container/cli.c index 590f689333b..cd43667a2a4 100644 --- a/src/container/cli.c +++ b/src/container/cli.c @@ -3386,7 +3386,7 @@ dc_cont_node_id2ptr(daos_handle_t coh, uint32_t node_id, pool = dc_hdl2pool(dc->dc_pool_hdl); D_ASSERT(pool != NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - n = pool_map_find_nodes(pool->dp_map, node_id, dom); + n = pool_map_find_ranks(pool->dp_map, node_id, dom); D_RWLOCK_UNLOCK(&pool->dp_map_lock); dc_pool_put(pool); dc_cont_put(dc); diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 9071f8f731c..372da43afe4 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1667,7 +1667,7 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, { struct cont_ec_agg *ec_agg = NULL; struct pool_domain *doms; - int node_nr; + int rank_nr; int rc = 0; int i; @@ -1676,19 +1676,18 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, return -DER_NOMEM; D_ASSERT(cont_svc->cs_pool->sp_map != NULL); - node_nr = pool_map_find_nodes(cont_svc->cs_pool->sp_map, - PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(cont_svc->cs_pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(ec_agg->ea_server_ephs, node_nr); + D_ALLOC_ARRAY(ec_agg->ea_server_ephs, rank_nr); if (ec_agg->ea_server_ephs == NULL) D_GOTO(out, rc = -DER_NOMEM); uuid_copy(ec_agg->ea_cont_uuid, cont_uuid); - ec_agg->ea_servers_num = node_nr; + ec_agg->ea_servers_num = rank_nr; ec_agg->ea_current_eph = 0; - for (i = 0; i < node_nr; i++) { + for (i = 0; i < rank_nr; i++) { ec_agg->ea_server_ephs[i].rank = doms[i].do_comp.co_rank; ec_agg->ea_server_ephs[i].eph = 0; } diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 9623dce4917..863307e9a7f 100644 
--- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -112,7 +112,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt struct dtx_coll_target *dct; struct dtx_coll_entry *dce = NULL; struct daos_obj_md md = { 0 }; - uint32_t node_nr; + uint32_t rank_nr; d_rank_t my_rank = dss_self_rank(); d_rank_t max_rank = 0; int rc = 0; @@ -192,19 +192,19 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt } } - node_nr = pool_map_node_nr(map->pl_poolmap); - if (unlikely(node_nr == 1)) + rank_nr = pool_map_rank_nr(map->pl_poolmap); + if (unlikely(rank_nr == 1)) D_GOTO(out, rc = 0); - dce->dce_ranks = d_rank_list_alloc(node_nr - 1); + dce->dce_ranks = d_rank_list_alloc(rank_nr - 1); if (dce->dce_ranks == NULL) D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(dce->dce_hints, node_nr); + D_ALLOC_ARRAY(dce->dce_hints, rank_nr); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) dce->dce_hints[i] = (uint8_t)(-1); md.omd_id = oid.id_pub; @@ -220,7 +220,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt goto out; } - for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) { + for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) { if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) continue; diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 0df39f0e510..95695d2b027 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -281,7 +281,7 @@ int pool_map_find_target(struct pool_map *map, uint32_t id, struct pool_target **target_pp); int pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, struct pool_domain **domain_pp); -int pool_map_find_nodes(struct pool_map *map, uint32_t id, +int pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp); int pool_map_find_tgts_by_state(struct 
pool_map *map, pool_comp_state_t match_states, @@ -311,7 +311,7 @@ bool pool_map_node_status_match(struct pool_domain *dom, unsigned int status); struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank); +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank); int pool_map_find_by_rank_status(struct pool_map *map, struct pool_target ***tgt_ppp, @@ -339,9 +339,9 @@ pool_map_target_nr(struct pool_map *map) } static inline unsigned int -pool_map_node_nr(struct pool_map *map) +pool_map_rank_nr(struct pool_map *map) { - return pool_map_find_nodes(map, PO_COMP_ID_ALL, NULL); + return pool_map_find_ranks(map, PO_COMP_ID_ALL, NULL); } /* diff --git a/src/include/daos_prop.h b/src/include/daos_prop.h index c6ca94f84c1..3b7216efd0e 100644 --- a/src/include/daos_prop.h +++ b/src/include/daos_prop.h @@ -464,11 +464,12 @@ enum { /** container redundancy factor */ enum { - DAOS_PROP_CO_REDUN_RF0, - DAOS_PROP_CO_REDUN_RF1, - DAOS_PROP_CO_REDUN_RF2, - DAOS_PROP_CO_REDUN_RF3, - DAOS_PROP_CO_REDUN_RF4, + DAOS_PROP_CO_REDUN_RF0 = 0, + DAOS_PROP_CO_REDUN_RF1 = 1, + DAOS_PROP_CO_REDUN_RF2 = 2, + DAOS_PROP_CO_REDUN_RF3 = 3, + DAOS_PROP_CO_REDUN_RF4 = 4, + DAOS_RF_MAX = 4, }; /** diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 12ba634813a..e05abadf3cf 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -139,7 +139,7 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool D_ASSERT(coa->coa_dcts == NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - pool_ranks = pool_map_node_nr(pool->dp_map); + pool_ranks = pool_map_rank_nr(pool->dp_map); D_RWLOCK_UNLOCK(&pool->dp_map_lock); D_RWLOCK_RDLOCK(&obj->cob_lock); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index a63a11d574b..2a152b47bd6 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -291,7 +291,7 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct D_GOTO(out, rc = -DER_INVAL); } - size = 
pool_map_node_nr(map->pl_poolmap); + size = pool_map_rank_nr(map->pl_poolmap); D_ALLOC_ARRAY(dce->dce_hints, size); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/pool/cli.c b/src/pool/cli.c index 5345017f824..85fa718aa1c 100644 --- a/src/pool/cli.c +++ b/src/pool/cli.c @@ -503,7 +503,7 @@ update_rsvc_client(struct dc_pool *pool) { struct subtract_rsvc_rank_arg arg; - arg.srra_nodes_len = pool_map_find_nodes(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); + arg.srra_nodes_len = pool_map_find_ranks(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); /* There must be at least one rank. */ D_ASSERTF(arg.srra_nodes_len > 0, "%d > 0\n", arg.srra_nodes_len); @@ -2016,7 +2016,7 @@ choose_map_refresh_rank(struct map_refresh_arg *arg) if (arg->mra_n <= 0) return CRT_NO_RANK; - n = pool_map_find_nodes(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); + n = pool_map_find_ranks(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); /* There must be at least one rank. */ D_ASSERTF(n > 0, "%d\n", n); diff --git a/src/pool/rpc.h b/src/pool/rpc.h index cf763b896dc..cfddcc48931 100644 --- a/src/pool/rpc.h +++ b/src/pool/rpc.h @@ -147,6 +147,16 @@ CRT_RPC_DECLARE(pool_op, DAOS_ISEQ_POOL_OP, DAOS_OSEQ_POOL_OP) CRT_RPC_DECLARE(pool_create, DAOS_ISEQ_POOL_CREATE, DAOS_OSEQ_POOL_CREATE) /* clang-format on */ + +/* the source of pool map update operation */ +enum map_update_source { + MUS_SWIM = 0, + /* May need to differentiate from administrator/csum scrubber/nvme healthy monitor later. + * Now all non-swim cases fall to DMG category. 
+ */ + MUS_DMG, +}; + enum map_update_opc { MAP_EXCLUDE = 0, MAP_DRAIN, diff --git a/src/pool/srv.c b/src/pool/srv.c index 2a45f4dec05..7e5548e8508 100644 --- a/src/pool/srv.c +++ b/src/pool/srv.c @@ -19,7 +19,12 @@ #include "rpc.h" #include "srv_internal.h" #include "srv_layout.h" -bool ec_agg_disabled; + +bool ec_agg_disabled; +uint32_t pw_rf; /* pool wise RF */ +#define PW_RF_DEFAULT (2) +#define PW_RF_MIN (1) +#define PW_RF_MAX (4) static int init(void) @@ -47,6 +52,15 @@ init(void) if (unlikely(ec_agg_disabled)) D_WARN("EC aggregation is disabled.\n"); + pw_rf = PW_RF_DEFAULT; + d_getenv_uint32_t("DAOS_POOL_RF", &pw_rf); + if (pw_rf < PW_RF_MIN || pw_rf > PW_RF_MAX) { + D_INFO("pw_rf %d is out of range [%d, %d], take default %d\n", + pw_rf, PW_RF_MIN, PW_RF_MAX, PW_RF_DEFAULT); + pw_rf = PW_RF_DEFAULT; + } + D_INFO("pool wise RF %d\n", pw_rf); + ds_pool_rsvc_class_register(); bio_register_ract_ops(&nvme_reaction_ops); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index c09d2ffcaea..8f864c8c11a 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -16,6 +16,8 @@ #include #include +extern uint32_t pw_rf; + /** * Global pool metrics */ diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 54e29767347..667e4bc6ed6 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1355,11 +1355,11 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MGMT, "ignore exclusion of self\n"); + D_DEBUG(DB_MD, "ignore exclusion of self\n"); goto out; } - D_DEBUG(DB_MD, DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), + D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), DP_PS_EVENT(event)); if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { @@ -1381,8 +1381,8 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) * and does 
not have a copy of the pool map. */ ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", DP_UUID(svc->ps_uuid), - event->psv_rank); + D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); } else if (event->psv_type == CRT_EVT_DEAD) { rc = pool_svc_exclude_rank(svc, event->psv_rank); if (rc != 0) @@ -1809,7 +1809,7 @@ pool_svc_check_node_status(struct pool_svc *svc) D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - doms_cnt = pool_map_find_nodes(svc->ps_pool->sp_map, PO_COMP_ID_ALL, + doms_cnt = pool_map_find_ranks(svc->ps_pool->sp_map, PO_COMP_ID_ALL, &doms); D_ASSERT(doms_cnt >= 0); for (i = 0; i < doms_cnt; i++) { @@ -6500,6 +6500,49 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma return 0; } +static int +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +{ + crt_group_t *primary_grp; + struct pool_domain *doms; + int doms_cnt; + int i; + int rc = 0; + + D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); + D_ASSERT(doms_cnt >= 0); + primary_grp = crt_group_lookup(NULL); + D_ASSERT(primary_grp != NULL); + + D_CRIT("!!! Please try to recover these engines in top priority -\n"); + D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); + D_CRIT("!!! 
pool "DF_UUID": intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), rank); + for (i = 0; i < doms_cnt; i++) { + struct swim_member_state state; + + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || + (doms[i].do_comp.co_rank == rank)) + continue; + + rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); + if (rc != 0 && rc != -DER_NONEXIST) { + D_ERROR("failed to get status of rank %u: %d\n", + doms[i].do_comp.co_rank, rc); + break; + } + + D_DEBUG(DB_MD, "rank/state %d/%d\n", doms[i].do_comp.co_rank, + rc == -DER_NONEXIST ? -1 : state.sms_status); + if (rc == -DER_NONEXIST || state.sms_status == SWIM_MEMBER_DEAD) + D_CRIT("!!! pool "DF_UUID" : intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), doms[i].do_comp.co_rank); + } + + return rc; +} + /* * Perform an update to the pool map of \a svc. * @@ -6532,7 +6575,8 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, struct pool_target_addr_list *tgt_addrs, struct rsvc_hint *hint, bool *p_updated, uint32_t *map_version_p, uint32_t *tgt_map_ver, - struct pool_target_addr_list *inval_tgt_addrs) + struct pool_target_addr_list *inval_tgt_addrs, + enum map_update_source src) { struct rdb_tx tx; struct pool_map *map; @@ -6628,7 +6672,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * If the map modification affects myself, leave it to a new PS leader * if there's another PS replica, or reject it. */ - node = pool_map_find_node_by_rank(map, dss_self_rank()); + node = pool_map_find_dom_by_rank(map, dss_self_rank()); if (node == NULL || !(node->do_comp.co_status & DC_POOL_SVC_MAP_STATES)) { d_rank_list_t *replicas; @@ -6653,6 +6697,33 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } + /* For SWIM exclude, don't change pool map if the pw_rf is broken or is going to be broken, + * with CRIT log message to ask administrator to bring back the engine. 
+ */ + if (src == MUS_SWIM && opc == MAP_EXCLUDE) { + d_rank_t rank; + int failed_cnt; + + rc = pool_map_update_failed_cnt(map); + if (rc != 0) { + DL_ERROR(rc, DF_UUID": pool_map_update_failed_cnt failed.", + DP_UUID(svc->ps_uuid)); + goto out_map; + } + + D_ASSERT(tgt_addrs->pta_number == 1); + rank = tgt_addrs->pta_addrs->pta_rank; + failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); + D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", + DP_UUID(svc->ps_uuid), rank, failed_cnt); + if (failed_cnt > pw_rf) { + D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); + rc = pool_map_crit_prompt(svc, map, rank); + goto out_map; + } + } + /* Write the new pool map. */ rc = pool_buf_extract(map, &map_buf); if (rc != 0) @@ -6809,7 +6880,7 @@ pool_update_map_internal(uuid_t pool_uuid, unsigned int opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, NULL, 0, NULL, tgts, tgt_addrs, hint, p_updated, map_version_p, tgt_map_ver, - inval_tgt_addrs); + inval_tgt_addrs, MUS_DMG); pool_svc_put_leader(svc); return rc; @@ -6859,8 +6930,8 @@ static int pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, d_rank_list_t *extend_rank_list, uint32_t *extend_domains, uint32_t extend_domains_nr, struct pool_target_addr_list *list, - struct pool_target_addr_list *inval_list_out, - uint32_t *map_version, struct rsvc_hint *hint) + struct pool_target_addr_list *inval_list_out, uint32_t *map_version, + struct rsvc_hint *hint, enum map_update_source src) { struct pool_target_id_list target_list = { 0 }; daos_prop_t prop = { 0 }; @@ -6875,7 +6946,7 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, extend_rank_list, extend_domains_nr, extend_domains, &target_list, list, hint, &updated, - map_version, &tgt_map_ver, inval_list_out); + map_version, &tgt_map_ver, 
inval_list_out, src); if (rc) D_GOTO(out, rc); @@ -6962,10 +7033,9 @@ ds_pool_extend_handler(crt_rpc_t *rpc) goto out; rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), - false /* exclude_rank */, - &rank_list, domains, ndomains, + false /* exclude_rank */, &rank_list, domains, ndomains, NULL, NULL, &out->peo_op.po_map_version, - &out->peo_op.po_hint); + &out->peo_op.po_hint, MUS_DMG); pool_svc_put_leader(svc); out: @@ -7067,7 +7137,7 @@ ds_pool_update_handler(crt_rpc_t *rpc, int handler_version) rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), false /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &out->pto_op.po_map_version, - &out->pto_op.po_hint); + &out->pto_op.po_hint, MUS_DMG); if (rc != 0) goto out_svc; @@ -7112,7 +7182,7 @@ pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, - NULL /* hint */); + NULL /* hint */, MUS_SWIM); D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", DP_UUID(svc->ps_uuid), map_version, rank, rc); diff --git a/src/pool/srv_pool_map.c b/src/pool/srv_pool_map.c index 1cb5632598f..9793df24f01 100644 --- a/src/pool/srv_pool_map.c +++ b/src/pool/srv_pool_map.c @@ -378,7 +378,7 @@ ds_pool_map_tgts_update(struct pool_map *map, struct pool_target_id_list *tgts, return -DER_NONEXIST; } - dom = pool_map_find_node_by_rank(map, target->ta_comp.co_rank); + dom = pool_map_find_dom_by_rank(map, target->ta_comp.co_rank); if (dom == NULL) { D_ERROR("Got request to change nonexistent rank %u" " in map %p\n", diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index e39072568e1..29f012d5844 100644 --- a/src/pool/srv_util.c +++ b/src/pool/srv_util.c @@ -21,19 +21,19 @@ int map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *ranks) { struct pool_domain *domains = NULL; - int nnodes; + int nranks; int n = 0; int i; 
d_rank_t *rs; - nnodes = pool_map_find_nodes((struct pool_map *)map, + nranks = pool_map_find_ranks((struct pool_map *)map, PO_COMP_ID_ALL, &domains); - if (nnodes == 0) { + if (nranks == 0) { D_ERROR("no nodes in pool map\n"); return -DER_IO; } - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & domains[i].do_comp.co_status) n++; } @@ -52,7 +52,7 @@ map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *r ranks->rl_ranks = rs; n = 0; - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & domains[i].do_comp.co_status) { D_ASSERT(n < ranks->rl_nr); ranks->rl_ranks[n] = domains[i].do_comp.co_rank; @@ -85,7 +85,7 @@ ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank) struct pool_domain *node; int rc; - rc = pool_map_find_nodes(map, rank, &node); + rc = pool_map_find_ranks(map, rank, &node); if (rc == 0) return false; D_ASSERTF(rc == 1, "%d\n", rc); @@ -921,7 +921,7 @@ testu_create_pool_map(d_rank_t *ranks, int n_ranks, d_rank_t *down_ranks, int n_ for (i = 0; i < n_down_ranks; i++) { struct pool_domain *d; - d = pool_map_find_node_by_rank(map, down_ranks[i]); + d = pool_map_find_dom_by_rank(map, down_ranks[i]); D_ASSERT(d != NULL); d->do_comp.co_status = PO_COMP_ST_DOWN; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index f40ea2d2fe3..b1f722b8254 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -654,7 +654,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, for (i = 0; i < excluded.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { @@ -767,7 +767,7 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t uint32_t opc, struct rebuild_global_pool_tracker **p_rgt) { struct rebuild_global_pool_tracker *rgt; - int node_nr; + int rank_nr; struct 
pool_domain *doms; int i; int rc = 0; @@ -777,11 +777,11 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t return -DER_NOMEM; D_INIT_LIST_HEAD(&rgt->rgt_list); - node_nr = pool_map_find_nodes(pool->sp_map, PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(rgt->rgt_servers, node_nr); + D_ALLOC_ARRAY(rgt->rgt_servers, rank_nr); if (rgt->rgt_servers == NULL) D_GOTO(out, rc = -DER_NOMEM); @@ -793,9 +793,9 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t if (rc != ABT_SUCCESS) D_GOTO(out, rc = dss_abterr2der(rc)); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_number = node_nr; + rgt->rgt_servers_number = rank_nr; uuid_copy(rgt->rgt_pool_uuid, pool->sp_uuid); rgt->rgt_rebuild_ver = ver; @@ -964,7 +964,7 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker for (i = 0; i < up_ranks.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); D_ASSERT(dom != NULL); D_DEBUG(DB_REBUILD, "rank %u ver %u rebuild %u\n", up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, rgt->rgt_rebuild_ver); diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 248617c1b36..440ffe68f82 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -434,6 +434,7 @@ class EngineYamlParameters(YamlParameters): REQUIRED_ENV_VARS = { "common": [ "D_LOG_FILE_APPEND_PID=1", + "DAOS_POOL_RF=4", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], "ofi+tcp;ofi_rxm": [],