From 35334aaeae0fac404ed4ca34899ec23679efc25c Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 4 Sep 2024 19:02:27 -0400 Subject: [PATCH 01/12] DAOS-16471 test: Reduce targets for ioctl_pool_handles.py (#15063) (#15071) The dfuse/ioctl_pool_handles.py test is overloading the VM, so reduce the number of engine targets. Signed-off-by: Phil Henderson --- src/tests/ftest/dfuse/ioctl_pool_handles.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/dfuse/ioctl_pool_handles.yaml b/src/tests/ftest/dfuse/ioctl_pool_handles.yaml index 2900f67c328..35752453850 100644 --- a/src/tests/ftest/dfuse/ioctl_pool_handles.yaml +++ b/src/tests/ftest/dfuse/ioctl_pool_handles.yaml @@ -16,7 +16,7 @@ server_config: 0: class: ram scm_mount: /mnt/daos - targets: 16 + targets: 8 system_ram_reserved: 1 pool: From 906f0a44b74359f0e1370640851f53d77df77f49 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 6 Sep 2024 00:27:23 +0800 Subject: [PATCH 02/12] DAOS-16483 vos: handle empty DTX when vos_tx_end - b26 (#15055) It is possible that the DTX modified nothing when stopping the current backend transaction. In such a case, we may not generate a persistent DTX entry, so we need to bypass it before checking the on-disk DTX entry status. The patch also does some cleanup and removes redundant metrics for committed DTX entries. Enhance vos_dtx_deregister_record() to handle the GC case. Signed-off-by: Fan Yong --- src/dtx/dtx_common.c | 2 +- src/tests/ftest/util/telemetry_utils.py | 1 - src/vos/vos_common.c | 31 ++++----- src/vos/vos_dtx.c | 86 ++++++++++++++++++++----- src/vos/vos_tls.h | 1 - 5 files changed, 83 insertions(+), 38 deletions(-) diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 353bd880009..ff4f2dfe4ef 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1341,7 +1341,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. */ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && + if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ?
true : false); if (unlikely(result < 0)) { diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index aec831b3b8a..db424b6de68 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -421,7 +421,6 @@ class TelemetryUtils(): ENGINE_NVME_CRIT_WARN_METRICS +\ ENGINE_NVME_INTEL_VENDOR_METRICS ENGINE_MEM_USAGE_METRICS = [ - "engine_mem_vos_dtx_cmt_ent_48", "engine_mem_vos_vos_obj_360", "engine_mem_vos_vos_lru_size", "engine_mem_dtx_dtx_leader_handle_360"] diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index fb8461e2931..93bf1757f10 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -405,16 +405,24 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, } } else if (dae != NULL) { if (dth->dth_solo) { - if (err == 0 && cont->vc_solo_dtx_epoch < dth->dth_epoch) + if (err == 0 && dae->dae_committing && + cont->vc_solo_dtx_epoch < dth->dth_epoch) cont->vc_solo_dtx_epoch = dth->dth_epoch; vos_dtx_post_handle(cont, &dae, &dce, 1, false, err != 0); } else { D_ASSERT(dce == NULL); - if (err == 0) { - dae->dae_prepared = 1; + if (err == 0 && dth->dth_active) { + D_ASSERTF(!UMOFF_IS_NULL(dae->dae_df_off), + "Non-prepared DTX " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + dae_df = umem_off2ptr(umm, dae->dae_df_off); - D_ASSERT(!(dae_df->dae_flags & DTE_INVALID)); + D_ASSERTF(!(dae_df->dae_flags & DTE_INVALID), + "Invalid status for DTX " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + + dae->dae_prepared = 1; } } } @@ -563,13 +571,6 @@ vos_tls_init(int tags, int xs_id, int tgt_id) } } - rc = d_tm_add_metric(&tls->vtl_committed, D_TM_STATS_GAUGE, - "Number of committed entries kept around for reply" - " reconstruction", "entries", - "io/dtx/committed/tgt_%u", tgt_id); - if (rc) - D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", - DP_RC(rc)); if (tgt_id >= 0) { rc = d_tm_add_metric(&tls->vtl_committed, D_TM_STATS_GAUGE, "Number of committed entries kept around for reply" @@ -579,14 +580,6 @@ vos_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", DP_RC(rc)); - rc = d_tm_add_metric(&tls->vtl_dtx_cmt_ent_cnt, D_TM_GAUGE, - "Number of committed entries", "entry", - "mem/vos/dtx_cmt_ent_%u/tgt_%u", - sizeof(struct vos_dtx_cmt_ent), tgt_id); - if (rc) - D_WARN("Failed to create committed cnt: "DF_RC"\n", - DP_RC(rc)); - rc = d_tm_add_metric(&tls->vtl_obj_cnt, D_TM_GAUGE, "Number of cached vos object", "entry", "mem/vos/vos_obj_%u/tgt_%u", diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0e70133629f..1c60f781507 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -769,7 +769,6 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p, struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal) { - struct vos_tls *tls = vos_tls_get(false); struct vos_dtx_act_ent *dae = NULL; struct vos_dtx_cmt_ent *dce = NULL; d_iov_t kiov; @@ -834,7 +833,6 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t if (dce == NULL) D_GOTO(out, rc = -DER_NOMEM); - d_tm_inc_gauge(tls->vtl_dtx_cmt_ent_cnt, 1); DCE_CMT_TIME(dce) = cmt_time; if (dae != NULL) { DCE_XID(dce) = DAE_XID(dae); @@ -1535,10 +1533,14 @@ int vos_dtx_deregister_record(struct umem_instance *umm, daos_handle_t coh, uint32_t entry, daos_epoch_t epoch, umem_off_t record) { + struct dtx_handle *dth = vos_dth_get(false); struct vos_container *cont; struct vos_dtx_act_ent *dae; + struct 
vos_dtx_act_ent_df *dae_df; + umem_off_t *rec_df; bool found; int count; + int rc; int i; if (!vos_dtx_is_normal_entry(entry)) @@ -1567,10 +1569,54 @@ vos_dtx_deregister_record(struct umem_instance *umm, daos_handle_t coh, * by another prepared (but non-committed) DTX, then do not allow current transaction * to modify it. Because if current transaction is aborted or failed for some reason, * there is no efficient way to recover such former non-committed DTX. + * + * If dth is NULL, then it is for GC. Under such case, deregister the record anyway. */ - if (dae->dae_dbd != NULL) - return dtx_inprogress(dae, vos_dth_get(cont->vc_pool->vp_sysdb), false, false, 8); + if (dae->dae_dbd != NULL) { + if (dth != NULL) + return dtx_inprogress(dae, dth, false, false, 8); + + dae_df = umem_off2ptr(umm, dae->dae_df_off); + D_ASSERT(!(dae_df->dae_flags & DTE_INVALID)); + if (dae_df->dae_rec_cnt > DTX_INLINE_REC_CNT) + count = DTX_INLINE_REC_CNT; + else + count = dae_df->dae_rec_cnt; + + rec_df = dae_df->dae_rec_inline; + for (i = 0; i < count; i++) { + if (record == umem_off2offset(rec_df[i])) { + rc = umem_tx_add_ptr(umm, &rec_df[i], sizeof(rec_df[i])); + if (rc != 0) + return rc; + + rec_df[i] = UMOFF_NULL; + goto cache; + } + } + + rec_df = umem_off2ptr(umm, dae_df->dae_rec_off); + if (rec_df == NULL) + /* If non-exist on disk, then must be non-exist in cache. */ + return 0; + + for (i = 0; i < dae_df->dae_rec_cnt - DTX_INLINE_REC_CNT; i++) { + if (record == umem_off2offset(rec_df[i])) { + rc = umem_tx_add_ptr(umm, &rec_df[i], sizeof(rec_df[i])); + if (rc != 0) + return rc; + + rec_df[i] = UMOFF_NULL; + goto cache; + } + } + + /* If non-exist on disk, then must be non-exist in cache. */ + return 0; + } + +cache: if (DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT) count = DTX_INLINE_REC_CNT; else @@ -2116,14 +2162,18 @@ vos_dtx_post_handle(struct vos_container *cont, if (!abort && dces != NULL) { struct vos_tls *tls = vos_tls_get(false); + int j = 0; D_ASSERT(cont->vc_pool->vp_sysdb == false); for (i = 0; i < count; i++) { - if (dces[i] != NULL) { - cont->vc_dtx_committed_count++; - cont->vc_pool->vp_dtx_committed_count++; - d_tm_inc_gauge(tls->vtl_committed, 1); - } + if (dces[i] != NULL) + j++; + } + + if (j > 0) { + cont->vc_dtx_committed_count += j; + cont->vc_pool->vp_dtx_committed_count += j; + d_tm_inc_gauge(tls->vtl_committed, j); } } @@ -2439,6 +2489,7 @@ vos_dtx_aggregate(daos_handle_t coh) uint64_t epoch; umem_off_t dbd_off; umem_off_t next = UMOFF_NULL; + int count = 0; int rc; int i; @@ -2481,13 +2532,10 @@ vos_dtx_aggregate(daos_handle_t coh) UMOFF_P(dbd_off), DP_RC(rc)); goto out; } - - cont->vc_dtx_committed_count--; - cont->vc_pool->vp_dtx_committed_count--; - d_tm_dec_gauge(tls->vtl_committed, 1); - d_tm_dec_gauge(tls->vtl_dtx_cmt_ent_cnt, 1); } + count = dbd->dbd_count; + if (epoch != cont_df->cd_newest_aggregated) { rc = umem_tx_add_ptr(umm, &cont_df->cd_newest_aggregated, sizeof(cont_df->cd_newest_aggregated)); @@ -2545,8 +2593,14 @@ vos_dtx_aggregate(daos_handle_t coh) out: rc = umem_tx_end(umm, rc); - if (rc == 0 && cont->vc_cmt_dtx_reindex_pos == dbd_off) - cont->vc_cmt_dtx_reindex_pos = next; + if (rc == 0) { + if (cont->vc_cmt_dtx_reindex_pos == dbd_off) + cont->vc_cmt_dtx_reindex_pos = next; + + cont->vc_dtx_committed_count -= count; + cont->vc_pool->vp_dtx_committed_count -= count; + d_tm_dec_gauge(tls->vtl_committed, count); + } DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Release DTX committed blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, diff --git a/src/vos/vos_tls.h 
b/src/vos/vos_tls.h index 981cce10be5..2fc328457d0 100644 --- a/src/vos/vos_tls.h +++ b/src/vos/vos_tls.h @@ -64,7 +64,6 @@ struct vos_tls { }; struct d_tm_node_t *vtl_committed; struct d_tm_node_t *vtl_obj_cnt; - struct d_tm_node_t *vtl_dtx_cmt_ent_cnt; struct d_tm_node_t *vtl_lru_alloc_size; }; From 3d9e2d0cafcb5f314135ba9e7a27dd541102f7c5 Mon Sep 17 00:00:00 2001 From: Joseph Moore <26410038+jgmoore-or@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:39:17 -0600 Subject: [PATCH 03/12] DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve. (#15077) Signed-off-by: Joseph Moore --- utils/build.config | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/build.config b/utils/build.config index 174df687036..c38d49a267a 100644 --- a/utils/build.config +++ b/utils/build.config @@ -29,3 +29,4 @@ ucx=https://github.com/openucx/ucx.git spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff +mercury=https://raw.githubusercontent.com/daos-stack/mercury/481297621bafbbcac4cc6f8feab3f1b6f8b14b59/na_ucx_keyres_epchk.patch From 185ba8f3055ff7639fc472f5ba1210a8034f9354 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 5 Sep 2024 16:28:11 -0700 Subject: [PATCH 04/12] DAOS-16457 test: remove display_memory_info (#15031) (#15075) display_memory_info was added to debug an issue when starting the servers, but resolved by #14295. It is no longer needed and consumes too much log space and time. Signed-off-by: Dalton Bohning --- src/tests/ftest/util/server_utils.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 6ae05af94e9..752473021a3 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -464,14 +464,6 @@ def support_collect_log(self, **kwargs): return run_remote( self.log, self._hosts, cmd.with_exports, timeout=self.collect_log_timeout.value) - def display_memory_info(self): - """Display server hosts memory info.""" - self.log.debug("#" * 80) - self.log.debug(" Collection debug memory info") - run_remote(self.log, self._hosts, "free -m && df -h --type=tmpfs") - run_remote(self.log, self._hosts, "ps -eo size,pid,user,command --sort -size | head -n 6") - self.log.debug("#" * 80) - def detect_format_ready(self, reformat=False): """Detect when all the daos_servers are ready for storage format. @@ -664,14 +656,11 @@ def start(self): self.prepare() # Start the servers and wait for them to be ready for storage format - self.display_memory_info() self.detect_format_ready() # Collect storage and network information from the servers. - self.display_memory_info() self.information.collect_storage_information() self.information.collect_network_information() - self.display_memory_info() # Format storage and wait for server to change ownership self.log.info(" Formatting hosts: <%s>", self.dmg.hostlist) @@ -711,9 +700,6 @@ def stop(self): # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() - # Collective memory usage after stop. 
- self.display_memory_info() - # Report any errors after all stop actions have been attempted if messages: raise ServerFailed("Failed to stop servers:\n {}".format("\n ".join(messages))) From e1b6a7e88c142c73986eadb2d1c6312c5092ee4b Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 6 Sep 2024 09:43:04 +0800 Subject: [PATCH 05/12] DAOS-16458 object: fix invalid DRAM access in obj_bulk_transfer - b26 (#15054) For EC object update via CPD RPC, when calculate the bitmap to skip some iods for current EC data shard, we may input NULL for "*skips" parameter. It may cause the old logic in obj_get_iods_offs_by_oid() to generate some undefined DRAM for "skips" bitmap. Such bitmap may be over-written by others, as to subsequent obj_bulk_transfer() may be misguided. The patch also fixes a bug inside obj_bulk_transfer() that cast any input RPC as UPDATE/FETCH by force. Signed-off-by: Fan Yong --- src/object/srv_coll.c | 2 +- src/object/srv_internal.h | 2 +- src/object/srv_obj.c | 41 ++++++++++++++++++++++++--------------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 9e421810861..a63a11d574b 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -183,7 +183,7 @@ obj_coll_punch_bulk(crt_rpc_t *rpc, d_iov_t *iov, crt_proc_t *p_proc, sgl.sg_iovs = iov; rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &ocpi->ocpi_tgt_bulk, NULL, NULL, - DAOS_HDL_INVAL, &sgls, 1, NULL, NULL); + DAOS_HDL_INVAL, &sgls, 1, 1, NULL, NULL); if (rc != 0) { D_ERROR("Failed to prepare bulk transfer for coll_punch, size %u: "DF_RC"\n", ocpi->ocpi_bulk_tgt_sz, DP_RC(rc)); diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 6f13e3f36dc..a24986247a5 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -280,7 +280,7 @@ typedef int (*ds_iofw_cb_t)(crt_rpc_t *req, void *arg); int obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bulk_t *remote_bulks, uint64_t *remote_offs, uint8_t *skips, - daos_handle_t ioh, d_sg_list_t **sgls, int sgl_nr, + daos_handle_t ioh, d_sg_list_t **sgls, int sgl_nr, int bulk_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh); int obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count); int obj_tgt_query(struct obj_tgt_query_args *otqa, uuid_t po_uuid, uuid_t co_hdl, uuid_t co_uuid, diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 0a246bebdca..febd3d36ead 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -488,22 +488,24 @@ bulk_transfer_sgl(daos_handle_t ioh, crt_rpc_t *rpc, crt_bulk_t remote_bulk, int obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bulk_t *remote_bulks, uint64_t *remote_offs, uint8_t *skips, daos_handle_t ioh, d_sg_list_t **sgls, - int sgl_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh) + int sgl_nr, int bulk_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh) { - struct obj_rw_in *orw = crt_req_get(rpc); struct obj_bulk_args arg = { 0 }; int i, rc, *status, ret; int skip_nr = 0; - int bulk_nr; bool async = true; uint64_t time = daos_get_ntime(); + if (unlikely(sgl_nr > bulk_nr)) { + D_ERROR("Invalid sgl_nr vs bulk_nr: %d/%d\n", sgl_nr, bulk_nr); + return -DER_INVAL; + } + if (remote_bulks == NULL) { D_ERROR("No remote bulks provided\n"); return -DER_INVAL; } - bulk_nr = orw->orw_bulks.ca_count; if (p_arg == NULL) { p_arg = &arg; async = false; @@ -514,7 +516,7 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul 
return dss_abterr2der(rc); p_arg->inited = true; - D_DEBUG(DB_IO, "bulk_op %d sgl_nr %d\n", bulk_op, sgl_nr); + D_DEBUG(DB_IO, "bulk_op %d, sgl_nr %d, bulk_nr %d\n", bulk_op, sgl_nr, bulk_nr); p_arg->bulks_inflight++; @@ -542,9 +544,9 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul while (skips != NULL && isset(skips, i + skip_nr)) skip_nr++; - if (bulk_nr > 0) - D_ASSERTF(i + skip_nr < bulk_nr, "i %d, skip_nr %d, bulk_nr %d\n", - i, skip_nr, bulk_nr); + D_ASSERTF(i + skip_nr < bulk_nr, "i %d, skip_nr %d, sgl_nr %d, bulk_nr %d\n", + i, skip_nr, sgl_nr, bulk_nr); + if (remote_bulks[i + skip_nr] == NULL) continue; @@ -574,6 +576,12 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul break; } } + + if (skips != NULL) + D_ASSERTF(skip_nr + sgl_nr <= bulk_nr, + "Unmatched skip_nr %d, sgl_nr %d, bulk_nr %d\n", + skip_nr, sgl_nr, bulk_nr); + done: if (--(p_arg->bulks_inflight) == 0) ABT_eventual_set(p_arg->eventual, &rc, sizeof(rc)); @@ -836,7 +844,7 @@ obj_echo_rw(crt_rpc_t *rpc, daos_iod_t *iod, uint64_t *off) /* Only support 1 iod now */ bulk_bind = orw->orw_flags & ORF_BULK_BIND; rc = obj_bulk_transfer(rpc, bulk_op, bulk_bind, orw->orw_bulks.ca_arrays, off, - NULL, DAOS_HDL_INVAL, &p_sgl, 1, NULL, NULL); + NULL, DAOS_HDL_INVAL, &p_sgl, 1, 1, NULL, NULL); out: orwo->orw_ret = rc; orwo->orw_map_version = orw->orw_map_ver; @@ -1636,7 +1644,8 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io if (rma) { bulk_bind = orw->orw_flags & ORF_BULK_BIND; rc = obj_bulk_transfer(rpc, bulk_op, bulk_bind, orw->orw_bulks.ca_arrays, offs, - skips, ioh, NULL, iods_nr, NULL, ioc->ioc_coh); + skips, ioh, NULL, iods_nr, orw->orw_bulks.ca_count, NULL, + ioc->ioc_coh); if (rc == 0) { bio_iod_flush(biod); @@ -1809,7 +1818,7 @@ obj_get_iods_offs_by_oid(daos_unit_oid_t uoid, struct obj_iod_array *iod_array, } } if (oiod_nr > LOCAL_SKIP_BITS_NUM || *skips == NULL) { - D_ALLOC(*skips, roundup(oiod_nr / NBBY, 4)); + D_ALLOC(*skips, (oiod_nr + NBBY - 1) / NBBY); if (*skips == NULL) D_GOTO(out, rc = -DER_NOMEM); } @@ -2448,7 +2457,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) goto end; } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &oer->er_bulk, NULL, NULL, - ioh, NULL, 1, NULL, ioc.ioc_coh); + ioh, NULL, 1, 1, NULL, ioc.ioc_coh); if (rc) D_ERROR(DF_UOID " bulk transfer failed: " DF_RC "\n", DP_UOID(oer->er_oid), DP_RC(rc)); @@ -2526,7 +2535,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) goto end; } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &oea->ea_bulk, - NULL, NULL, ioh, NULL, 1, NULL, ioc.ioc_coh); + NULL, NULL, ioh, NULL, 1, 1, NULL, ioc.ioc_coh); if (rc) D_ERROR(DF_UOID " bulk transfer failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc)); @@ -3275,7 +3284,7 @@ obj_enum_reply_bulk(crt_rpc_t *rpc) return 0; rc = obj_bulk_transfer(rpc, CRT_BULK_PUT, false, bulks, NULL, NULL, - DAOS_HDL_INVAL, sgls, idx, NULL, NULL); + DAOS_HDL_INVAL, sgls, idx, idx, NULL, NULL); if (oei->oei_kds_bulk) { D_FREE(oeo->oeo_kds.ca_arrays); oeo->oeo_kds.ca_count = 0; @@ -4560,7 +4569,7 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp rc = obj_bulk_transfer(rpc, CRT_BULK_GET, dcu->dcu_flags & ORF_BULK_BIND, dcu->dcu_bulks, poffs[i], pskips[i], iohs[i], NULL, - piod_nrs[i], &bulks[i], ioc->ioc_coh); + piod_nrs[i], dcsr->dcsr_nr, &bulks[i], ioc->ioc_coh); if (rc != 0) { D_ERROR("Bulk transfer failed for obj " DF_UOID", DTX "DF_DTI": "DF_RC"\n", @@ -5276,7 +5285,7 @@ ds_obj_cpd_body_bulk(crt_rpc_t *rpc, 
struct obj_io_context *ioc, bool leader, } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, ORF_BULK_BIND, bulks, NULL, NULL, - DAOS_HDL_INVAL, sgls, count, NULL, ioc->ioc_coh); + DAOS_HDL_INVAL, sgls, count, count, NULL, ioc->ioc_coh); if (rc != 0) goto out; From aa811d73facfe0d2bf491ec26cf80d19bdcf3e11 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 6 Sep 2024 16:40:05 +0800 Subject: [PATCH 06/12] DAOS-15863 container: fix a race for container cache (#15038) (#15065) * DAOS-15863 container: fix a race for container cache while destroying a container, cont_child_destroy_one() releases its own refcount before waiting, if another ULT releases its refcount, which is the last one, wakes up the waiting ULT and frees it ds_cont_child straightaway, because no one else has refcount. When the waiting ULT is waken up, it will try to change the already freed ds_cont_child. This patch changes the LRU eviction logic and fixes this race. Signed-off-by: Liang Zhen Signed-off-by: Jeff Olivier Co-authored-by: Jeff Olivier --- src/common/lru.c | 54 ++++++++++++++++---------------------- src/container/srv_target.c | 4 ++- src/include/daos/lru.h | 38 +++++++-------------------- 3 files changed, 35 insertions(+), 61 deletions(-) diff --git a/src/common/lru.c b/src/common/lru.c index bb270500ab7..de86d367e0e 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -36,7 +36,10 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link) D_ASSERT(llink->ll_ref > 0); llink->ll_ref--; - if (llink->ll_ref == 1 && llink->ll_ops->lop_wakeup) + + /* eviction waiter is the last one holds refcount */ + if (llink->ll_wait_evict && + llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink)) llink->ll_ops->lop_wakeup(llink); /* Delete from hash only if no more references */ @@ -215,15 +218,6 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key, if (link != NULL) { llink = link2llink(link); D_ASSERT(llink->ll_evicted == 0); - if (llink->ll_evicting) { - /** - * Avoid calling `lru_hop_rec_decref()` at this point - * to prevent `wakeup()` from being invoked twice. 
- */ - D_ASSERT(llink->ll_ref > 1); - llink->ll_ref--; - D_GOTO(out, rc = -DER_SHUTDOWN); - } /* remove busy item from LRU */ if (!d_list_empty(&llink->ll_qlink)) d_list_del_init(&llink->ll_qlink); @@ -257,24 +251,17 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key, return rc; } -static void -lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink, bool wait) +void +daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); D_ASSERT(d_list_empty(&llink->ll_qlink)); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); - if (wait && llink->ll_ref > 1) { - D_ASSERT(llink->ll_evicting == 0); - llink->ll_evicting = 1; - lcache->dlc_ops->lop_wait(llink); - llink->ll_evicting = 0; - llink->ll_evicted = 1; - } - if (llink->ll_ref == 1) { /* the last refcount */ - if (lcache->dlc_csize == 0) + /* zero-sized cache always evicts unused item */ + if (lcache->dlc_csize == 0 && !llink->ll_evicted) llink->ll_evicted = 1; if (llink->ll_evicted) { @@ -297,15 +284,20 @@ lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink } void -daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) -{ - lru_ref_release_internal(lcache, llink, false); -} - -void -daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) +daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink) { - D_ASSERT(lcache->dlc_ops->lop_wait); - - lru_ref_release_internal(lcache, llink, true); + if (!llink->ll_evicted) + daos_lru_ref_evict(lcache, llink); + + if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) { + /* Wait until I'm the last one. + * XXX: the implementation can only support one waiter for now, if there + * is a secondary ULT calls this function on the same item, it will hit + * the assertion. + */ + D_ASSERT(!llink->ll_wait_evict); + llink->ll_wait_evict = 1; + lcache->dlc_ops->lop_wait(llink); + llink->ll_wait_evict = 0; + } } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b5abcc2d759..f3ef47c8447 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1261,7 +1261,9 @@ cont_child_destroy_one(void *vin) D_GOTO(out_pool, rc = -DER_BUSY); } /* else: resync should have completed, try again */ - daos_lru_ref_wait_evict(tls->dt_cont_cache, &cont->sc_list); + /* nobody should see it again after eviction */ + daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list); + daos_lru_ref_release(tls->dt_cont_cache, &cont->sc_list); } D_DEBUG(DB_MD, DF_CONT": destroying vos container\n", diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index 03b1eb90e4c..40bee5c492b 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -37,8 +37,8 @@ struct daos_llink { d_list_t ll_link; /**< LRU hash link */ d_list_t ll_qlink; /**< Temp link for traverse */ uint32_t ll_ref; /**< refcount for this ref */ - uint32_t ll_evicted:1, /**< has been evicted */ - ll_evicting:1; /**< been evicting */ + uint32_t ll_evicted:1; /**< has been evicted */ + uint32_t ll_wait_evict:1; /**< wait for completion of eviction */ struct daos_llink_ops *ll_ops; /**< ops to maintain refs */ }; @@ -121,26 +121,7 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink); /** - * Evicts the LRU link from the DAOS LRU cache after waiting - * for all references to be released. 
- * - * \param[in] lcache DAOS LRU cache - * \param[in] llink DAOS LRU link to be evicted - * - */ -void -daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink); - -/** - * Flush old items from LRU. - * - * \param[in] lcache DAOS LRU cache - */ -void -daos_lru_ref_flush(struct daos_lru_cache *lcache); - -/** - * Evict the item from LRU after releasing the last refcount on it. + * Evict the item from LRU before releasing the refcount on it. * * \param[in] lcache DAOS LRU cache * \param[in] llink DAOS LRU item to be evicted @@ -153,15 +134,14 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) } /** - * Check if a LRU element has been evicted or not + * Evict the item from LRU before releasing the refcount on it, wait until + * the caller is the last one holds refcount. * - * \param[in] llink DAOS LRU item to check + * \param[in] lcache DAOS LRU cache + * \param[in] llink DAOS LRU item to be evicted */ -static inline bool -daos_lru_ref_evicted(struct daos_llink *llink) -{ - return llink->ll_evicted; -} +void +daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink); /** * Increase a usage reference to the LRU element From f7d35232ccdaefb48a06ffee002f32c38d99dc69 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 6 Sep 2024 11:11:11 -0400 Subject: [PATCH 07/12] DAOS-16484 test: Support mixed speeds when selecting a default interface (#15050) (#15080) Allow selecting a default interface that is running at a different speed on different hosts. Primarily this is to support selecting the ib0 interface by default when the launch node has a slower ib0 interface than the cluster hosts. Signed-off-by: Phil Henderson --- src/tests/ftest/util/environment_utils.py | 3 ++- src/tests/ftest/util/network_utils.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 8223063a85e..e36d750500e 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -9,6 +9,7 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host from util.network_utils import (PROVIDER_ALIAS, SUPPORTED_PROVIDERS, NetworkException, get_common_provider, get_fastest_interface) from util.run_utils import run_remote @@ -326,7 +327,7 @@ def _default_interface(self, logger, hosts): # Find all the /sys/class/net interfaces on the launch node (excluding lo) logger.debug("Detecting network devices - D_INTERFACE not set") try: - interface = get_fastest_interface(logger, hosts) + interface = get_fastest_interface(logger, hosts | get_local_host()) except NetworkException as error: raise TestEnvironmentException("Error obtaining a default interface!") from error return interface diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index 52ba2420964..e3802364d8f 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -405,11 +405,12 @@ def get_fastest_interface(logger, hosts, verbose=True): for interface in common_interfaces: detected_speeds = get_interface_speeds(logger, hosts, interface, verbose) speed_list = [] + speed_hosts = NodeSet() for speed, node_set in detected_speeds.items(): - if node_set == hosts: - # Only include detected homogeneous interface speeds - speed_list.append(speed) - if speed_list: + speed_list.append(speed) + 
speed_hosts.add(node_set) + if speed_list and speed_hosts == hosts: + # Only include interface speeds if a speed is detected on all the hosts interface_speeds[interface] = min(speed_list) logger.info("Active network interface speeds on %s:", hosts) From 8a8c7d4ceb24c66330f4623d446f3349111c3f2b Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Fri, 6 Sep 2024 23:47:29 +0800 Subject: [PATCH 08/12] DAOS-16467 rebuild: add DAOS_POOL_RF ENV for massive failure case (#15057) * DAOS-16467 rebuild: add DAOS_PW_RF ENV for massive failure case Allow user to set DAOS_PW_RF as pw_rf (pool wise RF). If SWIM detected engine failure is going to break pw_rf, don't change pool map, also don't trigger rebuild. With critical log message to ask administrator to bring back those engines in top priority (just "system start --ranks=xxx", need not to reintegrate those engines). a few functions renamed to avoid confuse - pool_map_find_nodes() -> pool_map_find_ranks() pool_map_find_node_by_rank() -> pool_map_find_dom_by_rank() pool_map_node_nr() -> pool_map_rank_nr() Signed-off-by: Xuezhao Liu --- docs/admin/env_variables.md | 1 + docs/admin/pool_operations.md | 24 +++++ src/chk/chk_engine.c | 4 +- src/common/pool_map.c | 18 ++-- src/container/cli.c | 2 +- src/container/srv_container.c | 15 ++- src/dtx/dtx_coll.c | 14 +-- src/include/daos/pool_map.h | 8 +- src/include/daos_prop.h | 11 ++- src/object/cli_coll.c | 2 +- src/object/srv_coll.c | 2 +- src/pool/cli.c | 4 +- src/pool/rpc.h | 10 ++ src/pool/srv.c | 16 ++- src/pool/srv_internal.h | 2 + src/pool/srv_pool.c | 102 +++++++++++++++++--- src/pool/srv_pool_map.c | 2 +- src/pool/srv_util.c | 14 +-- src/rebuild/srv.c | 18 ++-- src/tests/ftest/util/server_utils_params.py | 1 + 20 files changed, 196 insertions(+), 74 deletions(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 2f5c2053683..060c3790d57 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -53,6 +53,7 @@ Environment variables in this section only apply to the server side. |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.| |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.| |DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.| +|DAOS\_POOL\_RF|Redundancy factor for the pool. The valid range is [1, 4]. The default value is 2.| ## Server and Client environment variables diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index 388a81d8700..36907a2e31f 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -916,6 +916,30 @@ and possibly repair a pmemobj file. As discussed in the previous section, the rebuild status can be consulted via the pool query and will be expanded with more information. +## Pool Redundancy Factor + +If the DAOS system experiences cascading failures, where the number of failed +fault domains exceeds a pool's redundancy factor, there could be unrecoverable +errors and applications could suffer from data loss. This can happen in cases +of power or network outages and would cause node/engine failures. In most cases +those failures can be recovered and DAOS engines can be restarted and the system +can function again. + +Administrator can set the default pool redundancy factor by environment variable +"DAOS_POOL_RF" in the server yaml file. 
If SWIM detects and reports an engine is +dead and the number of failed fault domain exceeds or is going to exceed the pool +redundancy factor, it will not change pool map immediately. Instead, it will give +critical log message: +intolerable unavailability: engine rank x +In this case, the system administrator should check and try to recover those +failed engines and bring them back with: +dmg system start --ranks=x +one by one. A reintegrate call is not needed. + +For true unrecoverable failures, the administrator can still exclude engines. +However, data loss is expected as the number of unrecoverable failures exceeds +the pool redundancy factor. + ## Recovering Container Ownership Typically users are expected to manage their containers. However, in the event diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 9113ca22531..56e6da3ad9b 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -668,7 +668,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c int rc = 0; bool unknown; - dom = pool_map_find_node_by_rank(map, mbs->cpm_rank); + dom = pool_map_find_dom_by_rank(map, mbs->cpm_rank); if (dom == NULL) { D_ASSERT(mbs->cpm_rank != dss_self_rank()); @@ -777,7 +777,7 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map) int j; bool down; - rank_nr = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + rank_nr = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (rank_nr <= 0) D_GOTO(out, rc = rank_nr); diff --git a/src/common/pool_map.c b/src/common/pool_map.c index 7d7b38adb6c..1712f398dcb 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -1573,7 +1573,7 @@ add_domain_tree_to_pool_buf(struct pool_map *map, struct pool_buf *map_buf, if (map) { struct pool_domain *found_dom; - found_dom = pool_map_find_node_by_rank(map, rank); + found_dom = pool_map_find_dom_by_rank(map, rank); if (found_dom) { if (found_dom->do_comp.co_status == PO_COMP_ST_NEW) found_new_dom = true; @@ -2038,7 +2038,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, } /** - * Find all nodes in the pool map. + * Find all ranks in the pool map. * * \param map [IN] pool map to search. * \param id [IN] id to search. @@ -2048,7 +2048,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, * 0 if none. */ int -pool_map_find_nodes(struct pool_map *map, uint32_t id, +pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp) { return pool_map_find_domain(map, PO_COMP_TP_RANK, id, @@ -2102,14 +2102,14 @@ pool_map_find_target(struct pool_map *map, uint32_t id, * \return domain found by rank. 
*/ struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank) +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank) { struct pool_domain *doms; struct pool_domain *found = NULL; int doms_cnt; int i; - doms_cnt = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (doms_cnt <= 0) return NULL; @@ -2150,7 +2150,7 @@ pool_map_find_targets_on_ranks(struct pool_map *map, d_rank_list_t *rank_list, for (i = 0; i < rank_list->rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank_list->rl_ranks[i]); + dom = pool_map_find_dom_by_rank(map, rank_list->rl_ranks[i]); if (dom == NULL) { pool_target_id_list_free(tgts); return 0; @@ -2191,7 +2191,7 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank, { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2867,7 +2867,7 @@ pool_map_find_by_rank_status(struct pool_map *map, *tgt_ppp = NULL; *tgt_cnt = 0; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2902,7 +2902,7 @@ pool_map_get_ranks(uuid_t pool_uuid, struct pool_map *map, bool get_enabled, d_r struct pool_domain *domains = NULL; d_rank_list_t *ranklist = NULL; - nnodes_tot = pool_map_find_nodes(map, PO_COMP_ID_ALL, &domains); + nnodes_tot = pool_map_find_ranks(map, PO_COMP_ID_ALL, &domains); for (i = 0; i < nnodes_tot; i++) { if (pool_map_node_status_match(&domains[i], ENABLED)) nnodes_enabled++; diff --git a/src/container/cli.c b/src/container/cli.c index 590f689333b..cd43667a2a4 100644 --- a/src/container/cli.c +++ b/src/container/cli.c @@ -3386,7 +3386,7 @@ dc_cont_node_id2ptr(daos_handle_t coh, uint32_t node_id, pool = dc_hdl2pool(dc->dc_pool_hdl); D_ASSERT(pool != NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - n = pool_map_find_nodes(pool->dp_map, node_id, dom); + n = pool_map_find_ranks(pool->dp_map, node_id, dom); D_RWLOCK_UNLOCK(&pool->dp_map_lock); dc_pool_put(pool); dc_cont_put(dc); diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 9071f8f731c..372da43afe4 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1667,7 +1667,7 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, { struct cont_ec_agg *ec_agg = NULL; struct pool_domain *doms; - int node_nr; + int rank_nr; int rc = 0; int i; @@ -1676,19 +1676,18 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, return -DER_NOMEM; D_ASSERT(cont_svc->cs_pool->sp_map != NULL); - node_nr = pool_map_find_nodes(cont_svc->cs_pool->sp_map, - PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(cont_svc->cs_pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(ec_agg->ea_server_ephs, node_nr); + D_ALLOC_ARRAY(ec_agg->ea_server_ephs, rank_nr); if (ec_agg->ea_server_ephs == NULL) D_GOTO(out, rc = -DER_NOMEM); uuid_copy(ec_agg->ea_cont_uuid, cont_uuid); - ec_agg->ea_servers_num = node_nr; + ec_agg->ea_servers_num = rank_nr; ec_agg->ea_current_eph = 0; - for (i = 0; i < node_nr; i++) { + for (i = 0; i < rank_nr; i++) { ec_agg->ea_server_ephs[i].rank = doms[i].do_comp.co_rank; ec_agg->ea_server_ephs[i].eph = 0; } diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 9623dce4917..863307e9a7f 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -112,7 
+112,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt struct dtx_coll_target *dct; struct dtx_coll_entry *dce = NULL; struct daos_obj_md md = { 0 }; - uint32_t node_nr; + uint32_t rank_nr; d_rank_t my_rank = dss_self_rank(); d_rank_t max_rank = 0; int rc = 0; @@ -192,19 +192,19 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt } } - node_nr = pool_map_node_nr(map->pl_poolmap); - if (unlikely(node_nr == 1)) + rank_nr = pool_map_rank_nr(map->pl_poolmap); + if (unlikely(rank_nr == 1)) D_GOTO(out, rc = 0); - dce->dce_ranks = d_rank_list_alloc(node_nr - 1); + dce->dce_ranks = d_rank_list_alloc(rank_nr - 1); if (dce->dce_ranks == NULL) D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(dce->dce_hints, node_nr); + D_ALLOC_ARRAY(dce->dce_hints, rank_nr); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) dce->dce_hints[i] = (uint8_t)(-1); md.omd_id = oid.id_pub; @@ -220,7 +220,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt goto out; } - for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) { + for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) { if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) continue; diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 0df39f0e510..95695d2b027 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -281,7 +281,7 @@ int pool_map_find_target(struct pool_map *map, uint32_t id, struct pool_target **target_pp); int pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, struct pool_domain **domain_pp); -int pool_map_find_nodes(struct pool_map *map, uint32_t id, +int pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp); int pool_map_find_tgts_by_state(struct pool_map *map, pool_comp_state_t match_states, @@ -311,7 +311,7 @@ bool pool_map_node_status_match(struct pool_domain *dom, unsigned int status); struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank); +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank); int pool_map_find_by_rank_status(struct pool_map *map, struct pool_target ***tgt_ppp, @@ -339,9 +339,9 @@ pool_map_target_nr(struct pool_map *map) } static inline unsigned int -pool_map_node_nr(struct pool_map *map) +pool_map_rank_nr(struct pool_map *map) { - return pool_map_find_nodes(map, PO_COMP_ID_ALL, NULL); + return pool_map_find_ranks(map, PO_COMP_ID_ALL, NULL); } /* diff --git a/src/include/daos_prop.h b/src/include/daos_prop.h index c6ca94f84c1..3b7216efd0e 100644 --- a/src/include/daos_prop.h +++ b/src/include/daos_prop.h @@ -464,11 +464,12 @@ enum { /** container redundancy factor */ enum { - DAOS_PROP_CO_REDUN_RF0, - DAOS_PROP_CO_REDUN_RF1, - DAOS_PROP_CO_REDUN_RF2, - DAOS_PROP_CO_REDUN_RF3, - DAOS_PROP_CO_REDUN_RF4, + DAOS_PROP_CO_REDUN_RF0 = 0, + DAOS_PROP_CO_REDUN_RF1 = 1, + DAOS_PROP_CO_REDUN_RF2 = 2, + DAOS_PROP_CO_REDUN_RF3 = 3, + DAOS_PROP_CO_REDUN_RF4 = 4, + DAOS_RF_MAX = 4, }; /** diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 12ba634813a..e05abadf3cf 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -139,7 +139,7 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool D_ASSERT(coa->coa_dcts == NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - pool_ranks = pool_map_node_nr(pool->dp_map); + pool_ranks = 
pool_map_rank_nr(pool->dp_map); D_RWLOCK_UNLOCK(&pool->dp_map_lock); D_RWLOCK_RDLOCK(&obj->cob_lock); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index a63a11d574b..2a152b47bd6 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -291,7 +291,7 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct D_GOTO(out, rc = -DER_INVAL); } - size = pool_map_node_nr(map->pl_poolmap); + size = pool_map_rank_nr(map->pl_poolmap); D_ALLOC_ARRAY(dce->dce_hints, size); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/pool/cli.c b/src/pool/cli.c index 5345017f824..85fa718aa1c 100644 --- a/src/pool/cli.c +++ b/src/pool/cli.c @@ -503,7 +503,7 @@ update_rsvc_client(struct dc_pool *pool) { struct subtract_rsvc_rank_arg arg; - arg.srra_nodes_len = pool_map_find_nodes(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); + arg.srra_nodes_len = pool_map_find_ranks(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); /* There must be at least one rank. */ D_ASSERTF(arg.srra_nodes_len > 0, "%d > 0\n", arg.srra_nodes_len); @@ -2016,7 +2016,7 @@ choose_map_refresh_rank(struct map_refresh_arg *arg) if (arg->mra_n <= 0) return CRT_NO_RANK; - n = pool_map_find_nodes(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); + n = pool_map_find_ranks(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); /* There must be at least one rank. */ D_ASSERTF(n > 0, "%d\n", n); diff --git a/src/pool/rpc.h b/src/pool/rpc.h index cf763b896dc..cfddcc48931 100644 --- a/src/pool/rpc.h +++ b/src/pool/rpc.h @@ -147,6 +147,16 @@ CRT_RPC_DECLARE(pool_op, DAOS_ISEQ_POOL_OP, DAOS_OSEQ_POOL_OP) CRT_RPC_DECLARE(pool_create, DAOS_ISEQ_POOL_CREATE, DAOS_OSEQ_POOL_CREATE) /* clang-format on */ + +/* the source of pool map update operation */ +enum map_update_source { + MUS_SWIM = 0, + /* May need to differentiate from administrator/csum scrubber/nvme healthy monitor later. + * Now all non-swim cases fall to DMG category. 
+ */ + MUS_DMG, +}; + enum map_update_opc { MAP_EXCLUDE = 0, MAP_DRAIN, diff --git a/src/pool/srv.c b/src/pool/srv.c index 2a45f4dec05..7e5548e8508 100644 --- a/src/pool/srv.c +++ b/src/pool/srv.c @@ -19,7 +19,12 @@ #include "rpc.h" #include "srv_internal.h" #include "srv_layout.h" -bool ec_agg_disabled; + +bool ec_agg_disabled; +uint32_t pw_rf; /* pool wise RF */ +#define PW_RF_DEFAULT (2) +#define PW_RF_MIN (1) +#define PW_RF_MAX (4) static int init(void) @@ -47,6 +52,15 @@ init(void) if (unlikely(ec_agg_disabled)) D_WARN("EC aggregation is disabled.\n"); + pw_rf = PW_RF_DEFAULT; + d_getenv_uint32_t("DAOS_POOL_RF", &pw_rf); + if (pw_rf < PW_RF_MIN || pw_rf > PW_RF_MAX) { + D_INFO("pw_rf %d is out of range [%d, %d], take default %d\n", + pw_rf, PW_RF_MIN, PW_RF_MAX, PW_RF_DEFAULT); + pw_rf = PW_RF_DEFAULT; + } + D_INFO("pool wise RF %d\n", pw_rf); + ds_pool_rsvc_class_register(); bio_register_ract_ops(&nvme_reaction_ops); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index c09d2ffcaea..8f864c8c11a 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -16,6 +16,8 @@ #include #include +extern uint32_t pw_rf; + /** * Global pool metrics */ diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 54e29767347..667e4bc6ed6 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1355,11 +1355,11 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MGMT, "ignore exclusion of self\n"); + D_DEBUG(DB_MD, "ignore exclusion of self\n"); goto out; } - D_DEBUG(DB_MD, DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), + D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), DP_PS_EVENT(event)); if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { @@ -1381,8 +1381,8 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) * and does not have a copy of the pool map. */ ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", DP_UUID(svc->ps_uuid), - event->psv_rank); + D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); } else if (event->psv_type == CRT_EVT_DEAD) { rc = pool_svc_exclude_rank(svc, event->psv_rank); if (rc != 0) @@ -1809,7 +1809,7 @@ pool_svc_check_node_status(struct pool_svc *svc) D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - doms_cnt = pool_map_find_nodes(svc->ps_pool->sp_map, PO_COMP_ID_ALL, + doms_cnt = pool_map_find_ranks(svc->ps_pool->sp_map, PO_COMP_ID_ALL, &doms); D_ASSERT(doms_cnt >= 0); for (i = 0; i < doms_cnt; i++) { @@ -6500,6 +6500,49 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma return 0; } +static int +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +{ + crt_group_t *primary_grp; + struct pool_domain *doms; + int doms_cnt; + int i; + int rc = 0; + + D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); + D_ASSERT(doms_cnt >= 0); + primary_grp = crt_group_lookup(NULL); + D_ASSERT(primary_grp != NULL); + + D_CRIT("!!! Please try to recover these engines in top priority -\n"); + D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); + D_CRIT("!!! 
pool "DF_UUID": intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), rank); + for (i = 0; i < doms_cnt; i++) { + struct swim_member_state state; + + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || + (doms[i].do_comp.co_rank == rank)) + continue; + + rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); + if (rc != 0 && rc != -DER_NONEXIST) { + D_ERROR("failed to get status of rank %u: %d\n", + doms[i].do_comp.co_rank, rc); + break; + } + + D_DEBUG(DB_MD, "rank/state %d/%d\n", doms[i].do_comp.co_rank, + rc == -DER_NONEXIST ? -1 : state.sms_status); + if (rc == -DER_NONEXIST || state.sms_status == SWIM_MEMBER_DEAD) + D_CRIT("!!! pool "DF_UUID" : intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), doms[i].do_comp.co_rank); + } + + return rc; +} + /* * Perform an update to the pool map of \a svc. * @@ -6532,7 +6575,8 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, struct pool_target_addr_list *tgt_addrs, struct rsvc_hint *hint, bool *p_updated, uint32_t *map_version_p, uint32_t *tgt_map_ver, - struct pool_target_addr_list *inval_tgt_addrs) + struct pool_target_addr_list *inval_tgt_addrs, + enum map_update_source src) { struct rdb_tx tx; struct pool_map *map; @@ -6628,7 +6672,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * If the map modification affects myself, leave it to a new PS leader * if there's another PS replica, or reject it. */ - node = pool_map_find_node_by_rank(map, dss_self_rank()); + node = pool_map_find_dom_by_rank(map, dss_self_rank()); if (node == NULL || !(node->do_comp.co_status & DC_POOL_SVC_MAP_STATES)) { d_rank_list_t *replicas; @@ -6653,6 +6697,33 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } + /* For SWIM exclude, don't change pool map if the pw_rf is broken or is going to be broken, + * with CRIT log message to ask administrator to bring back the engine. + */ + if (src == MUS_SWIM && opc == MAP_EXCLUDE) { + d_rank_t rank; + int failed_cnt; + + rc = pool_map_update_failed_cnt(map); + if (rc != 0) { + DL_ERROR(rc, DF_UUID": pool_map_update_failed_cnt failed.", + DP_UUID(svc->ps_uuid)); + goto out_map; + } + + D_ASSERT(tgt_addrs->pta_number == 1); + rank = tgt_addrs->pta_addrs->pta_rank; + failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); + D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", + DP_UUID(svc->ps_uuid), rank, failed_cnt); + if (failed_cnt > pw_rf) { + D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); + rc = pool_map_crit_prompt(svc, map, rank); + goto out_map; + } + } + /* Write the new pool map. 
*/ rc = pool_buf_extract(map, &map_buf); if (rc != 0) @@ -6809,7 +6880,7 @@ pool_update_map_internal(uuid_t pool_uuid, unsigned int opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, NULL, 0, NULL, tgts, tgt_addrs, hint, p_updated, map_version_p, tgt_map_ver, - inval_tgt_addrs); + inval_tgt_addrs, MUS_DMG); pool_svc_put_leader(svc); return rc; @@ -6859,8 +6930,8 @@ static int pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, d_rank_list_t *extend_rank_list, uint32_t *extend_domains, uint32_t extend_domains_nr, struct pool_target_addr_list *list, - struct pool_target_addr_list *inval_list_out, - uint32_t *map_version, struct rsvc_hint *hint) + struct pool_target_addr_list *inval_list_out, uint32_t *map_version, + struct rsvc_hint *hint, enum map_update_source src) { struct pool_target_id_list target_list = { 0 }; daos_prop_t prop = { 0 }; @@ -6875,7 +6946,7 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, extend_rank_list, extend_domains_nr, extend_domains, &target_list, list, hint, &updated, - map_version, &tgt_map_ver, inval_list_out); + map_version, &tgt_map_ver, inval_list_out, src); if (rc) D_GOTO(out, rc); @@ -6962,10 +7033,9 @@ ds_pool_extend_handler(crt_rpc_t *rpc) goto out; rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), - false /* exclude_rank */, - &rank_list, domains, ndomains, + false /* exclude_rank */, &rank_list, domains, ndomains, NULL, NULL, &out->peo_op.po_map_version, - &out->peo_op.po_hint); + &out->peo_op.po_hint, MUS_DMG); pool_svc_put_leader(svc); out: @@ -7067,7 +7137,7 @@ ds_pool_update_handler(crt_rpc_t *rpc, int handler_version) rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), false /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &out->pto_op.po_map_version, - &out->pto_op.po_hint); + &out->pto_op.po_hint, MUS_DMG); if (rc != 0) goto out_svc; @@ -7112,7 +7182,7 @@ pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, - NULL /* hint */); + NULL /* hint */, MUS_SWIM); D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", DP_UUID(svc->ps_uuid), map_version, rank, rc); diff --git a/src/pool/srv_pool_map.c b/src/pool/srv_pool_map.c index 1cb5632598f..9793df24f01 100644 --- a/src/pool/srv_pool_map.c +++ b/src/pool/srv_pool_map.c @@ -378,7 +378,7 @@ ds_pool_map_tgts_update(struct pool_map *map, struct pool_target_id_list *tgts, return -DER_NONEXIST; } - dom = pool_map_find_node_by_rank(map, target->ta_comp.co_rank); + dom = pool_map_find_dom_by_rank(map, target->ta_comp.co_rank); if (dom == NULL) { D_ERROR("Got request to change nonexistent rank %u" " in map %p\n", diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index e39072568e1..29f012d5844 100644 --- a/src/pool/srv_util.c +++ b/src/pool/srv_util.c @@ -21,19 +21,19 @@ int map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *ranks) { struct pool_domain *domains = NULL; - int nnodes; + int nranks; int n = 0; int i; d_rank_t *rs; - nnodes = pool_map_find_nodes((struct pool_map *)map, + nranks = pool_map_find_ranks((struct pool_map *)map, PO_COMP_ID_ALL, &domains); - if (nnodes == 0) { + if (nranks == 0) { D_ERROR("no nodes in pool map\n"); return -DER_IO; } - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & 
domains[i].do_comp.co_status) n++; } @@ -52,7 +52,7 @@ map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *r ranks->rl_ranks = rs; n = 0; - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & domains[i].do_comp.co_status) { D_ASSERT(n < ranks->rl_nr); ranks->rl_ranks[n] = domains[i].do_comp.co_rank; @@ -85,7 +85,7 @@ ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank) struct pool_domain *node; int rc; - rc = pool_map_find_nodes(map, rank, &node); + rc = pool_map_find_ranks(map, rank, &node); if (rc == 0) return false; D_ASSERTF(rc == 1, "%d\n", rc); @@ -921,7 +921,7 @@ testu_create_pool_map(d_rank_t *ranks, int n_ranks, d_rank_t *down_ranks, int n_ for (i = 0; i < n_down_ranks; i++) { struct pool_domain *d; - d = pool_map_find_node_by_rank(map, down_ranks[i]); + d = pool_map_find_dom_by_rank(map, down_ranks[i]); D_ASSERT(d != NULL); d->do_comp.co_status = PO_COMP_ST_DOWN; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index f40ea2d2fe3..b1f722b8254 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -654,7 +654,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, for (i = 0; i < excluded.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { @@ -767,7 +767,7 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t uint32_t opc, struct rebuild_global_pool_tracker **p_rgt) { struct rebuild_global_pool_tracker *rgt; - int node_nr; + int rank_nr; struct pool_domain *doms; int i; int rc = 0; @@ -777,11 +777,11 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t return -DER_NOMEM; D_INIT_LIST_HEAD(&rgt->rgt_list); - node_nr = pool_map_find_nodes(pool->sp_map, PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(rgt->rgt_servers, node_nr); + D_ALLOC_ARRAY(rgt->rgt_servers, rank_nr); if (rgt->rgt_servers == NULL) D_GOTO(out, rc = -DER_NOMEM); @@ -793,9 +793,9 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t if (rc != ABT_SUCCESS) D_GOTO(out, rc = dss_abterr2der(rc)); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_number = node_nr; + rgt->rgt_servers_number = rank_nr; uuid_copy(rgt->rgt_pool_uuid, pool->sp_uuid); rgt->rgt_rebuild_ver = ver; @@ -964,7 +964,7 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker for (i = 0; i < up_ranks.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); D_ASSERT(dom != NULL); D_DEBUG(DB_REBUILD, "rank %u ver %u rebuild %u\n", up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, rgt->rgt_rebuild_ver); diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 248617c1b36..440ffe68f82 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -434,6 +434,7 @@ class EngineYamlParameters(YamlParameters): REQUIRED_ENV_VARS = { "common": [ "D_LOG_FILE_APPEND_PID=1", + "DAOS_POOL_RF=4", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], 
"ofi+tcp;ofi_rxm": [], From ee9a06d359fd31509c5b1e1870cdb21398bcde6a Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Sat, 7 Sep 2024 03:42:00 +0800 Subject: [PATCH 09/12] DAOS-16486 object: return proper error on stale pool map (#15064) (#15084) Client with stale pool map may try to send RPC to a DOWN target, if the target was brought DOWN due to faulty NVMe device, the ds_pool_child could have been stopped on the NVMe faulty reaction, We'd ensure proper error code is returned for such case. Signed-off-by: Niu Yawei --- src/dtx/tests/srv_mock.c | 7 +++++++ src/include/daos_srv/pool.h | 2 ++ src/object/srv_obj.c | 31 ++++++++++++++++++++++++++++++- src/pool/srv_target.c | 15 +++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/dtx/tests/srv_mock.c b/src/dtx/tests/srv_mock.c index 245b3b11513..3d4ac70d773 100644 --- a/src/dtx/tests/srv_mock.c +++ b/src/dtx/tests/srv_mock.c @@ -71,6 +71,13 @@ ds_pool_child_put(struct ds_pool_child *child) assert_true(false); } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + assert_true(false); + return NULL; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) { diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 07ca3c0dbc1..6cbe3873f0a 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -249,6 +249,8 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp); /* Find ds_pool_child in cache, hold one reference */ +struct ds_pool_child *ds_pool_child_find(const uuid_t uuid); +/* Find ds_pool_child in STARTING or STARTED state, hold one reference */ struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid); /* Put the reference held by ds_pool_child_lookup() */ void ds_pool_child_put(struct ds_pool_child *child); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index febd3d36ead..a51682b4785 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2170,8 +2170,37 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, int rc; rc = obj_ioc_init(pool_uuid, coh_uuid, cont_uuid, rpc, ioc); - if (rc) + if (rc) { + DL_ERROR(rc, "Failed to initialize object I/O context."); + + /* + * Client with stale pool map may try to send RPC to a DOWN target, if the + * target was brought DOWN due to faulty NVMe device, the ds_pool_child could + * have been stopped on the NVMe faulty reaction, then above obj_io_init() + * will fail with -DER_NO_HDL. + * + * We'd ensure proper error code is returned for such case. 
+ */ + poc = ds_pool_child_find(pool_uuid); + if (poc == NULL) { + D_ERROR("Failed to find pool:"DF_UUID"\n", DP_UUID(pool_uuid)); + return rc; + } + + if (rpc_map_ver < poc->spc_pool->sp_map_version) { + D_ERROR("Stale pool map version %u < %u from client.\n", + rpc_map_ver, poc->spc_pool->sp_map_version); + + /* Restart the DTX if using stale pool map */ + if (opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD) + rc = -DER_TX_RESTART; + else + rc = -DER_STALE; + } + + ds_pool_child_put(poc); return rc; + } poc = ioc->ioc_coc->sc_pool; D_ASSERT(poc != NULL); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 0b195216cf5..cfa837e8b2a 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -87,6 +87,21 @@ pool_child_lookup_noref(const uuid_t uuid) return NULL; } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + struct ds_pool_child *child; + + child = pool_child_lookup_noref(uuid); + if (child == NULL) { + D_ERROR(DF_UUID": Pool child isn't found.\n", DP_UUID(uuid)); + return child; + } + + child->spc_ref++; + return child; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) { From 70e43620ac6eb4b8465fd3e64cf591a7c002ae5e Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 7 Sep 2024 05:49:08 +0800 Subject: [PATCH 10/12] DAOS-16514 vos: fix coverity issue (#15083) (#15086) Fix Coverity issue 2555843 (explicit null dereference). Signed-off-by: Niu Yawei --- src/vos/vos_obj.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index fc0ff4fbe9c..c4f50a6705c 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -1700,7 +1700,8 @@ vos_obj_iter_prep(vos_iter_type_t type, vos_iter_param_t *param, return -DER_NOMEM; /* ip_hdl is dkey or akey tree open handle for vos_iterate_key() */ - if (!(param->ip_flags & VOS_IT_KEY_TREE)) { + if (param->ip_flags != VOS_IT_KEY_TREE) { + D_ASSERT(!(param->ip_flags & VOS_IT_KEY_TREE)); cont = vos_hdl2cont(param->ip_hdl); is_sysdb = cont->vc_pool->vp_sysdb; dth = vos_dth_get(is_sysdb); From 4e201a070dcf3d9124fe6cb30cbd57a6127e731b Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 9 Sep 2024 13:44:29 -0400 Subject: [PATCH 11/12] DAOS-16515 build: Tag 2.6.1 rc1 (#15103) Tag first release candidate for 2.6.1. 
Signed-off-by: Phil Henderson --- TAG | 2 +- VERSION | 2 +- debian/changelog | 6 ++++++ utils/rpms/daos.spec | 7 +++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/TAG b/TAG index 3b4c8028c74..07c46f93195 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.0-rc3 +2.6.1-rc1 diff --git a/VERSION b/VERSION index e70b4523ae7..6a6a3d8e35c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.0 +2.6.1 diff --git a/debian/changelog b/debian/changelog index fb08d568cbd..73df378df78 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.1-1) unstable; urgency=medium + [ Phillip Henderson ] + * First release candidate for 2.6.1 + + -- Phillip Henderson Mon, 09 Sep 2024 08:46:00 -0500 + daos (2.6.0-5) unstable; urgency=medium [ Tom Nabarro ] * Add pciutils runtime dep for daos_server lspci call diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index eb85dab4eef..1ae790f8623 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -14,8 +14,8 @@ %endif Name: daos -Version: 2.6.0 -Release: 5%{?relval}%{?dist} +Version: 2.6.1 +Release: 1%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -591,6 +591,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Mon Sep 09 2024 Phillip Henderson 2.6.1-1 +- First release candidate for 2.6.1 + * Thu Aug 08 2024 Tom Nabarro 2.6.0-5 - Add pciutils runtime dep for daos_server lspci call - Add pciutils-devel build dep for pciutils CGO bindings From f2c36ad394b991ea7e94916b839387ac0c09915e Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Mon, 9 Sep 2024 22:02:17 +0000 Subject: [PATCH 12/12] Revert "DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve. (#15077)" Not needed in our build (b/364929445). This reverts commit 3d9e2d0cafcb5f314135ba9e7a27dd541102f7c5. --- utils/build.config | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/build.config b/utils/build.config index 92c3bd673d3..f94d0e6a912 100644 --- a/utils/build.config +++ b/utils/build.config @@ -30,4 +30,3 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/857f1d5d2ca72d4c1b8d7be5e7fd26d6292b495f/na_ucx_am_send_retry.patch,https://github.com/mercury-hpc/mercury/commit/b8c26fd86281f3b0883c31bd2d0cb467a12b860d.diff,https://github.com/mercury-hpc/mercury/commit/a35589c3d1134d9c80640e78247e210162ac4a3c.diff,https://github.com/mercury-hpc/mercury/commit/fa4abbb6273d975b2ef17ac4e561fd4255d384db.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff -mercury=https://raw.githubusercontent.com/daos-stack/mercury/481297621bafbbcac4cc6f8feab3f1b6f8b14b59/na_ucx_keyres_epchk.patch
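For reference, the stale-pool-map handling added by the DAOS-16486 patch above reduces to one small decision rule. The standalone C program below is a minimal sketch of that rule only; the error numbers, the CPD opcode constant, and the function name stale_map_rc() are invented stand-ins for illustration rather than the DAOS definitions, and the real server code operates on the ds_pool_child and RPC structures shown in the patch.

#include <stdio.h>

/*
 * Simplified stand-ins for the DAOS error numbers and the CPD opcode;
 * the real values come from the DAOS headers and differ from these.
 */
enum {
	SKETCH_ERR_STALE      = 1,   /* stands in for -DER_STALE */
	SKETCH_ERR_TX_RESTART = 2,   /* stands in for -DER_TX_RESTART */
	SKETCH_OPC_CPD        = 100, /* stands in for DAOS_OBJ_RPC_CPD */
};

/*
 * Decision rule sketched from the DAOS-16486 patch: when I/O context setup
 * has failed and the client's pool map version is behind the server's,
 * report staleness; a compound (CPD) RPC is told to restart its transaction
 * so the client retries with a refreshed pool map. If the client map is
 * current, the original initialization error is returned unchanged.
 */
static int
stale_map_rc(unsigned int rpc_map_ver, unsigned int srv_map_ver, int opc, int init_rc)
{
	if (rpc_map_ver >= srv_map_ver)
		return init_rc;

	return (opc == SKETCH_OPC_CPD) ? -SKETCH_ERR_TX_RESTART : -SKETCH_ERR_STALE;
}

int
main(void)
{
	/* Client map 7 is behind server map 9: a plain I/O RPC sees the stale error. */
	printf("update rc = %d\n", stale_map_rc(7, 9, 0, -5));
	/* The same staleness on a CPD RPC asks the client to restart the transaction. */
	printf("cpd rc    = %d\n", stale_map_rc(7, 9, SKETCH_OPC_CPD, -5));
	/* Client map is current: the original initialization error is preserved. */
	printf("other rc  = %d\n", stale_map_rc(9, 9, 0, -5));
	return 0;
}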