From 35334aaeae0fac404ed4ca34899ec23679efc25c Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Wed, 4 Sep 2024 19:02:27 -0400 Subject: [PATCH 01/12] DAOS-16471 test: Reduce targets for ioctl_pool_handles.py (#15063) (#15071) The dfuse/ioctl_pool_handles.py test is overloading the VM, so reduce the number of engine targets. Signed-off-by: Phil Henderson --- src/tests/ftest/dfuse/ioctl_pool_handles.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/dfuse/ioctl_pool_handles.yaml b/src/tests/ftest/dfuse/ioctl_pool_handles.yaml index 2900f67c328..35752453850 100644 --- a/src/tests/ftest/dfuse/ioctl_pool_handles.yaml +++ b/src/tests/ftest/dfuse/ioctl_pool_handles.yaml @@ -16,7 +16,7 @@ server_config: 0: class: ram scm_mount: /mnt/daos - targets: 16 + targets: 8 system_ram_reserved: 1 pool: From 906f0a44b74359f0e1370640851f53d77df77f49 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 6 Sep 2024 00:27:23 +0800 Subject: [PATCH 02/12] DAOS-16483 vos: handle empty DTX when vos_tx_end - b26 (#15055) It is possible that the DTX modified nothing when stopping the current backend transaction. In such a case, we may not generate a persistent DTX entry, so we need to bypass it before checking the on-disk DTX entry status. The patch also does some cleanup and removes redundant metrics for committed DTX entries. Enhance vos_dtx_deregister_record() to handle the GC case. Signed-off-by: Fan Yong --- src/dtx/dtx_common.c | 2 +- src/tests/ftest/util/telemetry_utils.py | 1 - src/vos/vos_common.c | 31 ++++----- src/vos/vos_dtx.c | 86 ++++++++++++++++++++----- src/vos/vos_tls.h | 1 - 5 files changed, 83 insertions(+), 38 deletions(-) diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 353bd880009..ff4f2dfe4ef 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1341,7 +1341,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul * it persistently. Otherwise, the subsequent DTX resync may not find it as * to regard it as failed transaction and abort it. */ - if (result == 0 && !dth->dth_active && !dth->dth_prepared && + if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo && (dth->dth_dist || dth->dth_modification_cnt > 0)) { result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ?
true : false); if (unlikely(result < 0)) { diff --git a/src/tests/ftest/util/telemetry_utils.py b/src/tests/ftest/util/telemetry_utils.py index aec831b3b8a..db424b6de68 100644 --- a/src/tests/ftest/util/telemetry_utils.py +++ b/src/tests/ftest/util/telemetry_utils.py @@ -421,7 +421,6 @@ class TelemetryUtils(): ENGINE_NVME_CRIT_WARN_METRICS +\ ENGINE_NVME_INTEL_VENDOR_METRICS ENGINE_MEM_USAGE_METRICS = [ - "engine_mem_vos_dtx_cmt_ent_48", "engine_mem_vos_vos_obj_360", "engine_mem_vos_vos_lru_size", "engine_mem_dtx_dtx_leader_handle_360"] diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index fb8461e2931..93bf1757f10 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -405,16 +405,24 @@ vos_tx_end(struct vos_container *cont, struct dtx_handle *dth_in, } } else if (dae != NULL) { if (dth->dth_solo) { - if (err == 0 && cont->vc_solo_dtx_epoch < dth->dth_epoch) + if (err == 0 && dae->dae_committing && + cont->vc_solo_dtx_epoch < dth->dth_epoch) cont->vc_solo_dtx_epoch = dth->dth_epoch; vos_dtx_post_handle(cont, &dae, &dce, 1, false, err != 0); } else { D_ASSERT(dce == NULL); - if (err == 0) { - dae->dae_prepared = 1; + if (err == 0 && dth->dth_active) { + D_ASSERTF(!UMOFF_IS_NULL(dae->dae_df_off), + "Non-prepared DTX " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + dae_df = umem_off2ptr(umm, dae->dae_df_off); - D_ASSERT(!(dae_df->dae_flags & DTE_INVALID)); + D_ASSERTF(!(dae_df->dae_flags & DTE_INVALID), + "Invalid status for DTX " DF_DTI "\n", + DP_DTI(&dth->dth_xid)); + + dae->dae_prepared = 1; } } } @@ -563,13 +571,6 @@ vos_tls_init(int tags, int xs_id, int tgt_id) } } - rc = d_tm_add_metric(&tls->vtl_committed, D_TM_STATS_GAUGE, - "Number of committed entries kept around for reply" - " reconstruction", "entries", - "io/dtx/committed/tgt_%u", tgt_id); - if (rc) - D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", - DP_RC(rc)); if (tgt_id >= 0) { rc = d_tm_add_metric(&tls->vtl_committed, D_TM_STATS_GAUGE, "Number of committed entries kept around for reply" @@ -579,14 +580,6 @@ vos_tls_init(int tags, int xs_id, int tgt_id) D_WARN("Failed to create committed cnt sensor: "DF_RC"\n", DP_RC(rc)); - rc = d_tm_add_metric(&tls->vtl_dtx_cmt_ent_cnt, D_TM_GAUGE, - "Number of committed entries", "entry", - "mem/vos/dtx_cmt_ent_%u/tgt_%u", - sizeof(struct vos_dtx_cmt_ent), tgt_id); - if (rc) - D_WARN("Failed to create committed cnt: "DF_RC"\n", - DP_RC(rc)); - rc = d_tm_add_metric(&tls->vtl_obj_cnt, D_TM_GAUGE, "Number of cached vos object", "entry", "mem/vos/vos_obj_%u/tgt_%u", diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index 0e70133629f..1c60f781507 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -769,7 +769,6 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t daos_epoch_t cmt_time, struct vos_dtx_cmt_ent **dce_p, struct vos_dtx_act_ent **dae_p, bool *rm_cos, bool *fatal) { - struct vos_tls *tls = vos_tls_get(false); struct vos_dtx_act_ent *dae = NULL; struct vos_dtx_cmt_ent *dce = NULL; d_iov_t kiov; @@ -834,7 +833,6 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t if (dce == NULL) D_GOTO(out, rc = -DER_NOMEM); - d_tm_inc_gauge(tls->vtl_dtx_cmt_ent_cnt, 1); DCE_CMT_TIME(dce) = cmt_time; if (dae != NULL) { DCE_XID(dce) = DAE_XID(dae); @@ -1535,10 +1533,14 @@ int vos_dtx_deregister_record(struct umem_instance *umm, daos_handle_t coh, uint32_t entry, daos_epoch_t epoch, umem_off_t record) { + struct dtx_handle *dth = vos_dth_get(false); struct vos_container *cont; struct vos_dtx_act_ent *dae; + struct 
vos_dtx_act_ent_df *dae_df; + umem_off_t *rec_df; bool found; int count; + int rc; int i; if (!vos_dtx_is_normal_entry(entry)) @@ -1567,10 +1569,54 @@ vos_dtx_deregister_record(struct umem_instance *umm, daos_handle_t coh, * by another prepared (but non-committed) DTX, then do not allow current transaction * to modify it. Because if current transaction is aborted or failed for some reason, * there is no efficient way to recover such former non-committed DTX. + * + * If dth is NULL, then it is for GC. Under such case, deregister the record anyway. */ - if (dae->dae_dbd != NULL) - return dtx_inprogress(dae, vos_dth_get(cont->vc_pool->vp_sysdb), false, false, 8); + if (dae->dae_dbd != NULL) { + if (dth != NULL) + return dtx_inprogress(dae, dth, false, false, 8); + + dae_df = umem_off2ptr(umm, dae->dae_df_off); + D_ASSERT(!(dae_df->dae_flags & DTE_INVALID)); + if (dae_df->dae_rec_cnt > DTX_INLINE_REC_CNT) + count = DTX_INLINE_REC_CNT; + else + count = dae_df->dae_rec_cnt; + + rec_df = dae_df->dae_rec_inline; + for (i = 0; i < count; i++) { + if (record == umem_off2offset(rec_df[i])) { + rc = umem_tx_add_ptr(umm, &rec_df[i], sizeof(rec_df[i])); + if (rc != 0) + return rc; + + rec_df[i] = UMOFF_NULL; + goto cache; + } + } + + rec_df = umem_off2ptr(umm, dae_df->dae_rec_off); + if (rec_df == NULL) + /* If non-exist on disk, then must be non-exist in cache. */ + return 0; + + for (i = 0; i < dae_df->dae_rec_cnt - DTX_INLINE_REC_CNT; i++) { + if (record == umem_off2offset(rec_df[i])) { + rc = umem_tx_add_ptr(umm, &rec_df[i], sizeof(rec_df[i])); + if (rc != 0) + return rc; + + rec_df[i] = UMOFF_NULL; + goto cache; + } + } + + /* If non-exist on disk, then must be non-exist in cache. */ + return 0; + } + +cache: if (DAE_REC_CNT(dae) > DTX_INLINE_REC_CNT) count = DTX_INLINE_REC_CNT; else @@ -2116,14 +2162,18 @@ vos_dtx_post_handle(struct vos_container *cont, if (!abort && dces != NULL) { struct vos_tls *tls = vos_tls_get(false); + int j = 0; D_ASSERT(cont->vc_pool->vp_sysdb == false); for (i = 0; i < count; i++) { - if (dces[i] != NULL) { - cont->vc_dtx_committed_count++; - cont->vc_pool->vp_dtx_committed_count++; - d_tm_inc_gauge(tls->vtl_committed, 1); - } + if (dces[i] != NULL) + j++; + } + + if (j > 0) { + cont->vc_dtx_committed_count += j; + cont->vc_pool->vp_dtx_committed_count += j; + d_tm_inc_gauge(tls->vtl_committed, j); } } @@ -2439,6 +2489,7 @@ vos_dtx_aggregate(daos_handle_t coh) uint64_t epoch; umem_off_t dbd_off; umem_off_t next = UMOFF_NULL; + int count = 0; int rc; int i; @@ -2481,13 +2532,10 @@ vos_dtx_aggregate(daos_handle_t coh) UMOFF_P(dbd_off), DP_RC(rc)); goto out; } - - cont->vc_dtx_committed_count--; - cont->vc_pool->vp_dtx_committed_count--; - d_tm_dec_gauge(tls->vtl_committed, 1); - d_tm_dec_gauge(tls->vtl_dtx_cmt_ent_cnt, 1); } + count = dbd->dbd_count; + if (epoch != cont_df->cd_newest_aggregated) { rc = umem_tx_add_ptr(umm, &cont_df->cd_newest_aggregated, sizeof(cont_df->cd_newest_aggregated)); @@ -2545,8 +2593,14 @@ vos_dtx_aggregate(daos_handle_t coh) out: rc = umem_tx_end(umm, rc); - if (rc == 0 && cont->vc_cmt_dtx_reindex_pos == dbd_off) - cont->vc_cmt_dtx_reindex_pos = next; + if (rc == 0) { + if (cont->vc_cmt_dtx_reindex_pos == dbd_off) + cont->vc_cmt_dtx_reindex_pos = next; + + cont->vc_dtx_committed_count -= count; + cont->vc_pool->vp_dtx_committed_count -= count; + d_tm_dec_gauge(tls->vtl_committed, count); + } DL_CDEBUG(rc != 0, DLOG_ERR, DB_IO, rc, "Release DTX committed blob %p (" UMOFF_PF ") for cont " DF_UUID, dbd, diff --git a/src/vos/vos_tls.h 
b/src/vos/vos_tls.h index 981cce10be5..2fc328457d0 100644 --- a/src/vos/vos_tls.h +++ b/src/vos/vos_tls.h @@ -64,7 +64,6 @@ struct vos_tls { }; struct d_tm_node_t *vtl_committed; struct d_tm_node_t *vtl_obj_cnt; - struct d_tm_node_t *vtl_dtx_cmt_ent_cnt; struct d_tm_node_t *vtl_lru_alloc_size; }; From 3d9e2d0cafcb5f314135ba9e7a27dd541102f7c5 Mon Sep 17 00:00:00 2001 From: Joseph Moore <26410038+jgmoore-or@users.noreply.github.com> Date: Thu, 5 Sep 2024 14:39:17 -0600 Subject: [PATCH 03/12] DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve. (#15077) Signed-off-by: Joseph Moore --- utils/build.config | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/build.config b/utils/build.config index 174df687036..c38d49a267a 100644 --- a/utils/build.config +++ b/utils/build.config @@ -29,3 +29,4 @@ ucx=https://github.com/openucx/ucx.git spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff +mercury=https://raw.githubusercontent.com/daos-stack/mercury/481297621bafbbcac4cc6f8feab3f1b6f8b14b59/na_ucx_keyres_epchk.patch From 185ba8f3055ff7639fc472f5ba1210a8034f9354 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 5 Sep 2024 16:28:11 -0700 Subject: [PATCH 04/12] DAOS-16457 test: remove display_memory_info (#15031) (#15075) display_memory_info was added to debug an issue when starting the servers, but resolved by #14295. It is no longer needed and consumes too much log space and time. Signed-off-by: Dalton Bohning --- src/tests/ftest/util/server_utils.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 6ae05af94e9..752473021a3 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -464,14 +464,6 @@ def support_collect_log(self, **kwargs): return run_remote( self.log, self._hosts, cmd.with_exports, timeout=self.collect_log_timeout.value) - def display_memory_info(self): - """Display server hosts memory info.""" - self.log.debug("#" * 80) - self.log.debug(" Collection debug memory info") - run_remote(self.log, self._hosts, "free -m && df -h --type=tmpfs") - run_remote(self.log, self._hosts, "ps -eo size,pid,user,command --sort -size | head -n 6") - self.log.debug("#" * 80) - def detect_format_ready(self, reformat=False): """Detect when all the daos_servers are ready for storage format. @@ -664,14 +656,11 @@ def start(self): self.prepare() # Start the servers and wait for them to be ready for storage format - self.display_memory_info() self.detect_format_ready() # Collect storage and network information from the servers. - self.display_memory_info() self.information.collect_storage_information() self.information.collect_network_information() - self.display_memory_info() # Format storage and wait for server to change ownership self.log.info(" Formatting hosts: <%s>", self.dmg.hostlist) @@ -711,9 +700,6 @@ def stop(self): # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() - # Collective memory usage after stop. 
- self.display_memory_info() - # Report any errors after all stop actions have been attempted if messages: raise ServerFailed("Failed to stop servers:\n {}".format("\n ".join(messages))) From e1b6a7e88c142c73986eadb2d1c6312c5092ee4b Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Fri, 6 Sep 2024 09:43:04 +0800 Subject: [PATCH 05/12] DAOS-16458 object: fix invalid DRAM access in obj_bulk_transfer - b26 (#15054) For EC object update via CPD RPC, when calculate the bitmap to skip some iods for current EC data shard, we may input NULL for "*skips" parameter. It may cause the old logic in obj_get_iods_offs_by_oid() to generate some undefined DRAM for "skips" bitmap. Such bitmap may be over-written by others, as to subsequent obj_bulk_transfer() may be misguided. The patch also fixes a bug inside obj_bulk_transfer() that cast any input RPC as UPDATE/FETCH by force. Signed-off-by: Fan Yong --- src/object/srv_coll.c | 2 +- src/object/srv_internal.h | 2 +- src/object/srv_obj.c | 41 ++++++++++++++++++++++++--------------- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 9e421810861..a63a11d574b 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -183,7 +183,7 @@ obj_coll_punch_bulk(crt_rpc_t *rpc, d_iov_t *iov, crt_proc_t *p_proc, sgl.sg_iovs = iov; rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &ocpi->ocpi_tgt_bulk, NULL, NULL, - DAOS_HDL_INVAL, &sgls, 1, NULL, NULL); + DAOS_HDL_INVAL, &sgls, 1, 1, NULL, NULL); if (rc != 0) { D_ERROR("Failed to prepare bulk transfer for coll_punch, size %u: "DF_RC"\n", ocpi->ocpi_bulk_tgt_sz, DP_RC(rc)); diff --git a/src/object/srv_internal.h b/src/object/srv_internal.h index 6f13e3f36dc..a24986247a5 100644 --- a/src/object/srv_internal.h +++ b/src/object/srv_internal.h @@ -280,7 +280,7 @@ typedef int (*ds_iofw_cb_t)(crt_rpc_t *req, void *arg); int obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bulk_t *remote_bulks, uint64_t *remote_offs, uint8_t *skips, - daos_handle_t ioh, d_sg_list_t **sgls, int sgl_nr, + daos_handle_t ioh, d_sg_list_t **sgls, int sgl_nr, int bulk_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh); int obj_tgt_punch(struct obj_tgt_punch_args *otpa, uint32_t *shards, uint32_t count); int obj_tgt_query(struct obj_tgt_query_args *otqa, uuid_t po_uuid, uuid_t co_hdl, uuid_t co_uuid, diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 0a246bebdca..febd3d36ead 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -488,22 +488,24 @@ bulk_transfer_sgl(daos_handle_t ioh, crt_rpc_t *rpc, crt_bulk_t remote_bulk, int obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bulk_t *remote_bulks, uint64_t *remote_offs, uint8_t *skips, daos_handle_t ioh, d_sg_list_t **sgls, - int sgl_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh) + int sgl_nr, int bulk_nr, struct obj_bulk_args *p_arg, struct ds_cont_hdl *coh) { - struct obj_rw_in *orw = crt_req_get(rpc); struct obj_bulk_args arg = { 0 }; int i, rc, *status, ret; int skip_nr = 0; - int bulk_nr; bool async = true; uint64_t time = daos_get_ntime(); + if (unlikely(sgl_nr > bulk_nr)) { + D_ERROR("Invalid sgl_nr vs bulk_nr: %d/%d\n", sgl_nr, bulk_nr); + return -DER_INVAL; + } + if (remote_bulks == NULL) { D_ERROR("No remote bulks provided\n"); return -DER_INVAL; } - bulk_nr = orw->orw_bulks.ca_count; if (p_arg == NULL) { p_arg = &arg; async = false; @@ -514,7 +516,7 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul 
return dss_abterr2der(rc); p_arg->inited = true; - D_DEBUG(DB_IO, "bulk_op %d sgl_nr %d\n", bulk_op, sgl_nr); + D_DEBUG(DB_IO, "bulk_op %d, sgl_nr %d, bulk_nr %d\n", bulk_op, sgl_nr, bulk_nr); p_arg->bulks_inflight++; @@ -542,9 +544,9 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul while (skips != NULL && isset(skips, i + skip_nr)) skip_nr++; - if (bulk_nr > 0) - D_ASSERTF(i + skip_nr < bulk_nr, "i %d, skip_nr %d, bulk_nr %d\n", - i, skip_nr, bulk_nr); + D_ASSERTF(i + skip_nr < bulk_nr, "i %d, skip_nr %d, sgl_nr %d, bulk_nr %d\n", + i, skip_nr, sgl_nr, bulk_nr); + if (remote_bulks[i + skip_nr] == NULL) continue; @@ -574,6 +576,12 @@ obj_bulk_transfer(crt_rpc_t *rpc, crt_bulk_op_t bulk_op, bool bulk_bind, crt_bul break; } } + + if (skips != NULL) + D_ASSERTF(skip_nr + sgl_nr <= bulk_nr, + "Unmatched skip_nr %d, sgl_nr %d, bulk_nr %d\n", + skip_nr, sgl_nr, bulk_nr); + done: if (--(p_arg->bulks_inflight) == 0) ABT_eventual_set(p_arg->eventual, &rc, sizeof(rc)); @@ -836,7 +844,7 @@ obj_echo_rw(crt_rpc_t *rpc, daos_iod_t *iod, uint64_t *off) /* Only support 1 iod now */ bulk_bind = orw->orw_flags & ORF_BULK_BIND; rc = obj_bulk_transfer(rpc, bulk_op, bulk_bind, orw->orw_bulks.ca_arrays, off, - NULL, DAOS_HDL_INVAL, &p_sgl, 1, NULL, NULL); + NULL, DAOS_HDL_INVAL, &p_sgl, 1, 1, NULL, NULL); out: orwo->orw_ret = rc; orwo->orw_map_version = orw->orw_map_ver; @@ -1636,7 +1644,8 @@ obj_local_rw_internal(crt_rpc_t *rpc, struct obj_io_context *ioc, daos_iod_t *io if (rma) { bulk_bind = orw->orw_flags & ORF_BULK_BIND; rc = obj_bulk_transfer(rpc, bulk_op, bulk_bind, orw->orw_bulks.ca_arrays, offs, - skips, ioh, NULL, iods_nr, NULL, ioc->ioc_coh); + skips, ioh, NULL, iods_nr, orw->orw_bulks.ca_count, NULL, + ioc->ioc_coh); if (rc == 0) { bio_iod_flush(biod); @@ -1809,7 +1818,7 @@ obj_get_iods_offs_by_oid(daos_unit_oid_t uoid, struct obj_iod_array *iod_array, } } if (oiod_nr > LOCAL_SKIP_BITS_NUM || *skips == NULL) { - D_ALLOC(*skips, roundup(oiod_nr / NBBY, 4)); + D_ALLOC(*skips, (oiod_nr + NBBY - 1) / NBBY); if (*skips == NULL) D_GOTO(out, rc = -DER_NOMEM); } @@ -2448,7 +2457,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) goto end; } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &oer->er_bulk, NULL, NULL, - ioh, NULL, 1, NULL, ioc.ioc_coh); + ioh, NULL, 1, 1, NULL, ioc.ioc_coh); if (rc) D_ERROR(DF_UOID " bulk transfer failed: " DF_RC "\n", DP_UOID(oer->er_oid), DP_RC(rc)); @@ -2526,7 +2535,7 @@ ds_obj_ec_agg_handler(crt_rpc_t *rpc) goto end; } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &oea->ea_bulk, - NULL, NULL, ioh, NULL, 1, NULL, ioc.ioc_coh); + NULL, NULL, ioh, NULL, 1, 1, NULL, ioc.ioc_coh); if (rc) D_ERROR(DF_UOID " bulk transfer failed: " DF_RC "\n", DP_UOID(oea->ea_oid), DP_RC(rc)); @@ -3275,7 +3284,7 @@ obj_enum_reply_bulk(crt_rpc_t *rpc) return 0; rc = obj_bulk_transfer(rpc, CRT_BULK_PUT, false, bulks, NULL, NULL, - DAOS_HDL_INVAL, sgls, idx, NULL, NULL); + DAOS_HDL_INVAL, sgls, idx, idx, NULL, NULL); if (oei->oei_kds_bulk) { D_FREE(oeo->oeo_kds.ca_arrays); oeo->oeo_kds.ca_count = 0; @@ -4560,7 +4569,7 @@ ds_cpd_handle_one(crt_rpc_t *rpc, struct daos_cpd_sub_head *dcsh, struct daos_cp rc = obj_bulk_transfer(rpc, CRT_BULK_GET, dcu->dcu_flags & ORF_BULK_BIND, dcu->dcu_bulks, poffs[i], pskips[i], iohs[i], NULL, - piod_nrs[i], &bulks[i], ioc->ioc_coh); + piod_nrs[i], dcsr->dcsr_nr, &bulks[i], ioc->ioc_coh); if (rc != 0) { D_ERROR("Bulk transfer failed for obj " DF_UOID", DTX "DF_DTI": "DF_RC"\n", @@ -5276,7 +5285,7 @@ ds_obj_cpd_body_bulk(crt_rpc_t *rpc, 
struct obj_io_context *ioc, bool leader, } rc = obj_bulk_transfer(rpc, CRT_BULK_GET, ORF_BULK_BIND, bulks, NULL, NULL, - DAOS_HDL_INVAL, sgls, count, NULL, ioc->ioc_coh); + DAOS_HDL_INVAL, sgls, count, count, NULL, ioc->ioc_coh); if (rc != 0) goto out; From aa811d73facfe0d2bf491ec26cf80d19bdcf3e11 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Fri, 6 Sep 2024 16:40:05 +0800 Subject: [PATCH 06/12] DAOS-15863 container: fix a race for container cache (#15038) (#15065) * DAOS-15863 container: fix a race for container cache while destroying a container, cont_child_destroy_one() releases its own refcount before waiting, if another ULT releases its refcount, which is the last one, wakes up the waiting ULT and frees it ds_cont_child straightaway, because no one else has refcount. When the waiting ULT is waken up, it will try to change the already freed ds_cont_child. This patch changes the LRU eviction logic and fixes this race. Signed-off-by: Liang Zhen Signed-off-by: Jeff Olivier Co-authored-by: Jeff Olivier --- src/common/lru.c | 54 ++++++++++++++++---------------------- src/container/srv_target.c | 4 ++- src/include/daos/lru.h | 38 +++++++-------------------- 3 files changed, 35 insertions(+), 61 deletions(-) diff --git a/src/common/lru.c b/src/common/lru.c index bb270500ab7..de86d367e0e 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -36,7 +36,10 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link) D_ASSERT(llink->ll_ref > 0); llink->ll_ref--; - if (llink->ll_ref == 1 && llink->ll_ops->lop_wakeup) + + /* eviction waiter is the last one holds refcount */ + if (llink->ll_wait_evict && + llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink)) llink->ll_ops->lop_wakeup(llink); /* Delete from hash only if no more references */ @@ -215,15 +218,6 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key, if (link != NULL) { llink = link2llink(link); D_ASSERT(llink->ll_evicted == 0); - if (llink->ll_evicting) { - /** - * Avoid calling `lru_hop_rec_decref()` at this point - * to prevent `wakeup()` from being invoked twice. 
- */ - D_ASSERT(llink->ll_ref > 1); - llink->ll_ref--; - D_GOTO(out, rc = -DER_SHUTDOWN); - } /* remove busy item from LRU */ if (!d_list_empty(&llink->ll_qlink)) d_list_del_init(&llink->ll_qlink); @@ -257,24 +251,17 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key, return rc; } -static void -lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink, bool wait) +void +daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) { D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1); D_ASSERT(d_list_empty(&llink->ll_qlink)); lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link); - if (wait && llink->ll_ref > 1) { - D_ASSERT(llink->ll_evicting == 0); - llink->ll_evicting = 1; - lcache->dlc_ops->lop_wait(llink); - llink->ll_evicting = 0; - llink->ll_evicted = 1; - } - if (llink->ll_ref == 1) { /* the last refcount */ - if (lcache->dlc_csize == 0) + /* zero-sized cache always evicts unused item */ + if (lcache->dlc_csize == 0 && !llink->ll_evicted) llink->ll_evicted = 1; if (llink->ll_evicted) { @@ -297,15 +284,20 @@ lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink } void -daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink) -{ - lru_ref_release_internal(lcache, llink, false); -} - -void -daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) +daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink) { - D_ASSERT(lcache->dlc_ops->lop_wait); - - lru_ref_release_internal(lcache, llink, true); + if (!llink->ll_evicted) + daos_lru_ref_evict(lcache, llink); + + if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) { + /* Wait until I'm the last one. + * XXX: the implementation can only support one waiter for now, if there + * is a secondary ULT calls this function on the same item, it will hit + * the assertion. + */ + D_ASSERT(!llink->ll_wait_evict); + llink->ll_wait_evict = 1; + lcache->dlc_ops->lop_wait(llink); + llink->ll_wait_evict = 0; + } } diff --git a/src/container/srv_target.c b/src/container/srv_target.c index b5abcc2d759..f3ef47c8447 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1261,7 +1261,9 @@ cont_child_destroy_one(void *vin) D_GOTO(out_pool, rc = -DER_BUSY); } /* else: resync should have completed, try again */ - daos_lru_ref_wait_evict(tls->dt_cont_cache, &cont->sc_list); + /* nobody should see it again after eviction */ + daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list); + daos_lru_ref_release(tls->dt_cont_cache, &cont->sc_list); } D_DEBUG(DB_MD, DF_CONT": destroying vos container\n", diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index 03b1eb90e4c..40bee5c492b 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -37,8 +37,8 @@ struct daos_llink { d_list_t ll_link; /**< LRU hash link */ d_list_t ll_qlink; /**< Temp link for traverse */ uint32_t ll_ref; /**< refcount for this ref */ - uint32_t ll_evicted:1, /**< has been evicted */ - ll_evicting:1; /**< been evicting */ + uint32_t ll_evicted:1; /**< has been evicted */ + uint32_t ll_wait_evict:1; /**< wait for completion of eviction */ struct daos_llink_ops *ll_ops; /**< ops to maintain refs */ }; @@ -121,26 +121,7 @@ void daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink); /** - * Evicts the LRU link from the DAOS LRU cache after waiting - * for all references to be released. 
- * - * \param[in] lcache DAOS LRU cache - * \param[in] llink DAOS LRU link to be evicted - * - */ -void -daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink); - -/** - * Flush old items from LRU. - * - * \param[in] lcache DAOS LRU cache - */ -void -daos_lru_ref_flush(struct daos_lru_cache *lcache); - -/** - * Evict the item from LRU after releasing the last refcount on it. + * Evict the item from LRU before releasing the refcount on it. * * \param[in] lcache DAOS LRU cache * \param[in] llink DAOS LRU item to be evicted @@ -153,15 +134,14 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) } /** - * Check if a LRU element has been evicted or not + * Evict the item from LRU before releasing the refcount on it, wait until + * the caller is the last one holds refcount. * - * \param[in] llink DAOS LRU item to check + * \param[in] lcache DAOS LRU cache + * \param[in] llink DAOS LRU item to be evicted */ -static inline bool -daos_lru_ref_evicted(struct daos_llink *llink) -{ - return llink->ll_evicted; -} +void +daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink); /** * Increase a usage reference to the LRU element From f7d35232ccdaefb48a06ffee002f32c38d99dc69 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Fri, 6 Sep 2024 11:11:11 -0400 Subject: [PATCH 07/12] DAOS-16484 test: Support mixed speeds when selecting a default interface (#15050) (#15080) Allow selecting a default interface that is running at a different speed on different hosts. Primarily this is to support selecting the ib0 interface by default when the launch node has a slower ib0 interface than the cluster hosts. Signed-off-by: Phil Henderson --- src/tests/ftest/util/environment_utils.py | 3 ++- src/tests/ftest/util/network_utils.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index 8223063a85e..e36d750500e 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -9,6 +9,7 @@ from ClusterShell.NodeSet import NodeSet # pylint: disable=import-error,no-name-in-module +from util.host_utils import get_local_host from util.network_utils import (PROVIDER_ALIAS, SUPPORTED_PROVIDERS, NetworkException, get_common_provider, get_fastest_interface) from util.run_utils import run_remote @@ -326,7 +327,7 @@ def _default_interface(self, logger, hosts): # Find all the /sys/class/net interfaces on the launch node (excluding lo) logger.debug("Detecting network devices - D_INTERFACE not set") try: - interface = get_fastest_interface(logger, hosts) + interface = get_fastest_interface(logger, hosts | get_local_host()) except NetworkException as error: raise TestEnvironmentException("Error obtaining a default interface!") from error return interface diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index 52ba2420964..e3802364d8f 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -405,11 +405,12 @@ def get_fastest_interface(logger, hosts, verbose=True): for interface in common_interfaces: detected_speeds = get_interface_speeds(logger, hosts, interface, verbose) speed_list = [] + speed_hosts = NodeSet() for speed, node_set in detected_speeds.items(): - if node_set == hosts: - # Only include detected homogeneous interface speeds - speed_list.append(speed) - if speed_list: + speed_list.append(speed) + 
speed_hosts.add(node_set) + if speed_list and speed_hosts == hosts: + # Only include interface speeds if a speed is detected on all the hosts interface_speeds[interface] = min(speed_list) logger.info("Active network interface speeds on %s:", hosts) From 8a8c7d4ceb24c66330f4623d446f3349111c3f2b Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Fri, 6 Sep 2024 23:47:29 +0800 Subject: [PATCH 08/12] DAOS-16467 rebuild: add DAOS_POOL_RF ENV for massive failure case (#15057) * DAOS-16467 rebuild: add DAOS_PW_RF ENV for massive failure case Allow user to set DAOS_PW_RF as pw_rf (pool wise RF). If SWIM detected engine failure is going to break pw_rf, don't change pool map, also don't trigger rebuild. With critical log message to ask administrator to bring back those engines in top priority (just "system start --ranks=xxx", need not to reintegrate those engines). a few functions renamed to avoid confuse - pool_map_find_nodes() -> pool_map_find_ranks() pool_map_find_node_by_rank() -> pool_map_find_dom_by_rank() pool_map_node_nr() -> pool_map_rank_nr() Signed-off-by: Xuezhao Liu --- docs/admin/env_variables.md | 1 + docs/admin/pool_operations.md | 24 +++++ src/chk/chk_engine.c | 4 +- src/common/pool_map.c | 18 ++-- src/container/cli.c | 2 +- src/container/srv_container.c | 15 ++- src/dtx/dtx_coll.c | 14 +-- src/include/daos/pool_map.h | 8 +- src/include/daos_prop.h | 11 ++- src/object/cli_coll.c | 2 +- src/object/srv_coll.c | 2 +- src/pool/cli.c | 4 +- src/pool/rpc.h | 10 ++ src/pool/srv.c | 16 ++- src/pool/srv_internal.h | 2 + src/pool/srv_pool.c | 102 +++++++++++++++++--- src/pool/srv_pool_map.c | 2 +- src/pool/srv_util.c | 14 +-- src/rebuild/srv.c | 18 ++-- src/tests/ftest/util/server_utils_params.py | 1 + 20 files changed, 196 insertions(+), 74 deletions(-) diff --git a/docs/admin/env_variables.md b/docs/admin/env_variables.md index 2f5c2053683..060c3790d57 100644 --- a/docs/admin/env_variables.md +++ b/docs/admin/env_variables.md @@ -53,6 +53,7 @@ Environment variables in this section only apply to the server side. |DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.| |DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.| |DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.| +|DAOS\_POOL\_RF|Redundancy factor for the pool. The valid range is [1, 4]. The default value is 2.| ## Server and Client environment variables diff --git a/docs/admin/pool_operations.md b/docs/admin/pool_operations.md index 388a81d8700..36907a2e31f 100644 --- a/docs/admin/pool_operations.md +++ b/docs/admin/pool_operations.md @@ -916,6 +916,30 @@ and possibly repair a pmemobj file. As discussed in the previous section, the rebuild status can be consulted via the pool query and will be expanded with more information. +## Pool Redundancy Factor + +If the DAOS system experiences cascading failures, where the number of failed +fault domains exceeds a pool's redundancy factor, there could be unrecoverable +errors and applications could suffer from data loss. This can happen in cases +of power or network outages and would cause node/engine failures. In most cases +those failures can be recovered and DAOS engines can be restarted and the system +can function again. + +Administrator can set the default pool redundancy factor by environment variable +"DAOS_POOL_RF" in the server yaml file. 
If SWIM detects and reports an engine is +dead and the number of failed fault domain exceeds or is going to exceed the pool +redundancy factor, it will not change pool map immediately. Instead, it will give +critical log message: +intolerable unavailability: engine rank x +In this case, the system administrator should check and try to recover those +failed engines and bring them back with: +dmg system start --ranks=x +one by one. A reintegrate call is not needed. + +For true unrecoverable failures, the administrator can still exclude engines. +However, data loss is expected as the number of unrecoverable failures exceeds +the pool redundancy factor. + ## Recovering Container Ownership Typically users are expected to manage their containers. However, in the event diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index 9113ca22531..56e6da3ad9b 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -668,7 +668,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c int rc = 0; bool unknown; - dom = pool_map_find_node_by_rank(map, mbs->cpm_rank); + dom = pool_map_find_dom_by_rank(map, mbs->cpm_rank); if (dom == NULL) { D_ASSERT(mbs->cpm_rank != dss_self_rank()); @@ -777,7 +777,7 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map) int j; bool down; - rank_nr = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + rank_nr = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (rank_nr <= 0) D_GOTO(out, rc = rank_nr); diff --git a/src/common/pool_map.c b/src/common/pool_map.c index 7d7b38adb6c..1712f398dcb 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -1573,7 +1573,7 @@ add_domain_tree_to_pool_buf(struct pool_map *map, struct pool_buf *map_buf, if (map) { struct pool_domain *found_dom; - found_dom = pool_map_find_node_by_rank(map, rank); + found_dom = pool_map_find_dom_by_rank(map, rank); if (found_dom) { if (found_dom->do_comp.co_status == PO_COMP_ST_NEW) found_new_dom = true; @@ -2038,7 +2038,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, } /** - * Find all nodes in the pool map. + * Find all ranks in the pool map. * * \param map [IN] pool map to search. * \param id [IN] id to search. @@ -2048,7 +2048,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, * 0 if none. */ int -pool_map_find_nodes(struct pool_map *map, uint32_t id, +pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp) { return pool_map_find_domain(map, PO_COMP_TP_RANK, id, @@ -2102,14 +2102,14 @@ pool_map_find_target(struct pool_map *map, uint32_t id, * \return domain found by rank. 
*/ struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank) +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank) { struct pool_domain *doms; struct pool_domain *found = NULL; int doms_cnt; int i; - doms_cnt = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); if (doms_cnt <= 0) return NULL; @@ -2150,7 +2150,7 @@ pool_map_find_targets_on_ranks(struct pool_map *map, d_rank_list_t *rank_list, for (i = 0; i < rank_list->rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank_list->rl_ranks[i]); + dom = pool_map_find_dom_by_rank(map, rank_list->rl_ranks[i]); if (dom == NULL) { pool_target_id_list_free(tgts); return 0; @@ -2191,7 +2191,7 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank, { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2867,7 +2867,7 @@ pool_map_find_by_rank_status(struct pool_map *map, *tgt_ppp = NULL; *tgt_cnt = 0; - dom = pool_map_find_node_by_rank(map, rank); + dom = pool_map_find_dom_by_rank(map, rank); if (dom == NULL) return 0; @@ -2902,7 +2902,7 @@ pool_map_get_ranks(uuid_t pool_uuid, struct pool_map *map, bool get_enabled, d_r struct pool_domain *domains = NULL; d_rank_list_t *ranklist = NULL; - nnodes_tot = pool_map_find_nodes(map, PO_COMP_ID_ALL, &domains); + nnodes_tot = pool_map_find_ranks(map, PO_COMP_ID_ALL, &domains); for (i = 0; i < nnodes_tot; i++) { if (pool_map_node_status_match(&domains[i], ENABLED)) nnodes_enabled++; diff --git a/src/container/cli.c b/src/container/cli.c index 590f689333b..cd43667a2a4 100644 --- a/src/container/cli.c +++ b/src/container/cli.c @@ -3386,7 +3386,7 @@ dc_cont_node_id2ptr(daos_handle_t coh, uint32_t node_id, pool = dc_hdl2pool(dc->dc_pool_hdl); D_ASSERT(pool != NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - n = pool_map_find_nodes(pool->dp_map, node_id, dom); + n = pool_map_find_ranks(pool->dp_map, node_id, dom); D_RWLOCK_UNLOCK(&pool->dp_map_lock); dc_pool_put(pool); dc_cont_put(dc); diff --git a/src/container/srv_container.c b/src/container/srv_container.c index 9071f8f731c..372da43afe4 100644 --- a/src/container/srv_container.c +++ b/src/container/srv_container.c @@ -1667,7 +1667,7 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, { struct cont_ec_agg *ec_agg = NULL; struct pool_domain *doms; - int node_nr; + int rank_nr; int rc = 0; int i; @@ -1676,19 +1676,18 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid, return -DER_NOMEM; D_ASSERT(cont_svc->cs_pool->sp_map != NULL); - node_nr = pool_map_find_nodes(cont_svc->cs_pool->sp_map, - PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(cont_svc->cs_pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(ec_agg->ea_server_ephs, node_nr); + D_ALLOC_ARRAY(ec_agg->ea_server_ephs, rank_nr); if (ec_agg->ea_server_ephs == NULL) D_GOTO(out, rc = -DER_NOMEM); uuid_copy(ec_agg->ea_cont_uuid, cont_uuid); - ec_agg->ea_servers_num = node_nr; + ec_agg->ea_servers_num = rank_nr; ec_agg->ea_current_eph = 0; - for (i = 0; i < node_nr; i++) { + for (i = 0; i < rank_nr; i++) { ec_agg->ea_server_ephs[i].rank = doms[i].do_comp.co_rank; ec_agg->ea_server_ephs[i].eph = 0; } diff --git a/src/dtx/dtx_coll.c b/src/dtx/dtx_coll.c index 9623dce4917..863307e9a7f 100644 --- a/src/dtx/dtx_coll.c +++ b/src/dtx/dtx_coll.c @@ -112,7 
+112,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt struct dtx_coll_target *dct; struct dtx_coll_entry *dce = NULL; struct daos_obj_md md = { 0 }; - uint32_t node_nr; + uint32_t rank_nr; d_rank_t my_rank = dss_self_rank(); d_rank_t max_rank = 0; int rc = 0; @@ -192,19 +192,19 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt } } - node_nr = pool_map_node_nr(map->pl_poolmap); - if (unlikely(node_nr == 1)) + rank_nr = pool_map_rank_nr(map->pl_poolmap); + if (unlikely(rank_nr == 1)) D_GOTO(out, rc = 0); - dce->dce_ranks = d_rank_list_alloc(node_nr - 1); + dce->dce_ranks = d_rank_list_alloc(rank_nr - 1); if (dce->dce_ranks == NULL) D_GOTO(out, rc = -DER_NOMEM); - D_ALLOC_ARRAY(dce->dce_hints, node_nr); + D_ALLOC_ARRAY(dce->dce_hints, rank_nr); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) dce->dce_hints[i] = (uint8_t)(-1); md.omd_id = oid.id_pub; @@ -220,7 +220,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt goto out; } - for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) { + for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) { if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1) continue; diff --git a/src/include/daos/pool_map.h b/src/include/daos/pool_map.h index 0df39f0e510..95695d2b027 100644 --- a/src/include/daos/pool_map.h +++ b/src/include/daos/pool_map.h @@ -281,7 +281,7 @@ int pool_map_find_target(struct pool_map *map, uint32_t id, struct pool_target **target_pp); int pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id, struct pool_domain **domain_pp); -int pool_map_find_nodes(struct pool_map *map, uint32_t id, +int pool_map_find_ranks(struct pool_map *map, uint32_t id, struct pool_domain **domain_pp); int pool_map_find_tgts_by_state(struct pool_map *map, pool_comp_state_t match_states, @@ -311,7 +311,7 @@ bool pool_map_node_status_match(struct pool_domain *dom, unsigned int status); struct pool_domain * -pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank); +pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank); int pool_map_find_by_rank_status(struct pool_map *map, struct pool_target ***tgt_ppp, @@ -339,9 +339,9 @@ pool_map_target_nr(struct pool_map *map) } static inline unsigned int -pool_map_node_nr(struct pool_map *map) +pool_map_rank_nr(struct pool_map *map) { - return pool_map_find_nodes(map, PO_COMP_ID_ALL, NULL); + return pool_map_find_ranks(map, PO_COMP_ID_ALL, NULL); } /* diff --git a/src/include/daos_prop.h b/src/include/daos_prop.h index c6ca94f84c1..3b7216efd0e 100644 --- a/src/include/daos_prop.h +++ b/src/include/daos_prop.h @@ -464,11 +464,12 @@ enum { /** container redundancy factor */ enum { - DAOS_PROP_CO_REDUN_RF0, - DAOS_PROP_CO_REDUN_RF1, - DAOS_PROP_CO_REDUN_RF2, - DAOS_PROP_CO_REDUN_RF3, - DAOS_PROP_CO_REDUN_RF4, + DAOS_PROP_CO_REDUN_RF0 = 0, + DAOS_PROP_CO_REDUN_RF1 = 1, + DAOS_PROP_CO_REDUN_RF2 = 2, + DAOS_PROP_CO_REDUN_RF3 = 3, + DAOS_PROP_CO_REDUN_RF4 = 4, + DAOS_RF_MAX = 4, }; /** diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index 12ba634813a..e05abadf3cf 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -139,7 +139,7 @@ obj_coll_oper_args_init(struct coll_oper_args *coa, struct dc_object *obj, bool D_ASSERT(coa->coa_dcts == NULL); D_RWLOCK_RDLOCK(&pool->dp_map_lock); - pool_ranks = pool_map_node_nr(pool->dp_map); + pool_ranks = 
pool_map_rank_nr(pool->dp_map); D_RWLOCK_UNLOCK(&pool->dp_map_lock); D_RWLOCK_RDLOCK(&obj->cob_lock); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index a63a11d574b..2a152b47bd6 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -291,7 +291,7 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct D_GOTO(out, rc = -DER_INVAL); } - size = pool_map_node_nr(map->pl_poolmap); + size = pool_map_rank_nr(map->pl_poolmap); D_ALLOC_ARRAY(dce->dce_hints, size); if (dce->dce_hints == NULL) D_GOTO(out, rc = -DER_NOMEM); diff --git a/src/pool/cli.c b/src/pool/cli.c index 5345017f824..85fa718aa1c 100644 --- a/src/pool/cli.c +++ b/src/pool/cli.c @@ -503,7 +503,7 @@ update_rsvc_client(struct dc_pool *pool) { struct subtract_rsvc_rank_arg arg; - arg.srra_nodes_len = pool_map_find_nodes(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); + arg.srra_nodes_len = pool_map_find_ranks(pool->dp_map, PO_COMP_ID_ALL, &arg.srra_nodes); /* There must be at least one rank. */ D_ASSERTF(arg.srra_nodes_len > 0, "%d > 0\n", arg.srra_nodes_len); @@ -2016,7 +2016,7 @@ choose_map_refresh_rank(struct map_refresh_arg *arg) if (arg->mra_n <= 0) return CRT_NO_RANK; - n = pool_map_find_nodes(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); + n = pool_map_find_ranks(arg->mra_pool->dp_map, PO_COMP_ID_ALL, &nodes); /* There must be at least one rank. */ D_ASSERTF(n > 0, "%d\n", n); diff --git a/src/pool/rpc.h b/src/pool/rpc.h index cf763b896dc..cfddcc48931 100644 --- a/src/pool/rpc.h +++ b/src/pool/rpc.h @@ -147,6 +147,16 @@ CRT_RPC_DECLARE(pool_op, DAOS_ISEQ_POOL_OP, DAOS_OSEQ_POOL_OP) CRT_RPC_DECLARE(pool_create, DAOS_ISEQ_POOL_CREATE, DAOS_OSEQ_POOL_CREATE) /* clang-format on */ + +/* the source of pool map update operation */ +enum map_update_source { + MUS_SWIM = 0, + /* May need to differentiate from administrator/csum scrubber/nvme healthy monitor later. + * Now all non-swim cases fall to DMG category. 
+ */ + MUS_DMG, +}; + enum map_update_opc { MAP_EXCLUDE = 0, MAP_DRAIN, diff --git a/src/pool/srv.c b/src/pool/srv.c index 2a45f4dec05..7e5548e8508 100644 --- a/src/pool/srv.c +++ b/src/pool/srv.c @@ -19,7 +19,12 @@ #include "rpc.h" #include "srv_internal.h" #include "srv_layout.h" -bool ec_agg_disabled; + +bool ec_agg_disabled; +uint32_t pw_rf; /* pool wise RF */ +#define PW_RF_DEFAULT (2) +#define PW_RF_MIN (1) +#define PW_RF_MAX (4) static int init(void) @@ -47,6 +52,15 @@ init(void) if (unlikely(ec_agg_disabled)) D_WARN("EC aggregation is disabled.\n"); + pw_rf = PW_RF_DEFAULT; + d_getenv_uint32_t("DAOS_POOL_RF", &pw_rf); + if (pw_rf < PW_RF_MIN || pw_rf > PW_RF_MAX) { + D_INFO("pw_rf %d is out of range [%d, %d], take default %d\n", + pw_rf, PW_RF_MIN, PW_RF_MAX, PW_RF_DEFAULT); + pw_rf = PW_RF_DEFAULT; + } + D_INFO("pool wise RF %d\n", pw_rf); + ds_pool_rsvc_class_register(); bio_register_ract_ops(&nvme_reaction_ops); diff --git a/src/pool/srv_internal.h b/src/pool/srv_internal.h index c09d2ffcaea..8f864c8c11a 100644 --- a/src/pool/srv_internal.h +++ b/src/pool/srv_internal.h @@ -16,6 +16,8 @@ #include #include +extern uint32_t pw_rf; + /** * Global pool metrics */ diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 54e29767347..667e4bc6ed6 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1355,11 +1355,11 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) if (event->psv_rank == dss_self_rank() && event->psv_src == CRT_EVS_GRPMOD && event->psv_type == CRT_EVT_DEAD) { - D_DEBUG(DB_MGMT, "ignore exclusion of self\n"); + D_DEBUG(DB_MD, "ignore exclusion of self\n"); goto out; } - D_DEBUG(DB_MD, DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), + D_INFO(DF_UUID": handling event: "DF_PS_EVENT"\n", DP_UUID(svc->ps_uuid), DP_PS_EVENT(event)); if (event->psv_src == CRT_EVS_SWIM && event->psv_type == CRT_EVT_ALIVE) { @@ -1381,8 +1381,8 @@ handle_event(struct pool_svc *svc, struct pool_svc_event *event) * and does not have a copy of the pool map. */ ds_rsvc_request_map_dist(&svc->ps_rsvc); - D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", DP_UUID(svc->ps_uuid), - event->psv_rank); + D_DEBUG(DB_MD, DF_UUID": requested map dist for rank %u\n", + DP_UUID(svc->ps_uuid), event->psv_rank); } else if (event->psv_type == CRT_EVT_DEAD) { rc = pool_svc_exclude_rank(svc, event->psv_rank); if (rc != 0) @@ -1809,7 +1809,7 @@ pool_svc_check_node_status(struct pool_svc *svc) D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); ABT_rwlock_rdlock(svc->ps_pool->sp_lock); - doms_cnt = pool_map_find_nodes(svc->ps_pool->sp_map, PO_COMP_ID_ALL, + doms_cnt = pool_map_find_ranks(svc->ps_pool->sp_map, PO_COMP_ID_ALL, &doms); D_ASSERT(doms_cnt >= 0); for (i = 0; i < doms_cnt; i++) { @@ -6500,6 +6500,49 @@ pool_svc_schedule_reconf(struct pool_svc *svc, struct pool_map *map, uint32_t ma return 0; } +static int +pool_map_crit_prompt(struct pool_svc *svc, struct pool_map *map, d_rank_t rank) +{ + crt_group_t *primary_grp; + struct pool_domain *doms; + int doms_cnt; + int i; + int rc = 0; + + D_DEBUG(DB_MD, DF_UUID": checking node status\n", DP_UUID(svc->ps_uuid)); + doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms); + D_ASSERT(doms_cnt >= 0); + primary_grp = crt_group_lookup(NULL); + D_ASSERT(primary_grp != NULL); + + D_CRIT("!!! Please try to recover these engines in top priority -\n"); + D_CRIT("!!! Please refer \"Pool-Wise Redundancy Factor\" section in pool_operations.md\n"); + D_CRIT("!!! 
pool "DF_UUID": intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), rank); + for (i = 0; i < doms_cnt; i++) { + struct swim_member_state state; + + if (!(doms[i].do_comp.co_status & PO_COMP_ST_UPIN) || + (doms[i].do_comp.co_rank == rank)) + continue; + + rc = crt_rank_state_get(primary_grp, doms[i].do_comp.co_rank, &state); + if (rc != 0 && rc != -DER_NONEXIST) { + D_ERROR("failed to get status of rank %u: %d\n", + doms[i].do_comp.co_rank, rc); + break; + } + + D_DEBUG(DB_MD, "rank/state %d/%d\n", doms[i].do_comp.co_rank, + rc == -DER_NONEXIST ? -1 : state.sms_status); + if (rc == -DER_NONEXIST || state.sms_status == SWIM_MEMBER_DEAD) + D_CRIT("!!! pool "DF_UUID" : intolerable unavailability: engine rank %u\n", + DP_UUID(svc->ps_uuid), doms[i].do_comp.co_rank); + } + + return rc; +} + /* * Perform an update to the pool map of \a svc. * @@ -6532,7 +6575,8 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, struct pool_target_addr_list *tgt_addrs, struct rsvc_hint *hint, bool *p_updated, uint32_t *map_version_p, uint32_t *tgt_map_ver, - struct pool_target_addr_list *inval_tgt_addrs) + struct pool_target_addr_list *inval_tgt_addrs, + enum map_update_source src) { struct rdb_tx tx; struct pool_map *map; @@ -6628,7 +6672,7 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, * If the map modification affects myself, leave it to a new PS leader * if there's another PS replica, or reject it. */ - node = pool_map_find_node_by_rank(map, dss_self_rank()); + node = pool_map_find_dom_by_rank(map, dss_self_rank()); if (node == NULL || !(node->do_comp.co_status & DC_POOL_SVC_MAP_STATES)) { d_rank_list_t *replicas; @@ -6653,6 +6697,33 @@ pool_svc_update_map_internal(struct pool_svc *svc, unsigned int opc, goto out_map; } + /* For SWIM exclude, don't change pool map if the pw_rf is broken or is going to be broken, + * with CRIT log message to ask administrator to bring back the engine. + */ + if (src == MUS_SWIM && opc == MAP_EXCLUDE) { + d_rank_t rank; + int failed_cnt; + + rc = pool_map_update_failed_cnt(map); + if (rc != 0) { + DL_ERROR(rc, DF_UUID": pool_map_update_failed_cnt failed.", + DP_UUID(svc->ps_uuid)); + goto out_map; + } + + D_ASSERT(tgt_addrs->pta_number == 1); + rank = tgt_addrs->pta_addrs->pta_rank; + failed_cnt = pool_map_get_failed_cnt(map, PO_COMP_TP_NODE); + D_INFO(DF_UUID": SWIM exclude rank %d, failed NODE %d\n", + DP_UUID(svc->ps_uuid), rank, failed_cnt); + if (failed_cnt > pw_rf) { + D_CRIT(DF_UUID": exclude rank %d will break pw_rf %d, failed_cnt %d\n", + DP_UUID(svc->ps_uuid), rank, pw_rf, failed_cnt); + rc = pool_map_crit_prompt(svc, map, rank); + goto out_map; + } + } + /* Write the new pool map. 
*/ rc = pool_buf_extract(map, &map_buf); if (rc != 0) @@ -6809,7 +6880,7 @@ pool_update_map_internal(uuid_t pool_uuid, unsigned int opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, NULL, 0, NULL, tgts, tgt_addrs, hint, p_updated, map_version_p, tgt_map_ver, - inval_tgt_addrs); + inval_tgt_addrs, MUS_DMG); pool_svc_put_leader(svc); return rc; @@ -6859,8 +6930,8 @@ static int pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, d_rank_list_t *extend_rank_list, uint32_t *extend_domains, uint32_t extend_domains_nr, struct pool_target_addr_list *list, - struct pool_target_addr_list *inval_list_out, - uint32_t *map_version, struct rsvc_hint *hint) + struct pool_target_addr_list *inval_list_out, uint32_t *map_version, + struct rsvc_hint *hint, enum map_update_source src) { struct pool_target_id_list target_list = { 0 }; daos_prop_t prop = { 0 }; @@ -6875,7 +6946,7 @@ pool_svc_update_map(struct pool_svc *svc, crt_opcode_t opc, bool exclude_rank, rc = pool_svc_update_map_internal(svc, opc, exclude_rank, extend_rank_list, extend_domains_nr, extend_domains, &target_list, list, hint, &updated, - map_version, &tgt_map_ver, inval_list_out); + map_version, &tgt_map_ver, inval_list_out, src); if (rc) D_GOTO(out, rc); @@ -6962,10 +7033,9 @@ ds_pool_extend_handler(crt_rpc_t *rpc) goto out; rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), - false /* exclude_rank */, - &rank_list, domains, ndomains, + false /* exclude_rank */, &rank_list, domains, ndomains, NULL, NULL, &out->peo_op.po_map_version, - &out->peo_op.po_hint); + &out->peo_op.po_hint, MUS_DMG); pool_svc_put_leader(svc); out: @@ -7067,7 +7137,7 @@ ds_pool_update_handler(crt_rpc_t *rpc, int handler_version) rc = pool_svc_update_map(svc, pool_opc_2map_opc(opc_get(rpc->cr_opc)), false /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &out->pto_op.po_map_version, - &out->pto_op.po_hint); + &out->pto_op.po_hint, MUS_DMG); if (rc != 0) goto out_svc; @@ -7112,7 +7182,7 @@ pool_svc_exclude_rank(struct pool_svc *svc, d_rank_t rank) rc = pool_svc_update_map(svc, pool_opc_2map_opc(POOL_EXCLUDE), true /* exclude_rank */, NULL, NULL, 0, &list, &inval_list_out, &map_version, - NULL /* hint */); + NULL /* hint */, MUS_SWIM); D_DEBUG(DB_MD, "Exclude pool "DF_UUID"/%u rank %u: rc %d\n", DP_UUID(svc->ps_uuid), map_version, rank, rc); diff --git a/src/pool/srv_pool_map.c b/src/pool/srv_pool_map.c index 1cb5632598f..9793df24f01 100644 --- a/src/pool/srv_pool_map.c +++ b/src/pool/srv_pool_map.c @@ -378,7 +378,7 @@ ds_pool_map_tgts_update(struct pool_map *map, struct pool_target_id_list *tgts, return -DER_NONEXIST; } - dom = pool_map_find_node_by_rank(map, target->ta_comp.co_rank); + dom = pool_map_find_dom_by_rank(map, target->ta_comp.co_rank); if (dom == NULL) { D_ERROR("Got request to change nonexistent rank %u" " in map %p\n", diff --git a/src/pool/srv_util.c b/src/pool/srv_util.c index e39072568e1..29f012d5844 100644 --- a/src/pool/srv_util.c +++ b/src/pool/srv_util.c @@ -21,19 +21,19 @@ int map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *ranks) { struct pool_domain *domains = NULL; - int nnodes; + int nranks; int n = 0; int i; d_rank_t *rs; - nnodes = pool_map_find_nodes((struct pool_map *)map, + nranks = pool_map_find_ranks((struct pool_map *)map, PO_COMP_ID_ALL, &domains); - if (nnodes == 0) { + if (nranks == 0) { D_ERROR("no nodes in pool map\n"); return -DER_IO; } - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & 
domains[i].do_comp.co_status) n++; } @@ -52,7 +52,7 @@ map_ranks_init(const struct pool_map *map, unsigned int status, d_rank_list_t *r ranks->rl_ranks = rs; n = 0; - for (i = 0; i < nnodes; i++) { + for (i = 0; i < nranks; i++) { if (status & domains[i].do_comp.co_status) { D_ASSERT(n < ranks->rl_nr); ranks->rl_ranks[n] = domains[i].do_comp.co_rank; @@ -85,7 +85,7 @@ ds_pool_map_rank_up(struct pool_map *map, d_rank_t rank) struct pool_domain *node; int rc; - rc = pool_map_find_nodes(map, rank, &node); + rc = pool_map_find_ranks(map, rank, &node); if (rc == 0) return false; D_ASSERTF(rc == 1, "%d\n", rc); @@ -921,7 +921,7 @@ testu_create_pool_map(d_rank_t *ranks, int n_ranks, d_rank_t *down_ranks, int n_ for (i = 0; i < n_down_ranks; i++) { struct pool_domain *d; - d = pool_map_find_node_by_rank(map, down_ranks[i]); + d = pool_map_find_dom_by_rank(map, down_ranks[i]); D_ASSERT(d != NULL); d->do_comp.co_status = PO_COMP_ST_DOWN; } diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index f40ea2d2fe3..b1f722b8254 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -654,7 +654,7 @@ rebuild_leader_status_check(struct ds_pool *pool, uint32_t op, for (i = 0; i < excluded.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, excluded.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, excluded.rl_ranks[i]); D_ASSERT(dom != NULL); if (rgt->rgt_opc == RB_OP_REBUILD) { @@ -767,7 +767,7 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t uint32_t opc, struct rebuild_global_pool_tracker **p_rgt) { struct rebuild_global_pool_tracker *rgt; - int node_nr; + int rank_nr; struct pool_domain *doms; int i; int rc = 0; @@ -777,11 +777,11 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t return -DER_NOMEM; D_INIT_LIST_HEAD(&rgt->rgt_list); - node_nr = pool_map_find_nodes(pool->sp_map, PO_COMP_ID_ALL, &doms); - if (node_nr < 0) - D_GOTO(out, rc = node_nr); + rank_nr = pool_map_find_ranks(pool->sp_map, PO_COMP_ID_ALL, &doms); + if (rank_nr < 0) + D_GOTO(out, rc = rank_nr); - D_ALLOC_ARRAY(rgt->rgt_servers, node_nr); + D_ALLOC_ARRAY(rgt->rgt_servers, rank_nr); if (rgt->rgt_servers == NULL) D_GOTO(out, rc = -DER_NOMEM); @@ -793,9 +793,9 @@ rebuild_global_pool_tracker_create(struct ds_pool *pool, uint32_t ver, uint32_t if (rc != ABT_SUCCESS) D_GOTO(out, rc = dss_abterr2der(rc)); - for (i = 0; i < node_nr; i++) + for (i = 0; i < rank_nr; i++) rgt->rgt_servers[i].rank = doms[i].do_comp.co_rank; - rgt->rgt_servers_number = node_nr; + rgt->rgt_servers_number = rank_nr; uuid_copy(rgt->rgt_pool_uuid, pool->sp_uuid); rgt->rgt_rebuild_ver = ver; @@ -964,7 +964,7 @@ rebuild_scan_broadcast(struct ds_pool *pool, struct rebuild_global_pool_tracker for (i = 0; i < up_ranks.rl_nr; i++) { struct pool_domain *dom; - dom = pool_map_find_node_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); + dom = pool_map_find_dom_by_rank(pool->sp_map, up_ranks.rl_ranks[i]); D_ASSERT(dom != NULL); D_DEBUG(DB_REBUILD, "rank %u ver %u rebuild %u\n", up_ranks.rl_ranks[i], dom->do_comp.co_in_ver, rgt->rgt_rebuild_ver); diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index 248617c1b36..440ffe68f82 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -434,6 +434,7 @@ class EngineYamlParameters(YamlParameters): REQUIRED_ENV_VARS = { "common": [ "D_LOG_FILE_APPEND_PID=1", + "DAOS_POOL_RF=4", "COVFILE=/tmp/test.cov"], "ofi+tcp": [], 
"ofi+tcp;ofi_rxm": [], From ee9a06d359fd31509c5b1e1870cdb21398bcde6a Mon Sep 17 00:00:00 2001 From: Niu Yawei Date: Sat, 7 Sep 2024 03:42:00 +0800 Subject: [PATCH 09/12] DAOS-16486 object: return proper error on stale pool map (#15064) (#15084) Client with stale pool map may try to send RPC to a DOWN target, if the target was brought DOWN due to faulty NVMe device, the ds_pool_child could have been stopped on the NVMe faulty reaction, We'd ensure proper error code is returned for such case. Signed-off-by: Niu Yawei --- src/dtx/tests/srv_mock.c | 7 +++++++ src/include/daos_srv/pool.h | 2 ++ src/object/srv_obj.c | 31 ++++++++++++++++++++++++++++++- src/pool/srv_target.c | 15 +++++++++++++++ 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/dtx/tests/srv_mock.c b/src/dtx/tests/srv_mock.c index 245b3b11513..3d4ac70d773 100644 --- a/src/dtx/tests/srv_mock.c +++ b/src/dtx/tests/srv_mock.c @@ -71,6 +71,13 @@ ds_pool_child_put(struct ds_pool_child *child) assert_true(false); } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + assert_true(false); + return NULL; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) { diff --git a/src/include/daos_srv/pool.h b/src/include/daos_srv/pool.h index 07ca3c0dbc1..6cbe3873f0a 100644 --- a/src/include/daos_srv/pool.h +++ b/src/include/daos_srv/pool.h @@ -249,6 +249,8 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t uint64_t cli_time, bool dup_op, int rc_in, struct ds_pool_svc_op_val *op_valp); /* Find ds_pool_child in cache, hold one reference */ +struct ds_pool_child *ds_pool_child_find(const uuid_t uuid); +/* Find ds_pool_child in STARTING or STARTED state, hold one reference */ struct ds_pool_child *ds_pool_child_lookup(const uuid_t uuid); /* Put the reference held by ds_pool_child_lookup() */ void ds_pool_child_put(struct ds_pool_child *child); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index febd3d36ead..a51682b4785 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2170,8 +2170,37 @@ obj_ioc_begin_lite(uint32_t rpc_map_ver, uuid_t pool_uuid, int rc; rc = obj_ioc_init(pool_uuid, coh_uuid, cont_uuid, rpc, ioc); - if (rc) + if (rc) { + DL_ERROR(rc, "Failed to initialize object I/O context."); + + /* + * Client with stale pool map may try to send RPC to a DOWN target, if the + * target was brought DOWN due to faulty NVMe device, the ds_pool_child could + * have been stopped on the NVMe faulty reaction, then above obj_io_init() + * will fail with -DER_NO_HDL. + * + * We'd ensure proper error code is returned for such case. 
+ */ + poc = ds_pool_child_find(pool_uuid); + if (poc == NULL) { + D_ERROR("Failed to find pool:"DF_UUID"\n", DP_UUID(pool_uuid)); + return rc; + } + + if (rpc_map_ver < poc->spc_pool->sp_map_version) { + D_ERROR("Stale pool map version %u < %u from client.\n", + rpc_map_ver, poc->spc_pool->sp_map_version); + + /* Restart the DTX if using stale pool map */ + if (opc_get(rpc->cr_opc) == DAOS_OBJ_RPC_CPD) + rc = -DER_TX_RESTART; + else + rc = -DER_STALE; + } + + ds_pool_child_put(poc); return rc; + } poc = ioc->ioc_coc->sc_pool; D_ASSERT(poc != NULL); diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index 0b195216cf5..cfa837e8b2a 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -87,6 +87,21 @@ pool_child_lookup_noref(const uuid_t uuid) return NULL; } +struct ds_pool_child * +ds_pool_child_find(const uuid_t uuid) +{ + struct ds_pool_child *child; + + child = pool_child_lookup_noref(uuid); + if (child == NULL) { + D_ERROR(DF_UUID": Pool child isn't found.\n", DP_UUID(uuid)); + return child; + } + + child->spc_ref++; + return child; +} + struct ds_pool_child * ds_pool_child_lookup(const uuid_t uuid) { From 70e43620ac6eb4b8465fd3e64cf591a7c002ae5e Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 7 Sep 2024 05:49:08 +0800 Subject: [PATCH 10/12] DAOS-16514 vos: fix coverity issue (#15083) (#15086) Fix Coverity issue 2555843 (explicit null dereference). Signed-off-by: Niu Yawei --- src/vos/vos_obj.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index fc0ff4fbe9c..c4f50a6705c 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -1700,7 +1700,8 @@ vos_obj_iter_prep(vos_iter_type_t type, vos_iter_param_t *param, return -DER_NOMEM; /* ip_hdl is dkey or akey tree open handle for vos_iterate_key() */ - if (!(param->ip_flags & VOS_IT_KEY_TREE)) { + if (param->ip_flags != VOS_IT_KEY_TREE) { + D_ASSERT(!(param->ip_flags & VOS_IT_KEY_TREE)); cont = vos_hdl2cont(param->ip_hdl); is_sysdb = cont->vc_pool->vp_sysdb; dth = vos_dth_get(is_sysdb); From 4e201a070dcf3d9124fe6cb30cbd57a6127e731b Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Mon, 9 Sep 2024 13:44:29 -0400 Subject: [PATCH 11/12] DAOS-16515 build: Tag 2.6.1 rc1 (#15103) Tag first release candidate for 2.6.1. 
Signed-off-by: Phil Henderson --- TAG | 2 +- VERSION | 2 +- debian/changelog | 6 ++++++ utils/rpms/daos.spec | 7 +++++-- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/TAG b/TAG index 3b4c8028c74..07c46f93195 100644 --- a/TAG +++ b/TAG @@ -1 +1 @@ -2.6.0-rc3 +2.6.1-rc1 diff --git a/VERSION b/VERSION index e70b4523ae7..6a6a3d8e35c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.0 +2.6.1 diff --git a/debian/changelog b/debian/changelog index fb08d568cbd..73df378df78 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +daos (2.6.1-1) unstable; urgency=medium + [ Phillip Henderson ] + * First release candidate for 2.6.1 + + -- Phillip Henderson Mon, 09 Sep 2024 08:46:00 -0500 + daos (2.6.0-5) unstable; urgency=medium [ Tom Nabarro ] * Add pciutils runtime dep for daos_server lspci call diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index eb85dab4eef..1ae790f8623 100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -14,8 +14,8 @@ %endif Name: daos -Version: 2.6.0 -Release: 5%{?relval}%{?dist} +Version: 2.6.1 +Release: 1%{?relval}%{?dist} Summary: DAOS Storage Engine License: BSD-2-Clause-Patent @@ -591,6 +591,9 @@ getent passwd daos_agent >/dev/null || useradd -s /sbin/nologin -r -g daos_agent # No files in a shim package %changelog +* Mon Sep 09 2024 Phillip Henderson 2.6.1-1 +- First release candidate for 2.6.1 + * Thu Aug 08 2024 Tom Nabarro 2.6.0-5 - Add pciutils runtime dep for daos_server lspci call - Add pciutils-devel build dep for pciutils CGO bindings From f2c36ad394b991ea7e94916b839387ac0c09915e Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Mon, 9 Sep 2024 22:02:17 +0000 Subject: [PATCH 12/12] Revert "DAOS-16271 mercury: Add patch to check ep for null in UCX key resolve. (#15077)" Not needed in our build (b/364929445). This reverts commit 3d9e2d0cafcb5f314135ba9e7a27dd541102f7c5. --- utils/build.config | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/build.config b/utils/build.config index 92c3bd673d3..f94d0e6a912 100644 --- a/utils/build.config +++ b/utils/build.config @@ -30,4 +30,3 @@ spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc7566497 ofi=https://github.com/ofiwg/libfabric/commit/d827c6484cc5bf67dfbe395890e258860c3f0979.diff mercury=https://raw.githubusercontent.com/daos-stack/mercury/857f1d5d2ca72d4c1b8d7be5e7fd26d6292b495f/na_ucx_am_send_retry.patch,https://github.com/mercury-hpc/mercury/commit/b8c26fd86281f3b0883c31bd2d0cb467a12b860d.diff,https://github.com/mercury-hpc/mercury/commit/a35589c3d1134d9c80640e78247e210162ac4a3c.diff,https://github.com/mercury-hpc/mercury/commit/fa4abbb6273d975b2ef17ac4e561fd4255d384db.diff fuse=https://github.com/libfuse/libfuse/commit/c9905341ea34ff9acbc11b3c53ba8bcea35eeed8.diff -mercury=https://raw.githubusercontent.com/daos-stack/mercury/481297621bafbbcac4cc6f8feab3f1b6f8b14b59/na_ucx_keyres_epchk.patch
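For reference, the stale-pool-map handling added by the DAOS-16486 patch above reduces to one small decision rule. The standalone C program below is a minimal sketch of that rule only; the error numbers, the CPD opcode constant, and the function name stale_map_rc() are invented stand-ins for illustration rather than the DAOS definitions, and the real server code operates on the ds_pool_child and RPC structures shown in the patch.

#include <stdio.h>

/*
 * Simplified stand-ins for the DAOS error numbers and the CPD opcode;
 * the real values come from the DAOS headers and differ from these.
 */
enum {
	SKETCH_ERR_STALE      = 1,   /* stands in for -DER_STALE */
	SKETCH_ERR_TX_RESTART = 2,   /* stands in for -DER_TX_RESTART */
	SKETCH_OPC_CPD        = 100, /* stands in for DAOS_OBJ_RPC_CPD */
};

/*
 * Decision rule sketched from the DAOS-16486 patch: when I/O context setup
 * has failed and the client's pool map version is behind the server's,
 * report staleness; a compound (CPD) RPC is told to restart its transaction
 * so the client retries with a refreshed pool map. If the client map is
 * current, the original initialization error is returned unchanged.
 */
static int
stale_map_rc(unsigned int rpc_map_ver, unsigned int srv_map_ver, int opc, int init_rc)
{
	if (rpc_map_ver >= srv_map_ver)
		return init_rc;

	return (opc == SKETCH_OPC_CPD) ? -SKETCH_ERR_TX_RESTART : -SKETCH_ERR_STALE;
}

int
main(void)
{
	/* Client map 7 is behind server map 9: a plain I/O RPC sees the stale error. */
	printf("update rc = %d\n", stale_map_rc(7, 9, 0, -5));
	/* The same staleness on a CPD RPC asks the client to restart the transaction. */
	printf("cpd rc    = %d\n", stale_map_rc(7, 9, SKETCH_OPC_CPD, -5));
	/* Client map is current: the original initialization error is preserved. */
	printf("other rc  = %d\n", stale_map_rc(9, 9, 0, -5));
	return 0;
}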