From e9bf5f3a193ba0542f670e67064b018da7157f12 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Wed, 25 Dec 2024 15:39:17 +0900 Subject: [PATCH] DAOS-16878 pool: Reduce unexpected DER_NO_SERVICEs It has been observed that pool_svc_step_up_cb may encounter a -DER_NOTLEADER and pass it to ds_pool_failed_add. This error is a replica error and may be transient; it doesn't indicate that the PS is unavailable. This patch addresses the observed scenario by replacing the ds_pool_failed_add call from pool_svc_step_up_cb with a special up-but-with-error mode for the PS, which can only serve requests by returning an error. - Add pool_svc.ps_error for indicating the special up-but-with-error mode. Check and return it in pool_svc_lookup_leader. Handle it specially in callers of pool_svc_lookup. - Use this new mode only for a conservative set of errors. Including an error by mistake is worse than missing an error. - Add pool UUIDs to a few log messages to make future debugging easier. The ds_pool_failed_add mechanism should be used for replica errors only. And, such errors should not immediately stop PS clients from trying other replicas. This issue is relatively tricky and will not be addressed by the current patch. Features: pool Skip-nlt: true Signed-off-by: Li Wei Required-githooks: true --- src/pool/srv_pool.c | 128 +++++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 38 deletions(-) diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 06a5f29e866..0dd04ae5de5 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -213,6 +214,7 @@ struct pool_svc { rdb_path_t ps_handles; /* pool handle KVS */ rdb_path_t ps_user; /* pool user attributes KVS */ rdb_path_t ps_ops; /* metadata ops KVS */ + int ps_error; /* in DB data (see pool_svc_lookup_leader) */ struct pool_svc_events ps_events; uint32_t ps_global_version; int ps_svc_rf; @@ -1950,8 +1952,11 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out, rc = ds_pool_svc_load(&tx, svc->ps_uuid, &svc->ps_root, &svc->ps_global_version, &map_buf, &map_version); - if (rc != 0) + if (rc != 0) { + if (rc == -DER_DF_INCOMPT) + svc->ps_error = rc; goto out_lock; + } rc = pool_prop_read(&tx, svc, DAOS_PO_QUERY_PROP_ALL, &prop); if (rc != 0) { @@ -2263,6 +2268,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) d_rank_t rank = dss_self_rank(); int rc; + D_ASSERTF(svc->ps_error == 0, "ps_error: " DF_RC "\n", DP_RC(svc->ps_error)); + /* * If this is the only voting replica, it may have become the leader * without doing any RPC. The primary group may have yet to be @@ -2328,9 +2335,9 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) } rc = ds_pool_iv_prop_update(svc->ps_pool, prop); - if (rc) { - D_ERROR("ds_pool_iv_prop_update failed: " DF_RC "\n", DP_RC(rc)); - D_GOTO(out, rc); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": ds_pool_iv_prop_update failed", DP_UUID(svc->ps_uuid)); + goto out; } if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) { @@ -2346,11 +2353,10 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid); } - rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, - cont_hdl_uuid); - if (rc) { - D_ERROR("ds_pool_iv_srv_hdl_update failed: " DF_RC "\n", DP_RC(rc)); - D_GOTO(out, rc); + rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid)); + goto out; } /* resume pool upgrade if needed */ @@ -2385,27 +2391,39 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) D_FREE(map_buf); if (prop != NULL) daos_prop_free(prop); - if (rc < 0) - ds_pool_failed_add(svc->ps_uuid, rc); - else if (rc == 0) - ds_pool_failed_remove(svc->ps_uuid); + if (svc->ps_error != 0) { + /* + * Step up with the error anyway, so that RPCs to the PS + * receive an error instead of timeouts. + */ + DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64 + " with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank, + svc->ps_rsvc.s_term, DP_RC(svc->ps_error)); + rc = 0; + } return rc; } static void pool_svc_step_down_cb(struct ds_rsvc *rsvc) { - struct pool_svc *svc = pool_svc_obj(rsvc); - d_rank_t rank = dss_self_rank(); - - pool_svc_step_down_metrics(svc); - fini_events(svc); - sched_cancel_and_wait(&svc->ps_reconf_sched); - sched_cancel_and_wait(&svc->ps_rfcheck_sched); - ds_cont_svc_step_down(svc->ps_cont_svc); + struct pool_svc *svc = pool_svc_obj(rsvc); + d_rank_t rank = dss_self_rank(); - DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n", - DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term); + if (svc->ps_error == 0) { + pool_svc_step_down_metrics(svc); + fini_events(svc); + sched_cancel_and_wait(&svc->ps_reconf_sched); + sched_cancel_and_wait(&svc->ps_rfcheck_sched); + ds_cont_svc_step_down(svc->ps_cont_svc); + DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n", + DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term); + } else { + DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64 + " with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank, + svc->ps_rsvc.s_term, DP_RC(svc->ps_error)); + svc->ps_error = 0; + } } static void @@ -2476,12 +2494,13 @@ ds_pool_rsvc_class_unregister(void) ds_rsvc_class_unregister(DS_RSVC_CLASS_POOL); } +/* Use pool_svc_lookup_leader instead. */ static int pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp) { struct ds_rsvc *rsvc; - d_iov_t id; - int rc; + d_iov_t id; + int rc; d_iov_set(&id, uuid, sizeof(uuid_t)); rc = ds_rsvc_lookup(DS_RSVC_CLASS_POOL, &id, &rsvc); @@ -2492,17 +2511,16 @@ pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp) } static int -pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, - struct rsvc_hint *hint) +pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, struct rsvc_hint *hint) { - struct ds_rsvc *rsvc; - d_iov_t id; - int rc; + struct ds_rsvc *rsvc; + d_iov_t id; + struct pool_svc *svc; + int rc; rc = ds_pool_failed_lookup(uuid); - if (rc) { - D_ERROR(DF_UUID": failed to start: "DF_RC"\n", - DP_UUID(uuid), DP_RC(rc)); + if (rc != 0) { + D_DEBUG(DB_MD, DF_UUID ": failed: " DF_RC "\n", DP_UUID(uuid), DP_RC(rc)); return -DER_NO_SERVICE; } @@ -2510,7 +2528,31 @@ pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, rc = ds_rsvc_lookup_leader(DS_RSVC_CLASS_POOL, &id, &rsvc, hint); if (rc != 0) return rc; - *svcp = pool_svc_obj(rsvc); + + /* + * The svc->ps_error field stores a persistent error, usually in the DB + * data, if any. For instance, "the layout of the DB data is + * incompatible with the software version". This mustn't be a replica + * error, because there may be a majorty of replicas working. We let the + * PS step up with this error so that it can serve all requests by + * returning the error. PS clients therefore get a quick error response + * instead of a timeout. + * + * Checking svc->ps_error here without confirming our leadership via + * rdb_raft_verify_leadership may cause some requests to get + * unnecessary errors, if there is a newer leader whose svc->ps_error + * is zero and is able to serve those requests. Such a state won't last + * much longer than an election timeout though, because we will step + * down due to inability to maintain a majority lease. + */ + svc = pool_svc_obj(rsvc); + if (svc->ps_error != 0) { + rc = svc->ps_error; + ds_rsvc_put_leader(rsvc); + return rc; + } + + *svcp = svc; return 0; } @@ -2582,6 +2624,7 @@ int ds_pool_failed_add(uuid_t uuid, int rc) uuid_copy(psf->psf_uuid, uuid); psf->psf_error = rc; d_list_add_tail(&psf->psf_link, &pool_svc_failed_list); + DL_ERROR(rc, DF_UUID ": added to list of failed pools", DP_UUID(uuid)); out: D_RWLOCK_UNLOCK(&psfl_rwlock); return ret; @@ -2596,6 +2639,8 @@ void ds_pool_failed_remove(uuid_t uuid) d_list_for_each_entry_safe(psf, tmp, &pool_svc_failed_list, psf_link) { if (uuid_compare(psf->psf_uuid, uuid) == 0) { d_list_del_init(&psf->psf_link); + DL_INFO(psf->psf_error, DF_UUID ": removed from list of failed pools", + DP_UUID(uuid)); D_FREE(psf); break; } @@ -3431,7 +3476,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid int rc = 0; if (!svc) { - rc = pool_svc_lookup(pool_uuid, &svc); + rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */); if (rc != 0) { DL_ERROR(rc, "pool_svc lookup failed"); goto out; @@ -3479,7 +3524,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid D_FREE(op_key_enc.iov_buf); out_svc: if (need_put_svc) - pool_svc_put(svc); + pool_svc_put_leader(svc); out: if (rc == 0) { *is_dup = duplicate; @@ -3526,7 +3571,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t int rc = 0; if (!svc) { - rc = pool_svc_lookup(pool_uuid, &svc); + rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */); if (rc != 0) { DL_ERROR(rc, "pool_svc lookup failed"); goto out; @@ -3588,7 +3633,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t D_FREE(op_key_enc.iov_buf); out_svc: if (need_put_svc) - pool_svc_put(svc); + pool_svc_put_leader(svc); out: return rc; } @@ -3681,6 +3726,13 @@ ds_pool_create_handler(crt_rpc_t *rpc) ABT_rwlock_wrlock(svc->ps_lock); ds_cont_wrlock_metadata(svc->ps_cont_svc); + if (svc->ps_error != 0) { + DL_ERROR(svc->ps_error, DF_UUID ": encountered pool service leader with error", + DP_UUID(svc->ps_uuid)); + rc = svc->ps_error; + goto out_tx; + } + /* See if the DB has already been initialized. */ d_iov_set(&value, NULL /* buf */, 0 /* size */); rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_map_buffer,