diff --git a/src/pool/srv_pool.c b/src/pool/srv_pool.c index 06a5f29e866..36b099ef27a 100644 --- a/src/pool/srv_pool.c +++ b/src/pool/srv_pool.c @@ -213,6 +213,7 @@ struct pool_svc { rdb_path_t ps_handles; /* pool handle KVS */ rdb_path_t ps_user; /* pool user attributes KVS */ rdb_path_t ps_ops; /* metadata ops KVS */ + int ps_error; /* in DB data (see pool_svc_lookup_leader) */ struct pool_svc_events ps_events; uint32_t ps_global_version; int ps_svc_rf; @@ -1950,8 +1951,11 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out, rc = ds_pool_svc_load(&tx, svc->ps_uuid, &svc->ps_root, &svc->ps_global_version, &map_buf, &map_version); - if (rc != 0) + if (rc != 0) { + if (rc == -DER_DF_INCOMPT) + svc->ps_error = rc; goto out_lock; + } rc = pool_prop_read(&tx, svc, DAOS_PO_QUERY_PROP_ALL, &prop); if (rc != 0) { @@ -2263,6 +2267,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) d_rank_t rank = dss_self_rank(); int rc; + D_ASSERTF(svc->ps_error == 0, "ps_error: " DF_RC "\n", DP_RC(svc->ps_error)); + /* * If this is the only voting replica, it may have become the leader * without doing any RPC. The primary group may have yet to be @@ -2328,9 +2334,9 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) } rc = ds_pool_iv_prop_update(svc->ps_pool, prop); - if (rc) { - D_ERROR("ds_pool_iv_prop_update failed: " DF_RC "\n", DP_RC(rc)); - D_GOTO(out, rc); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": ds_pool_iv_prop_update failed", DP_UUID(svc->ps_uuid)); + goto out; } if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) { @@ -2346,11 +2352,10 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid); } - rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, - cont_hdl_uuid); - if (rc) { - D_ERROR("ds_pool_iv_srv_hdl_update failed: " DF_RC "\n", DP_RC(rc)); - D_GOTO(out, rc); + rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid); + if (rc != 0) { + DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid)); + goto out; } /* resume pool upgrade if needed */ @@ -2385,27 +2390,39 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc) D_FREE(map_buf); if (prop != NULL) daos_prop_free(prop); - if (rc < 0) - ds_pool_failed_add(svc->ps_uuid, rc); - else if (rc == 0) - ds_pool_failed_remove(svc->ps_uuid); + if (svc->ps_error != 0) { + /* + * Step up with the error anyway, so that RPCs to the PS + * receive an error instead of timeouts. + */ + DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64 + " with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank, + svc->ps_rsvc.s_term, DP_RC(svc->ps_error)); + rc = 0; + } return rc; } static void pool_svc_step_down_cb(struct ds_rsvc *rsvc) { - struct pool_svc *svc = pool_svc_obj(rsvc); - d_rank_t rank = dss_self_rank(); - - pool_svc_step_down_metrics(svc); - fini_events(svc); - sched_cancel_and_wait(&svc->ps_reconf_sched); - sched_cancel_and_wait(&svc->ps_rfcheck_sched); - ds_cont_svc_step_down(svc->ps_cont_svc); + struct pool_svc *svc = pool_svc_obj(rsvc); + d_rank_t rank = dss_self_rank(); - DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n", - DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term); + if (svc->ps_error == 0) { + pool_svc_step_down_metrics(svc); + fini_events(svc); + sched_cancel_and_wait(&svc->ps_reconf_sched); + sched_cancel_and_wait(&svc->ps_rfcheck_sched); + ds_cont_svc_step_down(svc->ps_cont_svc); + DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n", + DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term); + } else { + DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64 + " with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank, + svc->ps_rsvc.s_term, DP_RC(svc->ps_error)); + svc->ps_error = 0; + } } static void @@ -2476,12 +2493,13 @@ ds_pool_rsvc_class_unregister(void) ds_rsvc_class_unregister(DS_RSVC_CLASS_POOL); } +/* Use pool_svc_lookup_leader instead. */ static int pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp) { struct ds_rsvc *rsvc; - d_iov_t id; - int rc; + d_iov_t id; + int rc; d_iov_set(&id, uuid, sizeof(uuid_t)); rc = ds_rsvc_lookup(DS_RSVC_CLASS_POOL, &id, &rsvc); @@ -2492,17 +2510,16 @@ pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp) } static int -pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, - struct rsvc_hint *hint) +pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, struct rsvc_hint *hint) { - struct ds_rsvc *rsvc; - d_iov_t id; - int rc; + struct ds_rsvc *rsvc; + d_iov_t id; + struct pool_svc *svc; + int rc; rc = ds_pool_failed_lookup(uuid); - if (rc) { - D_ERROR(DF_UUID": failed to start: "DF_RC"\n", - DP_UUID(uuid), DP_RC(rc)); + if (rc != 0) { + D_DEBUG(DB_MD, DF_UUID ": failed: " DF_RC "\n", DP_UUID(uuid), DP_RC(rc)); return -DER_NO_SERVICE; } @@ -2510,7 +2527,31 @@ pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, rc = ds_rsvc_lookup_leader(DS_RSVC_CLASS_POOL, &id, &rsvc, hint); if (rc != 0) return rc; - *svcp = pool_svc_obj(rsvc); + + /* + * The svc->ps_error field stores a persistent error, usually in the DB + * data, if any. For instance, "the layout of the DB data is + * incompatible with the software version". This mustn't be a replica + * error, because there may be a majorty of replicas working. We let the + * PS step up with this error so that it can serve all requests by + * returning the error. PS clients therefore get a quick error response + * instead of a timeout. + * + * Checking svc->ps_error here without confirming our leadership via + * rdb_raft_verify_leadership may cause some requests to get + * unnecessary errors, if there is a newer leader whose svc->ps_error + * is zero and is able to serve those requests. Such a state won't last + * much longer than an election timeout though, because we will step + * down due to inability to maintain a majority lease. + */ + svc = pool_svc_obj(rsvc); + if (svc->ps_error != 0) { + rc = svc->ps_error; + ds_rsvc_put_leader(rsvc); + return rc; + } + + *svcp = svc; return 0; } @@ -2582,6 +2623,7 @@ int ds_pool_failed_add(uuid_t uuid, int rc) uuid_copy(psf->psf_uuid, uuid); psf->psf_error = rc; d_list_add_tail(&psf->psf_link, &pool_svc_failed_list); + DL_ERROR(rc, DF_UUID ": added to list of failed pools", DP_UUID(uuid)); out: D_RWLOCK_UNLOCK(&psfl_rwlock); return ret; @@ -2596,6 +2638,8 @@ void ds_pool_failed_remove(uuid_t uuid) d_list_for_each_entry_safe(psf, tmp, &pool_svc_failed_list, psf_link) { if (uuid_compare(psf->psf_uuid, uuid) == 0) { d_list_del_init(&psf->psf_link); + DL_INFO(psf->psf_error, DF_UUID ": removed from list of failed pools", + DP_UUID(uuid)); D_FREE(psf); break; } @@ -3431,7 +3475,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid int rc = 0; if (!svc) { - rc = pool_svc_lookup(pool_uuid, &svc); + rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */); if (rc != 0) { DL_ERROR(rc, "pool_svc lookup failed"); goto out; @@ -3479,7 +3523,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid D_FREE(op_key_enc.iov_buf); out_svc: if (need_put_svc) - pool_svc_put(svc); + pool_svc_put_leader(svc); out: if (rc == 0) { *is_dup = duplicate; @@ -3526,7 +3570,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t int rc = 0; if (!svc) { - rc = pool_svc_lookup(pool_uuid, &svc); + rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */); if (rc != 0) { DL_ERROR(rc, "pool_svc lookup failed"); goto out; @@ -3588,7 +3632,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t D_FREE(op_key_enc.iov_buf); out_svc: if (need_put_svc) - pool_svc_put(svc); + pool_svc_put_leader(svc); out: return rc; } @@ -3681,6 +3725,13 @@ ds_pool_create_handler(crt_rpc_t *rpc) ABT_rwlock_wrlock(svc->ps_lock); ds_cont_wrlock_metadata(svc->ps_cont_svc); + if (svc->ps_error != 0) { + DL_ERROR(svc->ps_error, DF_UUID ": encountered pool service leader with error", + DP_UUID(svc->ps_uuid)); + rc = svc->ps_error; + goto out_tx; + } + /* See if the DB has already been initialized. */ d_iov_set(&value, NULL /* buf */, 0 /* size */); rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_map_buffer,