Skip to content

Commit

Permalink
DAOS-16878 pool: Reduce unexpected DER_NO_SERVICEs
Browse files Browse the repository at this point in the history
It has been observed that pool_svc_step_up_cb may encounter a
-DER_NOTLEADER and pass it to ds_pool_failed_add. This error is a
replica error and may be transient; it doesn't indicate that the PS is
unavailable. This patch addresses the observed scenario by replacing the
ds_pool_failed_add call from pool_svc_step_up_cb with a special
up-but-with-error mode for the PS, which can only serve requests by
returning an error.

  - Add pool_svc.ps_error for indicating the special up-but-with-error
    mode. Check and return it in pool_svc_lookup_leader. Handle it
    specially in callers of pool_svc_lookup.

  - Use this new mode only for a conservative set of errors. Including
    an error by mistake is worse than missing an error.

  - Add pool UUIDs to a few log messages to make future debugging
    easier.

The ds_pool_failed_add mechanism should be used for replica errors only.
And, such errors should not immediately stop PS clients from trying
other replicas. This issue is relatively tricky and will not be
addressed by the current patch.

Features: pool
Skip-nlt: true
Signed-off-by: Li Wei <[email protected]>
Required-githooks: true
  • Loading branch information
liw committed Jan 8, 2025
1 parent 14bddd2 commit 39c3cbb
Showing 1 changed file with 89 additions and 38 deletions.
127 changes: 89 additions & 38 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ struct pool_svc {
rdb_path_t ps_handles; /* pool handle KVS */
rdb_path_t ps_user; /* pool user attributes KVS */
rdb_path_t ps_ops; /* metadata ops KVS */
int ps_error; /* in DB data (see pool_svc_lookup_leader) */
struct pool_svc_events ps_events;
uint32_t ps_global_version;
int ps_svc_rf;
Expand Down Expand Up @@ -1950,8 +1951,11 @@ read_db_for_stepping_up(struct pool_svc *svc, struct pool_buf **map_buf_out,

rc = ds_pool_svc_load(&tx, svc->ps_uuid, &svc->ps_root, &svc->ps_global_version, &map_buf,
&map_version);
if (rc != 0)
if (rc != 0) {
if (rc == -DER_DF_INCOMPT)
svc->ps_error = rc;
goto out_lock;
}

rc = pool_prop_read(&tx, svc, DAOS_PO_QUERY_PROP_ALL, &prop);
if (rc != 0) {
Expand Down Expand Up @@ -2263,6 +2267,8 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
d_rank_t rank = dss_self_rank();
int rc;

D_ASSERTF(svc->ps_error == 0, "ps_error: " DF_RC "\n", DP_RC(svc->ps_error));

/*
* If this is the only voting replica, it may have become the leader
* without doing any RPC. The primary group may have yet to be
Expand Down Expand Up @@ -2328,9 +2334,9 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
}

rc = ds_pool_iv_prop_update(svc->ps_pool, prop);
if (rc) {
D_ERROR("ds_pool_iv_prop_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_prop_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

if (!uuid_is_null(svc->ps_pool->sp_srv_cont_hdl)) {
Expand All @@ -2346,11 +2352,10 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
uuid_copy(svc->ps_pool->sp_srv_pool_hdl, pool_hdl_uuid);
}

rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid,
cont_hdl_uuid);
if (rc) {
D_ERROR("ds_pool_iv_srv_hdl_update failed: " DF_RC "\n", DP_RC(rc));
D_GOTO(out, rc);
rc = ds_pool_iv_srv_hdl_update(svc->ps_pool, pool_hdl_uuid, cont_hdl_uuid);
if (rc != 0) {
DL_ERROR(rc, DF_UUID ": ds_pool_iv_srv_hdl_update failed", DP_UUID(svc->ps_uuid));
goto out;
}

/* resume pool upgrade if needed */
Expand Down Expand Up @@ -2385,27 +2390,39 @@ pool_svc_step_up_cb(struct ds_rsvc *rsvc)
D_FREE(map_buf);
if (prop != NULL)
daos_prop_free(prop);
if (rc < 0)
ds_pool_failed_add(svc->ps_uuid, rc);
else if (rc == 0)
ds_pool_failed_remove(svc->ps_uuid);
if (svc->ps_error != 0) {
/*
* Step up with the error anyway, so that RPCs to the PS
* receive an error instead of timeouts.
*/
DS_POOL_NOTE_PRINT(DF_UUID": rank %u became pool service leader "DF_U64
" with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_RC(svc->ps_error));
rc = 0;
}
return rc;
}

static void
pool_svc_step_down_cb(struct ds_rsvc *rsvc)
{
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
struct pool_svc *svc = pool_svc_obj(rsvc);
d_rank_t rank = dss_self_rank();

DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n",
DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term);
if (svc->ps_error == 0) {
pool_svc_step_down_metrics(svc);
fini_events(svc);
sched_cancel_and_wait(&svc->ps_reconf_sched);
sched_cancel_and_wait(&svc->ps_rfcheck_sched);
ds_cont_svc_step_down(svc->ps_cont_svc);
DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64"\n",
DP_UUID(svc->ps_uuid), rank, svc->ps_rsvc.s_term);
} else {
DS_POOL_NOTE_PRINT(DF_UUID": rank %u no longer pool service leader "DF_U64
" with error: "DF_RC"\n", DP_UUID(svc->ps_uuid), rank,
svc->ps_rsvc.s_term, DP_RC(svc->ps_error));
svc->ps_error = 0;
}
}

static void
Expand Down Expand Up @@ -2476,12 +2493,13 @@ ds_pool_rsvc_class_unregister(void)
ds_rsvc_class_unregister(DS_RSVC_CLASS_POOL);
}

/* Use pool_svc_lookup_leader instead. */
static int
pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
d_iov_t id;
int rc;

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup(DS_RSVC_CLASS_POOL, &id, &rsvc);
Expand All @@ -2492,25 +2510,48 @@ pool_svc_lookup(uuid_t uuid, struct pool_svc **svcp)
}

static int
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp,
struct rsvc_hint *hint)
pool_svc_lookup_leader(uuid_t uuid, struct pool_svc **svcp, struct rsvc_hint *hint)
{
struct ds_rsvc *rsvc;
d_iov_t id;
int rc;
struct ds_rsvc *rsvc;
d_iov_t id;
struct pool_svc *svc;
int rc;

rc = ds_pool_failed_lookup(uuid);
if (rc) {
D_ERROR(DF_UUID": failed to start: "DF_RC"\n",
DP_UUID(uuid), DP_RC(rc));
if (rc != 0) {
D_DEBUG(DB_MD, DF_UUID ": failed: " DF_RC "\n", DP_UUID(uuid), DP_RC(rc));
return -DER_NO_SERVICE;
}

d_iov_set(&id, uuid, sizeof(uuid_t));
rc = ds_rsvc_lookup_leader(DS_RSVC_CLASS_POOL, &id, &rsvc, hint);
if (rc != 0)
return rc;
*svcp = pool_svc_obj(rsvc);

/*
* The svc->ps_error field stores a persistent error, usually in the DB
* data, if any. For instance, "the layout of the DB data is
* incompatible with the software version". This mustn't be a replica
* error, because there may be a majorty of replicas working. We let the
* PS step up with this error so that it can serve all requests by
* returning the error. PS clients therefore get a quick error response
* instead of a timeout.
*
* Checking svc->ps_error here without confirming our leadership via
* rdb_raft_verify_leadership may cause some requests to get
* unnecessary errors, if there is a newer leader whose svc->ps_error
* is zero and is able to serve those requests. Such a state won't last
* much longer than an election timeout though, because we will step
* down due to inability to maintain a majority lease.
*/
svc = pool_svc_obj(rsvc);
if (svc->ps_error != 0) {
rc = svc->ps_error;
ds_rsvc_put_leader(rsvc);
return rc;
}

*svcp = svc;
return 0;
}

Expand Down Expand Up @@ -2582,6 +2623,7 @@ int ds_pool_failed_add(uuid_t uuid, int rc)
uuid_copy(psf->psf_uuid, uuid);
psf->psf_error = rc;
d_list_add_tail(&psf->psf_link, &pool_svc_failed_list);
DL_ERROR(rc, DF_UUID ": added to list of failed pools", DP_UUID(uuid));
out:
D_RWLOCK_UNLOCK(&psfl_rwlock);
return ret;
Expand All @@ -2596,6 +2638,8 @@ void ds_pool_failed_remove(uuid_t uuid)
d_list_for_each_entry_safe(psf, tmp, &pool_svc_failed_list, psf_link) {
if (uuid_compare(psf->psf_uuid, uuid) == 0) {
d_list_del_init(&psf->psf_link);
DL_INFO(psf->psf_error, DF_UUID ": removed from list of failed pools",
DP_UUID(uuid));
D_FREE(psf);
break;
}
Expand Down Expand Up @@ -3431,7 +3475,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3479,7 +3523,7 @@ ds_pool_svc_ops_lookup(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
if (rc == 0) {
*is_dup = duplicate;
Expand Down Expand Up @@ -3526,7 +3570,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
int rc = 0;

if (!svc) {
rc = pool_svc_lookup(pool_uuid, &svc);
rc = pool_svc_lookup_leader(pool_uuid, &svc, NULL /* hint */);
if (rc != 0) {
DL_ERROR(rc, "pool_svc lookup failed");
goto out;
Expand Down Expand Up @@ -3588,7 +3632,7 @@ ds_pool_svc_ops_save(struct rdb_tx *tx, void *pool_svc, uuid_t pool_uuid, uuid_t
D_FREE(op_key_enc.iov_buf);
out_svc:
if (need_put_svc)
pool_svc_put(svc);
pool_svc_put_leader(svc);
out:
return rc;
}
Expand Down Expand Up @@ -3681,6 +3725,13 @@ ds_pool_create_handler(crt_rpc_t *rpc)
ABT_rwlock_wrlock(svc->ps_lock);
ds_cont_wrlock_metadata(svc->ps_cont_svc);

if (svc->ps_error != 0) {
DL_ERROR(svc->ps_error, DF_UUID ": encountered pool service leader with error",
DP_UUID(svc->ps_uuid));
rc = svc->ps_error;
goto out_tx;
}

/* See if the DB has already been initialized. */
d_iov_set(&value, NULL /* buf */, 0 /* size */);
rc = rdb_tx_lookup(&tx, &svc->ps_root, &ds_pool_prop_map_buffer,
Expand Down

0 comments on commit 39c3cbb

Please sign in to comment.