Skip to content

Commit

Permalink
DAOS-16916 container: check inflight open (#15682) (#15745)
Browse files Browse the repository at this point in the history
Check for an inflight container open, which might be stuck in
IV fetch. A subsequent container open would then just increase
the open count, and if the earlier container open then fails,
it triggers an assertion failure.

So let's retry if there is an inflight container open.

Signed-off-by: Di Wang <[email protected]>
  • Loading branch information
jolivier23 authored Jan 17, 2025
1 parent 0405fff commit 8ddde42
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 17 deletions.
29 changes: 26 additions & 3 deletions src/container/srv_target.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -1607,11 +1608,23 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
*/
D_ASSERT(hdl->sch_cont != NULL);
D_ASSERT(hdl->sch_cont->sc_pool != NULL);

hdl->sch_cont->sc_open++;
if (hdl->sch_cont->sc_open > 1) {
/* If there is an inflight open being stuck, then
* let's retry and wait until it finished.
*/
if (hdl->sch_cont->sc_open_initializing) {
hdl->sch_cont->sc_open--;
D_GOTO(err_cont, rc = -DER_AGAIN);
}

if (hdl->sch_cont->sc_open > 1)
goto opened;
/* Only go through if the 1st open succeeds */
if (hdl->sch_cont->sc_props_fetched)
goto opened;
}

hdl->sch_cont->sc_open_initializing = 1;
if (ds_pool_restricted(hdl->sch_cont->sc_pool->spc_pool, false))
goto csum_init;

Expand Down Expand Up @@ -1646,6 +1659,8 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
rc = ds_cont_csummer_init(hdl->sch_cont);
if (rc != 0)
D_GOTO(err_dtx, rc);

hdl->sch_cont->sc_open_initializing = 0;
}
opened:
if (cont_hdl != NULL) {
Expand All @@ -1663,6 +1678,7 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
dtx_cont_close(hdl->sch_cont, true);

err_cont:
hdl->sch_cont->sc_open_initializing = 0;
if (daos_handle_is_valid(poh)) {
int rc_tmp;

Expand Down Expand Up @@ -1750,9 +1766,15 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
D_DEBUG(DB_TRACE, "open pool/cont/hdl "DF_UUID"/"DF_UUID"/"DF_UUID"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid));

retry:
rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0);
if (rc != 0)
if (rc != 0) {
if (rc == -DER_AGAIN) {
dss_sleep(50);
goto retry;
}

/* Once it exclude the target from the pool, since the target
* might still in the cart group, so IV cont open might still
* come to this target, especially if cont open/close will be
Expand All @@ -1762,6 +1784,7 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
D_ERROR("open "DF_UUID"/"DF_UUID"/"DF_UUID":"DF_RC"\n",
DP_UUID(pool_uuid), DP_UUID(cont_uuid),
DP_UUID(cont_hdl_uuid), DP_RC(rc));
}

return rc;
}
Expand Down
20 changes: 6 additions & 14 deletions src/include/daos_srv/container.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2015-2024 Intel Corporation.
* (C) Copyright 2025 Google LLC
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -66,20 +67,11 @@ struct ds_cont_child {
ABT_cond sc_scrub_cond;
ABT_cond sc_rebuild_cond;
ABT_cond sc_fini_cond;
uint32_t sc_dtx_resyncing:1,
sc_dtx_reindex:1,
sc_dtx_reindex_abort:1,
sc_dtx_delay_reset:1,
sc_dtx_registered:1,
sc_props_fetched:1,
sc_stopping:1,
sc_destroying:1,
sc_vos_agg_active:1,
sc_ec_agg_active:1,
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
sc_rw_disabled:1,
sc_scrubbing:1,
sc_rebuilding:1;
uint32_t sc_dtx_resyncing : 1, sc_dtx_reindex : 1, sc_dtx_reindex_abort : 1,
sc_dtx_delay_reset : 1, sc_dtx_registered : 1, sc_props_fetched : 1, sc_stopping : 1,
sc_destroying : 1, sc_vos_agg_active : 1, sc_ec_agg_active : 1,
/* flag of CONT_CAPA_READ_DATA/_WRITE_DATA disabled */
sc_rw_disabled : 1, sc_scrubbing : 1, sc_rebuilding : 1, sc_open_initializing : 1;
uint32_t sc_dtx_batched_gen;
/* Tracks the schedule request for aggregation ULT */
struct sched_request *sc_agg_req;
Expand Down

0 comments on commit 8ddde42

Please sign in to comment.