Skip to content

Commit

Permalink
Merge pull request #15107 from daos-stack/mjmac/google/2.6
Browse files Browse the repository at this point in the history
Merge upstream/release/2.6 into upstream/google/2.6
  • Loading branch information
jolivier23 authored Sep 10, 2024
2 parents c485fa4 + f2c36ad commit 5d14963
Show file tree
Hide file tree
Showing 42 changed files with 418 additions and 217 deletions.
2 changes: 1 addition & 1 deletion TAG
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.6.0-rc3
2.6.1-rc1
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.6.0
2.6.1
6 changes: 6 additions & 0 deletions debian/changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
daos (2.6.1-1) unstable; urgency=medium
[ Phillip Henderson ]
* First release candidate for 2.6.1

-- Phillip Henderson <[email protected]> Mon, 09 Sep 2024 08:46:00 -0500

daos (2.6.0-5) unstable; urgency=medium
[ Tom Nabarro ]
* Add pciutils runtime dep for daos_server lspci call
Expand Down
1 change: 1 addition & 0 deletions docs/admin/env_variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ Environment variables in this section only apply to the server side.
|DAOS\_DTX\_RPC\_HELPER\_THD|DTX RPC helper threshold. The valid range is [18, unlimited). The default value is 513.|
|DAOS\_DTX\_BATCHED\_ULT\_MAX|The max count of DTX batched commit ULTs. The valid range is [0, unlimited). 0 means to commit DTX synchronously. The default value is 32.|
|DAOS\_FORWARD\_NEIGHBOR|Set to enable I/O forwarding on neighbor xstream in the absence of helper threads.|
|DAOS\_POOL\_RF|Redundancy factor for the pool. The valid range is [1, 4]. The default value is 2.|

## Server and Client environment variables

Expand Down
24 changes: 24 additions & 0 deletions docs/admin/pool_operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -916,6 +916,30 @@ and possibly repair a pmemobj file. As discussed in the previous section, the
rebuild status can be consulted via the pool query and will be expanded
with more information.

## Pool Redundancy Factor

If the DAOS system experiences cascading failures, where the number of failed
fault domains exceeds a pool's redundancy factor, there could be unrecoverable
errors and applications could suffer from data loss. This can happen during
power or network outages that cause node/engine failures. In most cases those
failures can be recovered from: the DAOS engines can be restarted and the
system can function again.

The administrator can set the default pool redundancy factor via the
"DAOS_POOL_RF" environment variable in the server YAML file. If SWIM detects
and reports that an engine is dead, and the number of failed fault domains
exceeds (or is about to exceed) the pool redundancy factor, DAOS will not
change the pool map immediately. Instead, it will emit a critical log message:
intolerable unavailability: engine rank x
In this case, the system administrator should investigate the failed engines,
try to recover them, and bring them back one by one with:
dmg system start --ranks=x
A reintegrate call is not needed.

For truly unrecoverable failures, the administrator can still exclude engines.
However, data loss is expected when the number of unrecoverable failures
exceeds the pool redundancy factor.

## Recovering Container Ownership

Typically users are expected to manage their containers. However, in the event
Expand Down
4 changes: 2 additions & 2 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
Expand Up @@ -668,7 +668,7 @@ chk_engine_pool_mbs_one(struct chk_pool_rec *cpr, struct pool_map *map, struct c
int rc = 0;
bool unknown;

dom = pool_map_find_node_by_rank(map, mbs->cpm_rank);
dom = pool_map_find_dom_by_rank(map, mbs->cpm_rank);
if (dom == NULL) {
D_ASSERT(mbs->cpm_rank != dss_self_rank());

Expand Down Expand Up @@ -777,7 +777,7 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map)
int j;
bool down;

rank_nr = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms);
rank_nr = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms);
if (rank_nr <= 0)
D_GOTO(out, rc = rank_nr);

Expand Down
54 changes: 23 additions & 31 deletions src/common/lru.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link)

D_ASSERT(llink->ll_ref > 0);
llink->ll_ref--;
if (llink->ll_ref == 1 && llink->ll_ops->lop_wakeup)

/* eviction waiter is the last one holds refcount */
if (llink->ll_wait_evict &&
llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink))
llink->ll_ops->lop_wakeup(llink);

/* Delete from hash only if no more references */
Expand Down Expand Up @@ -215,15 +218,6 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
if (link != NULL) {
llink = link2llink(link);
D_ASSERT(llink->ll_evicted == 0);
if (llink->ll_evicting) {
/**
* Avoid calling `lru_hop_rec_decref()` at this point
* to prevent `wakeup()` from being invoked twice.
*/
D_ASSERT(llink->ll_ref > 1);
llink->ll_ref--;
D_GOTO(out, rc = -DER_SHUTDOWN);
}
/* remove busy item from LRU */
if (!d_list_empty(&llink->ll_qlink))
d_list_del_init(&llink->ll_qlink);
Expand Down Expand Up @@ -257,24 +251,17 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key,
return rc;
}

static void
lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink, bool wait)
void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache != NULL && llink != NULL && llink->ll_ref > 1);
D_ASSERT(d_list_empty(&llink->ll_qlink));

lru_hop_rec_decref(&lcache->dlc_htable, &llink->ll_link);

if (wait && llink->ll_ref > 1) {
D_ASSERT(llink->ll_evicting == 0);
llink->ll_evicting = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_evicting = 0;
llink->ll_evicted = 1;
}

if (llink->ll_ref == 1) { /* the last refcount */
if (lcache->dlc_csize == 0)
/* zero-sized cache always evicts unused item */
if (lcache->dlc_csize == 0 && !llink->ll_evicted)
llink->ll_evicted = 1;

if (llink->ll_evicted) {
Expand All @@ -297,15 +284,20 @@ lru_ref_release_internal(struct daos_lru_cache *lcache, struct daos_llink *llink
}

void
daos_lru_ref_release(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
lru_ref_release_internal(lcache, llink, false);
}

void
daos_lru_ref_wait_evict(struct daos_lru_cache *lcache, struct daos_llink *llink)
daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink)
{
D_ASSERT(lcache->dlc_ops->lop_wait);

lru_ref_release_internal(lcache, llink, true);
if (!llink->ll_evicted)
daos_lru_ref_evict(lcache, llink);

if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) {
/* Wait until I'm the last one.
* XXX: the implementation can only support one waiter for now, if there
* is a secondary ULT calls this function on the same item, it will hit
* the assertion.
*/
D_ASSERT(!llink->ll_wait_evict);
llink->ll_wait_evict = 1;
lcache->dlc_ops->lop_wait(llink);
llink->ll_wait_evict = 0;
}
}
18 changes: 9 additions & 9 deletions src/common/pool_map.c
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,7 @@ add_domain_tree_to_pool_buf(struct pool_map *map, struct pool_buf *map_buf,
if (map) {
struct pool_domain *found_dom;

found_dom = pool_map_find_node_by_rank(map, rank);
found_dom = pool_map_find_dom_by_rank(map, rank);
if (found_dom) {
if (found_dom->do_comp.co_status == PO_COMP_ST_NEW)
found_new_dom = true;
Expand Down Expand Up @@ -2038,7 +2038,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id,
}

/**
* Find all nodes in the pool map.
* Find all ranks in the pool map.
*
* \param map [IN] pool map to search.
* \param id [IN] id to search.
Expand All @@ -2048,7 +2048,7 @@ pool_map_find_domain(struct pool_map *map, pool_comp_type_t type, uint32_t id,
* 0 if none.
*/
int
pool_map_find_nodes(struct pool_map *map, uint32_t id,
pool_map_find_ranks(struct pool_map *map, uint32_t id,
struct pool_domain **domain_pp)
{
return pool_map_find_domain(map, PO_COMP_TP_RANK, id,
Expand Down Expand Up @@ -2102,14 +2102,14 @@ pool_map_find_target(struct pool_map *map, uint32_t id,
* \return domain found by rank.
*/
struct pool_domain *
pool_map_find_node_by_rank(struct pool_map *map, uint32_t rank)
pool_map_find_dom_by_rank(struct pool_map *map, uint32_t rank)
{
struct pool_domain *doms;
struct pool_domain *found = NULL;
int doms_cnt;
int i;

doms_cnt = pool_map_find_nodes(map, PO_COMP_ID_ALL, &doms);
doms_cnt = pool_map_find_ranks(map, PO_COMP_ID_ALL, &doms);
if (doms_cnt <= 0)
return NULL;

Expand Down Expand Up @@ -2150,7 +2150,7 @@ pool_map_find_targets_on_ranks(struct pool_map *map, d_rank_list_t *rank_list,
for (i = 0; i < rank_list->rl_nr; i++) {
struct pool_domain *dom;

dom = pool_map_find_node_by_rank(map, rank_list->rl_ranks[i]);
dom = pool_map_find_dom_by_rank(map, rank_list->rl_ranks[i]);
if (dom == NULL) {
pool_target_id_list_free(tgts);
return 0;
Expand Down Expand Up @@ -2191,7 +2191,7 @@ pool_map_find_target_by_rank_idx(struct pool_map *map, uint32_t rank,
{
struct pool_domain *dom;

dom = pool_map_find_node_by_rank(map, rank);
dom = pool_map_find_dom_by_rank(map, rank);
if (dom == NULL)
return 0;

Expand Down Expand Up @@ -2867,7 +2867,7 @@ pool_map_find_by_rank_status(struct pool_map *map,

*tgt_ppp = NULL;
*tgt_cnt = 0;
dom = pool_map_find_node_by_rank(map, rank);
dom = pool_map_find_dom_by_rank(map, rank);
if (dom == NULL)
return 0;

Expand Down Expand Up @@ -2902,7 +2902,7 @@ pool_map_get_ranks(uuid_t pool_uuid, struct pool_map *map, bool get_enabled, d_r
struct pool_domain *domains = NULL;
d_rank_list_t *ranklist = NULL;

nnodes_tot = pool_map_find_nodes(map, PO_COMP_ID_ALL, &domains);
nnodes_tot = pool_map_find_ranks(map, PO_COMP_ID_ALL, &domains);
for (i = 0; i < nnodes_tot; i++) {
if (pool_map_node_status_match(&domains[i], ENABLED))
nnodes_enabled++;
Expand Down
2 changes: 1 addition & 1 deletion src/container/cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -3386,7 +3386,7 @@ dc_cont_node_id2ptr(daos_handle_t coh, uint32_t node_id,
pool = dc_hdl2pool(dc->dc_pool_hdl);
D_ASSERT(pool != NULL);
D_RWLOCK_RDLOCK(&pool->dp_map_lock);
n = pool_map_find_nodes(pool->dp_map, node_id, dom);
n = pool_map_find_ranks(pool->dp_map, node_id, dom);
D_RWLOCK_UNLOCK(&pool->dp_map_lock);
dc_pool_put(pool);
dc_cont_put(dc);
Expand Down
15 changes: 7 additions & 8 deletions src/container/srv_container.c
Original file line number Diff line number Diff line change
Expand Up @@ -1667,7 +1667,7 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid,
{
struct cont_ec_agg *ec_agg = NULL;
struct pool_domain *doms;
int node_nr;
int rank_nr;
int rc = 0;
int i;

Expand All @@ -1676,19 +1676,18 @@ cont_ec_agg_alloc(struct cont_svc *cont_svc, uuid_t cont_uuid,
return -DER_NOMEM;

D_ASSERT(cont_svc->cs_pool->sp_map != NULL);
node_nr = pool_map_find_nodes(cont_svc->cs_pool->sp_map,
PO_COMP_ID_ALL, &doms);
if (node_nr < 0)
D_GOTO(out, rc = node_nr);
rank_nr = pool_map_find_ranks(cont_svc->cs_pool->sp_map, PO_COMP_ID_ALL, &doms);
if (rank_nr < 0)
D_GOTO(out, rc = rank_nr);

D_ALLOC_ARRAY(ec_agg->ea_server_ephs, node_nr);
D_ALLOC_ARRAY(ec_agg->ea_server_ephs, rank_nr);
if (ec_agg->ea_server_ephs == NULL)
D_GOTO(out, rc = -DER_NOMEM);

uuid_copy(ec_agg->ea_cont_uuid, cont_uuid);
ec_agg->ea_servers_num = node_nr;
ec_agg->ea_servers_num = rank_nr;
ec_agg->ea_current_eph = 0;
for (i = 0; i < node_nr; i++) {
for (i = 0; i < rank_nr; i++) {
ec_agg->ea_server_ephs[i].rank = doms[i].do_comp.co_rank;
ec_agg->ea_server_ephs[i].eph = 0;
}
Expand Down
4 changes: 3 additions & 1 deletion src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1261,7 +1261,9 @@ cont_child_destroy_one(void *vin)
D_GOTO(out_pool, rc = -DER_BUSY);
} /* else: resync should have completed, try again */

daos_lru_ref_wait_evict(tls->dt_cont_cache, &cont->sc_list);
/* nobody should see it again after eviction */
daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list);
daos_lru_ref_release(tls->dt_cont_cache, &cont->sc_list);
}

D_DEBUG(DB_MD, DF_CONT": destroying vos container\n",
Expand Down
14 changes: 7 additions & 7 deletions src/dtx/dtx_coll.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
struct dtx_coll_target *dct;
struct dtx_coll_entry *dce = NULL;
struct daos_obj_md md = { 0 };
uint32_t node_nr;
uint32_t rank_nr;
d_rank_t my_rank = dss_self_rank();
d_rank_t max_rank = 0;
int rc = 0;
Expand Down Expand Up @@ -192,19 +192,19 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
}
}

node_nr = pool_map_node_nr(map->pl_poolmap);
if (unlikely(node_nr == 1))
rank_nr = pool_map_rank_nr(map->pl_poolmap);
if (unlikely(rank_nr == 1))
D_GOTO(out, rc = 0);

dce->dce_ranks = d_rank_list_alloc(node_nr - 1);
dce->dce_ranks = d_rank_list_alloc(rank_nr - 1);
if (dce->dce_ranks == NULL)
D_GOTO(out, rc = -DER_NOMEM);

D_ALLOC_ARRAY(dce->dce_hints, node_nr);
D_ALLOC_ARRAY(dce->dce_hints, rank_nr);
if (dce->dce_hints == NULL)
D_GOTO(out, rc = -DER_NOMEM);

for (i = 0; i < node_nr; i++)
for (i = 0; i < rank_nr; i++)
dce->dce_hints[i] = (uint8_t)(-1);

md.omd_id = oid.id_pub;
Expand All @@ -220,7 +220,7 @@ dtx_coll_prep(uuid_t po_uuid, daos_unit_oid_t oid, struct dtx_id *xid, struct dt
goto out;
}

for (i = 0, j = 0; i < layout->ol_nr && j < node_nr - 1; i++) {
for (i = 0, j = 0; i < layout->ol_nr && j < rank_nr - 1; i++) {
if (layout->ol_shards[i].po_target == -1 || layout->ol_shards[i].po_shard == -1)
continue;

Expand Down
2 changes: 1 addition & 1 deletion src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1341,7 +1341,7 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_hdl *coh, int resul
* it persistently. Otherwise, the subsequent DTX resync may not find it as
* to regard it as failed transaction and abort it.
*/
if (result == 0 && !dth->dth_active && !dth->dth_prepared &&
if (result == 0 && !dth->dth_active && !dth->dth_prepared && !dth->dth_solo &&
(dth->dth_dist || dth->dth_modification_cnt > 0)) {
result = vos_dtx_attach(dth, true, dth->dth_ent != NULL ? true : false);
if (unlikely(result < 0)) {
Expand Down
7 changes: 7 additions & 0 deletions src/dtx/tests/srv_mock.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ ds_pool_child_put(struct ds_pool_child *child)
assert_true(false);
}

/*
 * Mock stub for the unit-test build: the code under test is never expected
 * to call ds_pool_child_find(). If it is reached, assert_true(false) fails
 * the cmocka test immediately; the return is only to satisfy the signature.
 */
struct ds_pool_child *
ds_pool_child_find(const uuid_t uuid)
{
	assert_true(false);
	return NULL;
}

struct ds_pool_child *
ds_pool_child_lookup(const uuid_t uuid)
{
Expand Down
Loading

0 comments on commit 5d14963

Please sign in to comment.