From f01b947db85f5928b0ae3f5b7c4a1a6300737e6d Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Sat, 14 Sep 2024 23:18:14 +0800 Subject: [PATCH] DAOS-16559 container: fix race of concurrent container destroy If there is more than one caller of destroy against a container, the current container destroy handler will hit an assertion. This patch fixes the race of concurrent destroy against the same container. Signed-off-by: Liang Zhen --- src/common/lru.c | 29 +++++++++++++++++------------ src/include/daos/lru.h | 4 ++-- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/common/lru.c b/src/common/lru.c index de86d367e0e..4351479c5f4 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -37,9 +37,12 @@ lru_hop_rec_decref(struct d_hash_table *htable, d_list_t *link) D_ASSERT(llink->ll_ref > 0); llink->ll_ref--; - /* eviction waiter is the last one holds refcount */ - if (llink->ll_wait_evict && - llink->ll_ops->lop_wakeup && daos_lru_is_last_user(llink)) + /* someone is waiting for eviction of this element, no one else takes refcount + * except the hash table. + */ + if (llink->ll_wait_evict > 0 && + llink->ll_wait_evict + 1 == llink->ll_ref && + llink->ll_ops->lop_wakeup) llink->ll_ops->lop_wakeup(llink); /* Delete from hash only if no more references */ @@ -289,15 +292,17 @@ daos_lru_ref_evict_wait(struct daos_lru_cache *lcache, struct daos_llink *llink) if (!llink->ll_evicted) daos_lru_ref_evict(lcache, llink); - if (lcache->dlc_ops->lop_wait && !daos_lru_is_last_user(llink)) { - /* Wait until I'm the last one. - * XXX: the implementation can only support one waiter for now, if there - * is a secondary ULT calls this function on the same item, it will hit - * the assertion. - */ - D_ASSERT(!llink->ll_wait_evict); - llink->ll_wait_evict = 1; + /* - hash table has +1 ll_ref, myself also has +1 ll_ref (reason of 2). + * - each eviction caller has +1 ll_ref, also has +1 ll_wait_evict. + * - each active user has +1 ll_ref. 
+ * + * if (ll_ref - ll_wait_evict) == 2, it means there is no active user on + * container, I don't need to wait anymore. + */ + if (llink->ll_ref - llink->ll_wait_evict > 2 && lcache->dlc_ops->lop_wait) { + llink->ll_wait_evict++; lcache->dlc_ops->lop_wait(llink); - llink->ll_wait_evict = 0; + D_ASSERT(llink->ll_wait_evict > 0); + llink->ll_wait_evict--; } } diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index 40bee5c492b..14fcc4511ea 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -37,8 +37,8 @@ struct daos_llink { d_list_t ll_link; /**< LRU hash link */ d_list_t ll_qlink; /**< Temp link for traverse */ uint32_t ll_ref; /**< refcount for this ref */ - uint32_t ll_evicted:1; /**< has been evicted */ - uint32_t ll_wait_evict:1; /**< wait for completion of eviction */ + uint32_t ll_wait_evict:24, /**< wait for completion of eviction */ + ll_evicted:1; /**< has been evicted */ struct daos_llink_ops *ll_ops; /**< ops to maintain refs */ };