diff --git a/src/common/checksum.c b/src/common/checksum.c index 69cf89ab07a..c36f14e3c6d 100644 --- a/src/common/checksum.c +++ b/src/common/checksum.c @@ -278,6 +278,10 @@ daos_csummer_compare_csum_info(struct daos_csummer *obj, match = daos_csummer_csum_compare(obj, ci_idx2csum(a, i), ci_idx2csum(b, i), a->cs_len); + if (unlikely(!match)) + D_ERROR("Checksum mismatch at index %d/%d "DF_CI_BUF" != "DF_CI_BUF"\n", i, + a->cs_nr, DP_CI_BUF(ci_idx2csum(a, i), a->cs_len), + DP_CI_BUF(ci_idx2csum(b, i), b->cs_len)); } return match; diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h index 3b1431b8c1f..566c707054d 100644 --- a/src/object/cli_csum.h +++ b/src/object/cli_csum.h @@ -11,6 +11,9 @@ #include #include "obj_internal.h" +/** How many times to retry UPDATE RPCs on checksum error */ +#define MAX_CSUM_RETRY 10 + int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param, daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr, struct dcs_layout *layout, struct dcs_csum_info **dkey_csum, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 2c0c0a952ea..318ba0cb51e 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4684,12 +4684,15 @@ obj_comp_cb(tse_task_t *task, void *data) int rc; obj_auxi = tse_task_stack_pop(task, sizeof(*obj_auxi)); - obj_auxi->io_retry = 0; - obj_auxi->result = 0; - obj_auxi->csum_retry = 0; - obj_auxi->tx_uncertain = 0; - obj_auxi->nvme_io_err = 0; obj = obj_auxi->obj; + + /** Clear various bits for a new attempt */ + obj_auxi->io_retry = 0; + obj_auxi->result = 0; + obj_auxi->csum_retry = 0; + obj_auxi->tx_uncertain = 0; + obj_auxi->nvme_io_err = 0; + rc = obj_comp_cb_internal(obj_auxi); if (rc != 0 || obj_auxi->result) { if (task->dt_result == 0) @@ -4760,9 +4763,28 @@ obj_comp_cb(tse_task_t *task, void *data) obj_auxi->tx_uncertain = 1; else obj_auxi->nvme_io_err = 1; - } else if (task->dt_result != -DER_NVME_IO) { - /* Don't retry update for CSUM & UNCERTAIN errors */ - obj_auxi->io_retry = 0; + } else { + if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE && + task->dt_result == -DER_CSUM) { + struct shard_rw_args *rw_arg = &obj_auxi->rw_args; + + /** Retry a few times on checksum error on update */ + if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) { + obj_auxi->csum_retry = 1; + rw_arg->csum_retry_cnt++; + D_DEBUG(DB_IO, DF_OID" checksum error on " + "update, retrying\n", + DP_OID(obj->cob_md.omd_id)); + } else { + D_ERROR(DF_OID" checksum error on update, " + "too many retries. Failing I/O\n", + DP_OID(obj->cob_md.omd_id)); + obj_auxi->io_retry = 0; + } + } else if (task->dt_result != -DER_NVME_IO) { + /* Don't retry update for UNCERTAIN errors */ + obj_auxi->io_retry = 0; + } } } else { obj_auxi->io_retry = 0; diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ec6aa3b817e..ae5c9c82fd1 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -281,6 +281,7 @@ struct shard_rw_args { struct dcs_csum_info *dkey_csum; struct dcs_iod_csums *iod_csums; struct obj_reasb_req *reasb_req; + uint16_t csum_retry_cnt; }; struct coll_sparse_targets { @@ -474,8 +475,8 @@ struct obj_auxi_args { rebuilding:1, for_migrate:1; /* request flags. currently only: ORF_RESEND */ - uint32_t flags; uint32_t specified_shard; + uint32_t flags; uint16_t retry_cnt; uint16_t inprogress_cnt; struct obj_req_tgts req_tgts;