Skip to content

Commit

Permalink
DAOS-16508 csum: retry a few times on checksum mismatch on update (#15069) (#15099)
Browse files Browse the repository at this point in the history

Unlike fetch, we return DER_CSUM on update (turned into EIO by dfs) without any retry.
We should retry a few times in case it is a transient error.

The patch also prints more information about the actual checksum mismatch.

Signed-off-by: Johann Lombardi <[email protected]>
Co-authored-by: Johann Lombardi <[email protected]>
  • Loading branch information
jolivier23 and johannlombardi authored Sep 9, 2024
1 parent 6ee42c8 commit c485fa4
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 9 deletions.
4 changes: 4 additions & 0 deletions src/common/checksum.c
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ daos_csummer_compare_csum_info(struct daos_csummer *obj,
match = daos_csummer_csum_compare(obj, ci_idx2csum(a, i),
ci_idx2csum(b, i),
a->cs_len);
if (unlikely(!match))
D_ERROR("Checksum mismatch at index %d/%d "DF_CI_BUF" != "DF_CI_BUF"\n", i,
a->cs_nr, DP_CI_BUF(ci_idx2csum(a, i), a->cs_len),
DP_CI_BUF(ci_idx2csum(b, i), b->cs_len));
}

return match;
Expand Down
3 changes: 3 additions & 0 deletions src/object/cli_csum.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
#include <daos/cont_props.h>
#include "obj_internal.h"

/**
 * Maximum number of times an UPDATE RPC is retried after a checksum error
 * (-DER_CSUM). Once this many retries have been attempted, the I/O is failed
 * instead of being retried again.
 */
#define MAX_CSUM_RETRY 10

int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param,
daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr,
struct dcs_layout *layout, struct dcs_csum_info **dkey_csum,
Expand Down
38 changes: 30 additions & 8 deletions src/object/cli_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -4684,12 +4684,15 @@ obj_comp_cb(tse_task_t *task, void *data)
int rc;

obj_auxi = tse_task_stack_pop(task, sizeof(*obj_auxi));
obj_auxi->io_retry = 0;
obj_auxi->result = 0;
obj_auxi->csum_retry = 0;
obj_auxi->tx_uncertain = 0;
obj_auxi->nvme_io_err = 0;
obj = obj_auxi->obj;

/** Clear various bits for a new attempt */
obj_auxi->io_retry = 0;
obj_auxi->result = 0;
obj_auxi->csum_retry = 0;
obj_auxi->tx_uncertain = 0;
obj_auxi->nvme_io_err = 0;

rc = obj_comp_cb_internal(obj_auxi);
if (rc != 0 || obj_auxi->result) {
if (task->dt_result == 0)
Expand Down Expand Up @@ -4760,9 +4763,28 @@ obj_comp_cb(tse_task_t *task, void *data)
obj_auxi->tx_uncertain = 1;
else
obj_auxi->nvme_io_err = 1;
} else if (task->dt_result != -DER_NVME_IO) {
/* Don't retry update for CSUM & UNCERTAIN errors */
obj_auxi->io_retry = 0;
} else {
if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE &&
task->dt_result == -DER_CSUM) {
struct shard_rw_args *rw_arg = &obj_auxi->rw_args;

/** Retry a few times on checksum error on update */
if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) {
obj_auxi->csum_retry = 1;
rw_arg->csum_retry_cnt++;
D_DEBUG(DB_IO, DF_OID" checksum error on "
"update, retrying\n",
DP_OID(obj->cob_md.omd_id));
} else {
D_ERROR(DF_OID" checksum error on update, "
"too many retries. Failing I/O\n",
DP_OID(obj->cob_md.omd_id));
obj_auxi->io_retry = 0;
}
} else if (task->dt_result != -DER_NVME_IO) {
/* Don't retry update for UNCERTAIN errors */
obj_auxi->io_retry = 0;
}
}
} else {
obj_auxi->io_retry = 0;
Expand Down
3 changes: 2 additions & 1 deletion src/object/obj_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ struct shard_rw_args {
struct dcs_csum_info *dkey_csum;
struct dcs_iod_csums *iod_csums;
struct obj_reasb_req *reasb_req;
uint16_t csum_retry_cnt;
};

struct coll_sparse_targets {
Expand Down Expand Up @@ -474,8 +475,8 @@ struct obj_auxi_args {
rebuilding:1,
for_migrate:1;
/* request flags. currently only: ORF_RESEND */
uint32_t flags;
uint32_t specified_shard;
uint32_t flags;
uint16_t retry_cnt;
uint16_t inprogress_cnt;
struct obj_req_tgts req_tgts;
Expand Down

0 comments on commit c485fa4

Please sign in to comment.