From a12c062892ec45d7d8b8983e6ef4bf1392da7a1d Mon Sep 17 00:00:00 2001 From: "david.chen" Date: Fri, 25 Oct 2024 00:26:12 +0000 Subject: [PATCH] ZFS send should use spill block prefetched from send_reader_thread Currently, even though send_reader_thread prefetches spill block, do_dump() will not use it and issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks. For unmodified spill blocks, we also create send_range struct for them in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so we don't break send_range_after check. Signed-off-by: Chunwei Chen --- module/zfs/dmu_send.c | 124 +++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index a174972e9b57..135bd48f978d 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -180,6 +180,8 @@ struct send_range { */ dnode_phys_t *dnp; blkptr_t bp; + /* Piggyback unmodified spill block */ + struct send_range *spill_range; } object; struct srr { uint32_t datablksz; @@ -231,6 +233,8 @@ range_free(struct send_range *range) size_t size = sizeof (dnode_phys_t) * (range->sru.object.dnp->dn_extra_slots + 1); kmem_free(range->sru.object.dnp, size); + if (range->sru.object.spill_range) + range_free(range->sru.object.spill_range); } else if (range->type == DATA) { mutex_enter(&range->sru.data.lock); while (range->sru.data.io_outstanding) @@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, drrs->drr_length = blksz; drrs->drr_toguid = dscp->dsc_toguid; - /* See comment in dump_dnode() for full details */ + /* See comment in piggyback_unmodified_spill() for full details */ if (zfs_send_unmodified_spill_blocks && (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; @@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); - /* - * Send DRR_SPILL records for unmodified spill blocks. This is useful - * because changing certain attributes of the object (e.g. blocksize) - * can cause old versions of ZFS to incorrectly remove a spill block. - * Including these records in the stream forces an up to date version - * to always be written ensuring they're never lost. Current versions - * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can - * ignore these unmodified spill blocks. - */ - if (zfs_send_unmodified_spill_blocks && - (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { - struct send_range record; - blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - - memset(&record, 0, sizeof (struct send_range)); - record.type = DATA; - record.object = object; - record.eos_marker = B_FALSE; - record.start_blkid = DMU_SPILL_BLKID; - record.end_blkid = record.start_blkid + 1; - record.sru.data.bp = *bp; - record.sru.data.obj_type = dnp->dn_type; - record.sru.data.datablksz = BP_GET_LSIZE(bp); - - if (do_dump(dscp, &record) != 0) - return (SET_ERROR(EINTR)); - } - if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); @@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) case OBJECT: err = dump_dnode(dscp, &range->sru.object.bp, range->object, range->sru.object.dnp); + /* Dump piggybacked unmodified spill block */ + if (!err && range->sru.object.spill_range) + err = do_dump(dscp, range->sru.object.spill_range); return (err); case OBJECT_RANGE: { ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); @@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); - if (BP_GET_TYPE(bp) == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - zio_flag_t zioflags = ZIO_FLAG_CANFAIL; - - if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_PROTECTED(bp)); - zioflags |= ZIO_FLAG_RAW; - } - zbookmark_phys_t zb; - ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); - zb.zb_objset = dmu_objset_id(dscp->dsc_os); - zb.zb_object = range->object; - zb.zb_level = 0; - zb.zb_blkid = range->start_blkid; - - arc_buf_t *abuf = NULL; - if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, - bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - zioflags, &aflags, &zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dscp, bp, zb.zb_object, - (abuf == NULL ? NULL : abuf->b_data)); - if (abuf != NULL) - arc_buf_destroy(abuf, &abuf); - return (err); - } if (send_do_embed(bp, dscp->dsc_featureflags)) { err = dump_write_embedded(dscp, range->object, range->start_blkid * srdp->datablksz, @@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) } ASSERT(range->object > dscp->dsc_resume_object || (range->object == dscp->dsc_resume_object && + (range->start_blkid == DMU_SPILL_BLKID || range->start_blkid * srdp->datablksz >= - dscp->dsc_resume_offset)); + dscp->dsc_resume_offset))); /* it's a level-0 block of a regular object */ mutex_enter(&srdp->lock); @@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT(dscp->dsc_dso->dso_dryrun || srdp->abuf != NULL || srdp->abd != NULL); - uint64_t offset = range->start_blkid * srdp->datablksz; - char *data = NULL; if (srdp->abd != NULL) { data = abd_to_buf(srdp->abd); @@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) data = srdp->abuf->b_data; } + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + err = dump_spill(dscp, bp, range->object, data); + return (err); + } + + uint64_t offset = range->start_blkid * srdp->datablksz; + /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from @@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; range->sru.data.io_compressed = B_FALSE; + } else if (type == OBJECT) { + range->sru.object.spill_range = NULL; } return (range); } @@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, bqueue_enqueue(q, range, datablksz); } +/* + * Send DRR_SPILL records for unmodified spill blocks. This is useful + * because changing certain attributes of the object (e.g. blocksize) + * can cause old versions of ZFS to incorrectly remove a spill block. + * Including these records in the stream forces an up to date version + * to always be written ensuring they're never lost. Current versions + * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can + * ignore these unmodified spill blocks. + */ +static uint64_t +piggyback_unmodified_spill(struct send_reader_thread_arg *srta, + struct send_range *range) +{ + if (range->type != OBJECT || + !zfs_send_unmodified_spill_blocks) + return (0); + + struct send_range *spill_range = NULL; + dnode_phys_t *dnp = range->sru.object.dnp; + uint64_t fromtxg = srta->smta->to_arg->fromtxg; + + if (!(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) || + !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg)) + return (0); + + blkptr_t *bp = DN_SPILL_BLKPTR(dnp); + + spill_range = range_alloc(DATA, range->object, DMU_SPILL_BLKID, + DMU_SPILL_BLKID+1, B_FALSE); + spill_range->sru.data.bp = *bp; + spill_range->sru.data.obj_type = dnp->dn_type; + spill_range->sru.data.datablksz = BP_GET_LSIZE(bp); + + issue_data_read(srta, spill_range); + range->sru.object.spill_range = spill_range; + + return (BP_GET_LSIZE(bp)); +} + /* * This thread is responsible for two things: First, it retrieves the correct * blkptr in the to ds if we need to send the data because of something from @@ -1783,7 +1782,10 @@ send_reader_thread(void *arg) case OBJECT: case OBJECT_RANGE: case REDACT: // Redacted blocks must exist - bqueue_enqueue(outq, range, sizeof (*range)); + /* For OBJECT */ + uint64_t spill = + piggyback_unmodified_spill(srta, range); + bqueue_enqueue(outq, range, sizeof (*range) + spill); range = get_next_range_nofree(inq, range); break; case PREVIOUSLY_REDACTED: {