Skip to content

Commit 9341713

Browse files
author
Paul Dagnelie
committed
Make ganging redundancy respect redundant_metadata property
The redundant_metadata setting in ZFS allows users to trade resilience for performance and space savings. This applies to all data and metadata blocks in ZFS, with one exception: gang blocks. Gang blocks currently just take the copies property of the IO being ganged and, if it's 1, set it to 2. This means that we always make at least two copies of a gang header, which is good for resilience. However, if users care more about performance than resilience, their gang blocks will be even more of a penalty than usual. We add logic to calculate the number of gang header copies directly, and store it as a separate IO property. This is stored in the IO properties and not calculated when we decide to gang because by that point we may not have easy access to the relevant information about what kind of block is being stored. We also check the redundant_metadata property when doing so, and use that to decide whether to store an extra copy of the gang headers, compared to the underlying blocks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Paul Dagnelie <[email protected]>
1 parent 62a9d37 commit 9341713

File tree

15 files changed

+327
-20
lines changed

15 files changed

+327
-20
lines changed

cmd/zdb/zdb.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2544,12 +2544,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
25442544

25452545
blkbuf[0] = '\0';
25462546

2547-
for (i = 0; i < ndvas; i++)
2547+
for (i = 0; i < ndvas; i++) {
25482548
(void) snprintf(blkbuf + strlen(blkbuf),
2549-
buflen - strlen(blkbuf), "%llu:%llx:%llx ",
2549+
buflen - strlen(blkbuf), "%llu:%llx:%llx%s ",
25502550
(u_longlong_t)DVA_GET_VDEV(&dva[i]),
25512551
(u_longlong_t)DVA_GET_OFFSET(&dva[i]),
2552-
(u_longlong_t)DVA_GET_ASIZE(&dva[i]));
2552+
(u_longlong_t)DVA_GET_ASIZE(&dva[i]),
2553+
(DVA_GET_GANG(&dva[i]) ? "G" : ""));
2554+
}
25532555

25542556
if (BP_IS_HOLE(bp)) {
25552557
(void) snprintf(blkbuf + strlen(blkbuf),
@@ -8980,7 +8982,7 @@ zdb_read_block(char *thing, spa_t *spa)
89808982

89818983
DVA_SET_VDEV(&dva[0], vd->vdev_id);
89828984
DVA_SET_OFFSET(&dva[0], offset);
8983-
DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH));
8985+
DVA_SET_GANG(&dva[0], 0);
89848986
DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize));
89858987

89868988
BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
@@ -8995,7 +8997,7 @@ zdb_read_block(char *thing, spa_t *spa)
89958997
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
89968998

89978999
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
8998-
zio = zio_root(spa, NULL, NULL, 0);
9000+
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
89999001

90009002
if (vd == vd->vdev_top) {
90019003
/*

include/sys/dbuf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ typedef struct dbuf_dirty_record {
173173
arc_buf_t *dr_data;
174174
override_states_t dr_override_state;
175175
uint8_t dr_copies;
176+
uint8_t dr_gang_copies;
176177
boolean_t dr_nopwrite;
177178
boolean_t dr_brtwrite;
178179
boolean_t dr_diowrite;

include/sys/zio.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ typedef struct zio_prop {
349349
uint8_t zp_complevel;
350350
uint8_t zp_level;
351351
uint8_t zp_copies;
352+
uint8_t zp_gang_copies;
352353
dmu_object_type_t zp_type;
353354
boolean_t zp_dedup;
354355
boolean_t zp_dedup_verify;
@@ -574,7 +575,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
574575
zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
575576

576577
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
577-
boolean_t nopwrite, boolean_t brtwrite);
578+
int gang_copies, boolean_t nopwrite, boolean_t brtwrite);
578579

579580
extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
580581

module/zfs/arc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6907,6 +6907,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
69076907
localprop.zp_nopwrite = B_FALSE;
69086908
localprop.zp_copies =
69096909
MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
6910+
localprop.zp_gang_copies =
6911+
MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1);
69106912
}
69116913
zio_flags |= ZIO_FLAG_RAW;
69126914
} else if (ARC_BUF_COMPRESSED(buf)) {

module/zfs/dbuf.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5351,8 +5351,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
53515351
mutex_enter(&db->db_mtx);
53525352
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
53535353
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
5354-
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
5355-
dr->dt.dl.dr_brtwrite);
5354+
dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies,
5355+
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
53565356
mutex_exit(&db->db_mtx);
53575357
} else if (data == NULL) {
53585358
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||

module/zfs/dmu.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1915,6 +1915,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
19151915
dr->dt.dl.dr_overridden_by = *zio->io_bp;
19161916
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
19171917
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1918+
dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
19181919

19191920
/*
19201921
* Old style holes are filled with all zeros, whereas
@@ -2321,6 +2322,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
23212322
boolean_t dedup_verify = os->os_dedup_verify;
23222323
boolean_t encrypt = B_FALSE;
23232324
int copies = os->os_copies;
2325+
int gang_copies = os->os_copies;
23242326

23252327
/*
23262328
* We maintain different write policies for each of the following
@@ -2353,15 +2355,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
23532355
switch (os->os_redundant_metadata) {
23542356
case ZFS_REDUNDANT_METADATA_ALL:
23552357
copies++;
2358+
gang_copies++;
23562359
break;
23572360
case ZFS_REDUNDANT_METADATA_MOST:
23582361
if (level >= zfs_redundant_metadata_most_ditto_level ||
23592362
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
23602363
copies++;
2364+
if (level + 1 >=
2365+
zfs_redundant_metadata_most_ditto_level ||
2366+
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
2367+
gang_copies++;
23612368
break;
23622369
case ZFS_REDUNDANT_METADATA_SOME:
2363-
if (DMU_OT_IS_CRITICAL(type))
2370+
if (DMU_OT_IS_CRITICAL(type)) {
23642371
copies++;
2372+
gang_copies++;
2373+
} else if (DMU_OT_IS_METADATA(type)) {
2374+
gang_copies++;
2375+
}
23652376
break;
23662377
case ZFS_REDUNDANT_METADATA_NONE:
23672378
break;
@@ -2435,6 +2446,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24352446
nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
24362447
ZCHECKSUM_FLAG_NOPWRITE) &&
24372448
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2449+
2450+
if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2451+
(os->os_redundant_metadata ==
2452+
ZFS_REDUNDANT_METADATA_MOST &&
2453+
zfs_redundant_metadata_most_ditto_level <= 1))
2454+
gang_copies++;
24382455
}
24392456

24402457
/*
@@ -2451,6 +2468,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24512468

24522469
if (DMU_OT_IS_ENCRYPTED(type)) {
24532470
copies = MIN(copies, SPA_DVAS_PER_BP - 1);
2471+
gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
24542472
nopwrite = B_FALSE;
24552473
} else {
24562474
dedup = B_FALSE;
@@ -2468,6 +2486,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
24682486
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
24692487
zp->zp_level = level;
24702488
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2489+
zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
24712490
zp->zp_dedup = dedup;
24722491
zp->zp_dedup_verify = dedup && dedup_verify;
24732492
zp->zp_nopwrite = nopwrite;

module/zfs/dmu_recv.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2299,6 +2299,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa)
22992299
zp.zp_nopwrite = B_FALSE;
23002300
zp.zp_copies = MIN(zp.zp_copies,
23012301
SPA_DVAS_PER_BP - 1);
2302+
zp.zp_gang_copies =
2303+
MIN(zp.zp_gang_copies,
2304+
SPA_DVAS_PER_BP - 1);
23022305
}
23032306
zio_flags |= ZIO_FLAG_RAW;
23042307
} else if (DRR_WRITE_COMPRESSED(drrw)) {

module/zfs/zio.c

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,8 +1404,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
14041404
}
14051405

14061406
void
1407-
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
1408-
boolean_t brtwrite)
1407+
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies,
1408+
boolean_t nopwrite, boolean_t brtwrite)
14091409
{
14101410
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
14111411
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
@@ -1422,6 +1422,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
14221422
zio->io_prop.zp_nopwrite = nopwrite;
14231423
zio->io_prop.zp_brtwrite = brtwrite;
14241424
zio->io_prop.zp_copies = copies;
1425+
zio->io_prop.zp_gang_copies = gang_copies;
14251426
zio->io_bp_override = bp;
14261427
}
14271428

@@ -3130,15 +3131,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31303131
boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
31313132

31323133
/*
3133-
* If one copy was requested, store 2 copies of the GBH, so that we
3134-
* can still traverse all the data (e.g. to free or scrub) even if a
3135-
* block is damaged. Note that we can't store 3 copies of the GBH in
3136-
* all cases, e.g. with encryption, which uses DVA[2] for the IV+salt.
3134+
* Store multiple copies of the GBH, so that we can still traverse
3135+
* all the data (e.g. to free or scrub) even if a block is damaged.
3136+
* This value respects the redundant_metadata property.
31373137
*/
3138-
int gbh_copies = copies;
3139-
if (gbh_copies == 1) {
3140-
gbh_copies = MIN(2, spa_max_replication(spa));
3141-
}
3138+
int gbh_copies = gio->io_prop.zp_gang_copies;
3139+
ASSERT3S(gbh_copies, >, 0);
3140+
ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP);
31423141

31433142
ASSERT(ZIO_HAS_ALLOCATOR(pio));
31443143
int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
@@ -3158,6 +3157,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
31583157
* since metaslab_class_throttle_reserve() always allows
31593158
* additional reservations for gang blocks.
31603159
*/
3160+
ASSERT3U(gbh_copies, >=, copies);
31613161
VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
31623162
pio->io_allocator, pio, flags));
31633163
}
@@ -3220,6 +3220,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
32203220
zp.zp_type = zp.zp_storage_type = DMU_OT_NONE;
32213221
zp.zp_level = 0;
32223222
zp.zp_copies = gio->io_prop.zp_copies;
3223+
zp.zp_gang_copies = gio->io_prop.zp_gang_copies;
32233224
zp.zp_dedup = B_FALSE;
32243225
zp.zp_dedup_verify = B_FALSE;
32253226
zp.zp_nopwrite = B_FALSE;
@@ -3934,7 +3935,7 @@ zio_ddt_write(zio_t *zio)
39343935
* grow the DDT entry by to satisfy the request.
39353936
*/
39363937
zio_prop_t czp = *zp;
3937-
czp.zp_copies = need_dvas;
3938+
czp.zp_copies = czp.zp_gang_copies = need_dvas;
39383939
zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
39393940
zio->io_orig_size, zio->io_orig_size, &czp,
39403941
zio_ddt_child_write_ready, NULL,

tests/runfiles/common.run

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,10 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
723723
'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
724724
tags = ['functional', 'features', 'large_dnode']
725725

726+
[tests/functional/gang_blocks]
727+
tests = ['gang_blocks_redundant']
728+
tags = ['functional', 'gang_blocks']
729+
726730
[tests/functional/grow]
727731
pre =
728732
post =

tests/zfs-tests/include/tunables.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting
6262
MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds
6363
METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load
6464
METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging
65+
METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct
6566
MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals
6667
MULTIHOST_HISTORY multihost.history zfs_multihost_history
6768
MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals

0 commit comments

Comments
 (0)