From f9d59b579e5100949ee35d5df5a9488220031f2c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 14 Jun 2025 09:37:18 -0400 Subject: [PATCH 01/61] ZIL: Relax parallel write ZIOs processing ZIL introduced dependencies between its write ZIOs to permit flush defer, when we flush vdev caches only once all the write ZIOs have completed. But it was recently spotted that it serializes not only ZIO completions handling, but also their ready stage. It means ZIO pipeline can't calculate checksums for the following ZIOs until all the previous are checksummed, even though it is not required. On systems where memory throughput of a single CPU core is limited, it creates a single-core CPU bottleneck, which is difficult to see due to ZIO pipeline design with many taskqueue threads. While it would be great to bypass the ready stage waits, it would require changes to ZIO code, and I haven't found a clean way to do it. But I've noticed that we don't need any dependency between the write ZIOs if the previous one has some waiters, which means it won't defer any flushes and work as a barrier for the earlier ones. Bypassing it won't help large single-thread writes, since all the write ZIOs except the last in that case won't have waiters, and so will be dependent. But in that case the ZIO processing might not be a bottleneck, since there will be only one thread populating the write buffers, that will likely be the bottleneck. But bypassing the ZIO dependency on multi-threaded write workloads really allows them to scale beyond the checksumming throughput of one CPU core. My tests with writing 12 files on a same dataset on a pool with 4 striped NVMes as SLOGs from 12 threads with 1MB blocks on a system with Xeon Silver 4114 CPU show total throughput increase from 4.3GB/s to 8.5GB/s, increasing the SLOGs busy from ~30% to ~70%. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #17458 --- module/zfs/zil.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ac271d398155..88567ce30cd3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1691,7 +1691,7 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of - * how we can defer the flush commands for each lwb. + * how we can defer the flush commands for any lwb without waiters. * * When the flush commands are deferred, the previous lwb will rely on * this lwb to flush the vdevs written to by that previous lwb. Thus, @@ -1708,7 +1708,10 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); - zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); + if (list_is_empty(&prev_lwb->lwb_waiters)) { + zio_add_child(lwb->lwb_write_zio, + prev_lwb->lwb_write_zio); + } } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } From 661310ff5cc89534dcdd77f6f11f62823e31b67f Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 13 Mar 2025 10:47:03 -0700 Subject: [PATCH 02/61] FDT dedup log sync -- remove incremental This PR condenses the FDT dedup log syncing into a single sync pass. This reduces the overhead of modifying indirect blocks for the dedup table multiple times per txg. In addition, changes were made to the formula for how much to sync per txg. We now also consider the backlog we have to clear, to prevent it from growing too large, or remaining large on an idle system. Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Authored-by: Don Brady Authored-by: Paul Dagnelie Signed-off-by: Paul Dagnelie Closes #17038 --- include/sys/ddt.h | 7 +- include/sys/vdev.h | 1 + include/sys/zfs_debug.h | 1 + man/man4/zfs.4 | 83 +++-- module/zfs/ddt.c | 342 +++++++++--------- module/zfs/vdev_queue.c | 10 + tests/runfiles/common.run | 4 +- tests/zfs-tests/include/tunables.cfg | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/dedup/dedup_fdt_pacing.ksh | 109 ++++++ .../tests/functional/dedup/dedup_prune.ksh | 4 +- .../tests/functional/dedup/dedup_quota.ksh | 3 + .../functional/dedup/dedup_zap_shrink.ksh | 3 + 13 files changed, 367 insertions(+), 203 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh diff --git a/include/sys/ddt.h b/include/sys/ddt.h index d2fef16c9a05..8bdd7ca3a860 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -286,14 +286,11 @@ typedef struct { ddt_log_t *ddt_log_active; /* pointers into ddt_log */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ - hrtime_t ddt_flush_start; /* log flush start this txg */ - uint32_t ddt_flush_pass; /* log flush pass this txg */ - - int32_t ddt_flush_count; /* entries flushed this txg */ - int32_t ddt_flush_min; /* min rem entries to flush */ int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ int32_t ddt_log_flush_rate; /* rolling log flush rate */ int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ + uint32_t ddt_log_flush_pressure; /* pressure to apply for cap */ + uint32_t ddt_log_flush_prev_backlog; /* prev backlog size */ uint64_t ddt_flush_force_txg; /* flush hard before this txg */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index de6e5eb1235a..5aad22dba6b3 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -173,6 +173,7 @@ extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern uint32_t vdev_queue_length(vdev_t *vd); 
extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); +extern boolean_t vdev_queue_pool_busy(spa_t *spa); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index dc1ec06688f8..871936da15f6 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -60,6 +60,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) #define ZFS_DEBUG_BRT (1 << 14) #define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15) +#define ZFS_DEBUG_DDT (1 << 16) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 2523ce266358..730236481b55 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1057,27 +1057,6 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . -.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint -Maximum number of dedup log flush passes (iterations) each transaction. -.Pp -At the start of each transaction, OpenZFS will estimate how many entries it -needs to flush out to keep up with the change rate, taking the amount and time -taken to flush on previous txgs into account (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -It will spread this amount into a number of passes. -At each pass, it will use the amount already flushed and the total time taken -by flushing and by other IO to recompute how much it should do for the remainder -of the txg. -.Pp -Reducing the max number of passes will make flushing more aggressive, flushing -out more entries on each pass. -This can be faster, but also more likely to compete with other IO. 
-Increasing the max number of passes will put fewer entries onto each pass, -keeping the overhead of dedup changes to a minimum but possibly causing a large -number of changes to be dumped on the last pass, which can blow out the txg -sync time beyond -.Sy zfs_txg_timeout . -. .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint Minimum time to spend on dedup log flush each transaction. .Pp @@ -1087,22 +1066,58 @@ up to This occurs even if doing so would delay the transaction, that is, other IO completes under this time. . -.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp -OpenZFS will estimate how many entries it needs to flush each transaction to -keep up with the ingest rate (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -This sets the minimum for that estimate. -Raising it can force OpenZFS to flush more aggressively, keeping the log small -and so reducing pool import times, but can make it less able to back off if -log flushing would compete with other IO too much. -. +OpenZFS will flush a fraction of the log every TXG, to keep the size +proportional to the ingest rate (see +.Sy zfs_dedup_log_flush_txgs ) . +This sets the minimum for that estimate, which prevents the backlog from +completely draining if the ingest rate falls. +Raising it can force OpenZFS to flush more aggressively, reducing the backlog +to zero more quickly, but can make it less able to back off if log +flushing would compete with other IO too much. +. +.It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint +Flush at most this many entries each transaction. +.Pp +Mostly used for debugging purposes. +.It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint +Target number of TXGs to process the whole dedup log. +.Pp +Every TXG, OpenZFS will process the inverse of this number times the size +of the DDT backlog. 
+This will keep the backlog at a size roughly equal to the ingest rate +times this value. +This offers a balance between a more efficient DDT log, with better +aggregation, and shorter import times, which increase as the size of the +DDT log increases. +Increasing this value will result in a more efficient DDT log, but longer +import times. +.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint +Soft cap for the size of the current dedup log. +.Pp +If the log is larger than this size, we increase the aggressiveness of +the flushing to try to bring it back down to the soft cap. +Setting it will reduce import times, but will reduce the efficiency of +the DDT log, increasing the expected number of IOs required to flush the same +amount of data. +.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint +Whether to treat the log cap as a firm cap or not. +.Pp +When set to 0 (the default), the +.Sy zfs_dedup_log_cap +will increase the maximum number of log entries we flush in a given txg. +This will bring the backlog size down towards the cap, but not at the expense +of making TXG syncs take longer. +If this is set to 1, the cap acts more like a hard cap than a soft cap; it will +also increase the minimum number of log entries we flush per TXG. +Enabling it will reduce worst-case import times, at the cost of increased TXG +sync times. .It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint Number of transactions to use to compute the flow rate. .Pp -OpenZFS will estimate how many entries it needs to flush each transaction by -monitoring the number of entries changed (ingest rate), number of entries +OpenZFS will estimate number of entries changed (ingest rate), number of entries flushed (flush rate) and time spent flushing (flush time rate) and combining these into an overall "flow rate". 
It will use an exponential weighted moving average over some number of recent @@ -1638,6 +1653,10 @@ _ 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. + 8192 ZFS_DEBUG_METASLAB_ALLOC Enable debugging messages when allocations fail. + 16384 ZFS_DEBUG_BRT Enable BRT-related debugging messages. + 32768 ZFS_DEBUG_RAIDZ_RECONSTRUCT Enabled debugging messages for raidz reconstruction. + 65536 ZFS_DEBUG_DDT Enable DDT-related debugging messages. .TE .Sy \& * No Requires debug build . . diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 2fce4f393c38..2c3040fe656c 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -250,11 +250,6 @@ static uint32_t zfs_ddt_prunes_per_txg = 50000; boolean_t ddt_prune_artificial_age = B_FALSE; boolean_t ddt_dump_prune_histogram = B_FALSE; -/* - * Don't do more than this many incremental flush passes per txg. - */ -uint_t zfs_dedup_log_flush_passes_max = 8; - /* * Minimum time to flush per txg. */ @@ -263,7 +258,32 @@ uint_t zfs_dedup_log_flush_min_time_ms = 1000; /* * Minimum entries to flush per txg. */ -uint_t zfs_dedup_log_flush_entries_min = 1000; +uint_t zfs_dedup_log_flush_entries_min = 200; + +/* + * Target number of TXGs until the whole dedup log has been flushed. + * The log size will float around this value times the ingest rate. + */ +uint_t zfs_dedup_log_flush_txgs = 100; + +/* + * Maximum entries to flush per txg. Used for testing the dedup log. + */ +uint_t zfs_dedup_log_flush_entries_max = UINT_MAX; + +/* + * Soft cap for the size of the current dedup log. If the log is larger + * than this size, we slightly increase the aggressiveness of the flushing to + * try to bring it back down to the soft cap. 
+ */ +uint_t zfs_dedup_log_cap = UINT_MAX; + +/* + * If this is set to B_TRUE, the cap above acts more like a hard cap: + * flushing is significantly more aggressive, increasing the minimum amount we + * flush per txg, as well as the maximum. + */ +boolean_t zfs_dedup_log_hard_cap = B_FALSE; /* * Number of txgs to average flow rates across. @@ -1600,6 +1620,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_log_flush_pressure = 10; ddt_log_alloc(ddt); ddt_table_alloc_kstats(ddt); @@ -2013,146 +2034,6 @@ _ewma(int32_t val, int32_t prev, uint32_t weight) return (new); } -/* Returns true if done for this txg */ -static boolean_t -ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) -{ - if (ddt->ddt_flush_pass == 0) { - if (spa_sync_pass(ddt->ddt_spa) == 1) { - /* First run this txg, get set up */ - ddt->ddt_flush_start = gethrtime(); - ddt->ddt_flush_count = 0; - - /* - * How many entries we need to flush. We want to at - * least match the ingest rate. - */ - ddt->ddt_flush_min = MAX( - ddt->ddt_log_ingest_rate, - zfs_dedup_log_flush_entries_min); - - /* - * If we've been asked to flush everything in a hurry, - * try to dump as much as possible on this txg. In - * this case we're only limited by time, not amount. - */ - if (ddt->ddt_flush_force_txg > 0) - ddt->ddt_flush_min = - MAX(ddt->ddt_flush_min, avl_numnodes( - &ddt->ddt_log_flushing->ddl_tree)); - } else { - /* We already decided we're done for this txg */ - return (B_FALSE); - } - } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { - /* - * We already did some flushing on this pass, skip it. This - * happens when dsl_process_async_destroys() runs during a scan - * (on pass 1) and does an additional ddt_sync() to update - * freed blocks. 
- */ - return (B_FALSE); - } - - if (spa_sync_pass(ddt->ddt_spa) > - MAX(zfs_dedup_log_flush_passes_max, 1)) { - /* Too many passes this txg, defer until next. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* Nothing to flush, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? - MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), - SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); - - uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; - - if (elapsed_time >= target_time) { - /* Too long since we started, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - ddt->ddt_flush_pass++; - ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); - - /* - * Estimate how much time we'll need to flush the remaining entries - * based on how long it normally takes. - */ - uint32_t want_time; - if (ddt->ddt_flush_pass == 1) { - /* First pass, use the average time/entries */ - if (ddt->ddt_log_flush_rate == 0) - /* Zero rate, just assume the whole time */ - want_time = target_time; - else - want_time = ddt->ddt_flush_min * - ddt->ddt_log_flush_time_rate / - ddt->ddt_log_flush_rate; - } else { - /* Later pass, calculate from this txg so far */ - want_time = ddt->ddt_flush_min * - elapsed_time / ddt->ddt_flush_count; - } - - /* Figure out how much time we have left */ - uint32_t remain_time = target_time - elapsed_time; - - /* Smear the remaining entries over the remaining passes. */ - uint32_t nentries = ddt->ddt_flush_min / - (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); - if (want_time > remain_time) { - /* - * We're behind; try to catch up a bit by doubling the amount - * this pass. If we're behind that means we're in a later - * pass and likely have most of the remaining time to - * ourselves. 
If we're in the last couple of passes, then - * doubling might just take us over the timeout, but probably - * not be much, and it stops us falling behind. If we're - * in the middle passes, there'll be more to do, but it - * might just help us catch up a bit and we'll recalculate on - * the next pass anyway. - */ - nentries = MIN(ddt->ddt_flush_min, nentries*2); - } - - ddt_lightweight_entry_t ddlwe; - uint32_t count = 0; - while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { - ddt_sync_flush_entry(ddt, &ddlwe, - ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); - - /* End this pass if we've synced as much as we need to. */ - if (++count >= nentries) - break; - } - ddt->ddt_flush_count += count; - ddt->ddt_flush_min -= count; - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* We emptied it, so truncate on-disk */ - DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); - ddt_log_truncate(ddt, tx); - /* No more passes needed this txg */ - ddt->ddt_flush_pass = 0; - } else { - /* More to do next time, save checkpoint */ - DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); - ddt_log_checkpoint(ddt, &ddlwe, tx); - } - - ddt_sync_update_stats(ddt, tx); - - return (ddt->ddt_flush_pass == 0); -} - static inline void ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) { @@ -2190,19 +2071,135 @@ ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { + spa_t *spa = ddt->ddt_spa; ASSERT(avl_is_empty(&ddt->ddt_tree)); - /* Don't do any flushing when the pool is ready to shut down */ - if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) + /* + * Don't do any flushing when the pool is ready to shut down, or in + * passes beyond the first. + */ + if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa)) return; - /* Try to flush some. 
*/ - if (!ddt_sync_flush_log_incremental(ddt, tx)) - /* More to do next time */ - return; + hrtime_t flush_start = gethrtime(); + uint32_t count = 0; + + /* + * How many entries we need to flush. We need to at + * least match the ingest rate, and also consider the + * current backlog of entries. + */ + uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) + + avl_numnodes(&ddt->ddt_log_active->ddl_tree); + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) + goto housekeeping; + + uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs); + uint64_t cap = MAX(1, zfs_dedup_log_cap); + uint64_t flush_min = MAX(backlog / txgs, + zfs_dedup_log_flush_entries_min); + + /* + * The theory for this block is that if we increase the pressure while + * we're growing above the cap, and remove it when we're significantly + * below the cap, we'll stay near cap while not bouncing around too + * much. + * + * The factor of 10 is to smooth the pressure effect by expressing it + * in tenths. The addition of the cap to the backlog in the second + * block is to round up, instead of down. We never let the pressure go + * below 1 (10 tenths). + */ + if (cap != UINT_MAX && backlog > cap && + backlog > ddt->ddt_log_flush_prev_backlog) { + ddt->ddt_log_flush_pressure += 10 * backlog / cap; + } else if (cap != UINT_MAX && backlog < cap) { + ddt->ddt_log_flush_pressure -= + 11 - (((10 * backlog) + cap - 1) / cap); + ddt->ddt_log_flush_pressure = + MAX(ddt->ddt_log_flush_pressure, 10); + } + + if (zfs_dedup_log_hard_cap && cap != UINT_MAX) + flush_min = MAX(flush_min, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + + uint64_t flush_max; + + /* + * If we've been asked to flush everything in a hurry, + * try to dump as much as possible on this txg. In + * this case we're only limited by time, not amount. + * + * Otherwise, if we are over the cap, try to get back down to it. 
+ * + * Finally if there is no cap (or no pressure), just set the max a + * little higher than the min to help smooth out variations in flush + * times. + */ + if (ddt->ddt_flush_force_txg > 0) + flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap) + flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + else + flush_max = flush_min * 5 / 4; + flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max); + + /* + * When the pool is busy or someone is explicitly waiting for this txg + * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use + * half of the time in the txg timeout. + */ + uint64_t target_time; + + if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) || + vdev_queue_pool_busy(spa)) { + target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout) / 2); + } else { + target_time = SEC2NSEC(zfs_txg_timeout) / 2; + } + + ddt_lightweight_entry_t ddlwe; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End if we've synced as much as we needed to. */ + if (++count >= flush_max) + break; + + /* + * As long as we've flushed the absolute minimum, + * stop if we're way over our target time. + */ + uint64_t diff = gethrtime() - flush_start; + if (count > zfs_dedup_log_flush_entries_min && + diff >= target_time * 2) + break; - /* No more flushing this txg, so we can do end-of-txg housekeeping */ + /* + * End if we've passed the minimum flush and we're out of time. 
+ */ + if (count > flush_min && diff >= target_time) + break; + } + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); + ddt_log_truncate(ddt, tx); + } else { + /* More to do next time, save checkpoint */ + DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); + ddt_log_checkpoint(ddt, &ddlwe, tx); + } + + ddt_sync_update_stats(ddt, tx); + +housekeeping: if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { /* @@ -2219,12 +2216,13 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) /* If force flush is no longer necessary, turn it off. */ ddt_flush_force_update_txg(ddt, 0); + ddt->ddt_log_flush_prev_backlog = backlog; + /* - * Update flush rate. This is an exponential weighted moving average of - * the number of entries flushed over recent txgs. + * Update flush rate. This is an exponential weighted moving + * average of the number of entries flushed over recent txgs. */ - ddt->ddt_log_flush_rate = _ewma( - ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); @@ -2232,12 +2230,21 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) * Update flush time rate. This is an exponential weighted moving * average of the total time taken to flush over recent txgs. 
*/ - ddt->ddt_log_flush_time_rate = _ewma( - ddt->ddt_log_flush_time_rate, - ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate, + (int32_t)NSEC2MSEC(gethrtime() - flush_start), zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, ddt->ddt_log_flush_time_rate); + if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 && + zfs_flags & ZFS_DEBUG_DDT) { + zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ " + "txg %llu, in %llu ms, flush rate %d, time rate %d", + (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree), + (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree), + count, (u_longlong_t)tx->tx_txg, + (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start), + ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate); + } } static void @@ -2785,14 +2792,23 @@ ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); -ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, - "Max number of incremental dedup log flush passes per transaction"); - ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, "Min time to spend on incremental dedup log flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, "Min number of log entries to flush each transaction"); +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW, + "Max number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, + "Number of TXGs to try to rotate the log in"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, + "Soft cap for the size of the current dedup log"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW, + "Whether to use the soft cap as a hard cap"); + 
ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, "Number of txgs to average flow rates across"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e85e3ec4b9f4..393fb9515d07 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -1050,6 +1050,16 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) mutex_exit(&vq->vq_lock); } +boolean_t +vdev_queue_pool_busy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + + return (dp->dp_dirty_total > min_bytes); +} + /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index b668623b1836..21c2422cc7f5 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -683,8 +683,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', - 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', + 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota', 'dedup_prune', 'dedup_zap_shrink'] pre = post = diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..0a546dd44553 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -32,6 +32,8 @@ DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max +DEDUP_LOG_FLUSH_ENTRIES_MAX dedup.log_flush_entries_max zfs_dedup_log_flush_entries_max 
+DEDUP_LOG_FLUSH_ENTRIES_MIN dedup.log_flush_entries_min zfs_dedup_log_flush_entries_min DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index aa7e7c79faab..3783fad7e59c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1442,6 +1442,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/dedup/setup.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_fdt_pacing.ksh \ functional/dedup/dedup_legacy_create.ksh \ functional/dedup/dedup_legacy_import.ksh \ functional/dedup/dedup_legacy_fdt_upgrade.ksh \ diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh new file mode 100755 index 000000000000..8cbc93d6eb74 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +# Ensure dedup log flushes are appropriately paced + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) paces out log entries appropriately" + +function get_ddt_log_entries +{ + zdb -D $TESTPOOL | grep -- "-log-sha256-" | sed 's/.*entries=//' | \ + awk '{sum += $1} END {print sum}' +} + +function cleanup +{ + if poolexists $TESTPOOL; then + destroy_pool $TESTPOOL + fi + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +} + +log_onexit cleanup + +# Create a pool with fast dedup enabled. We disable block cloning to ensure +# it doesn't get in the way of dedup. +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -o feature@block_cloning=disabled \ + $TESTPOOL $DISKS + +# Create a filesystem with a small recordsize so that we get more DDT entries, +# disable compression so our writes create predictable results on disk, and +# use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zfs create \ + -o dedup=on \ + -o compression=off \ + -o xattr=sa \ + -o checksum=sha256 \ + -o recordsize=4k $TESTPOOL/fs + +# Set the dedup log to only flush a single entry per txg. +# It's hard to guarantee that exactly one flush will happen per txg, or that +# we don't miss a txg due to weird latency or anything, so we build some +# wiggle room into subsequent checks. + +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1 + +# Create a file. This is 256 full blocks, so will produce 256 entries in the +# dedup log. +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file1 bs=128k count=8 +sync_pool + +# Verify there are at least 240 entries in the dedup log. 
+log_entries=$(get_ddt_log_entries) +[[ "$log_entries" -gt 240 ]] || \ + log_fail "Fewer than 240 entries in dedup log: $log_entries" + +# Wait for 5 TXGs to sync. +for i in `seq 1 5`; do + sync_pool +done + +# Verify there are at least 220 entries in the dedup log. +log_entries2=$(get_ddt_log_entries) +[[ $((log_entries - log_entries2)) -lt 20 ]] || \ + log_fail "Too many entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" +[[ $((log_entries - log_entries2)) -gt 5 ]] || \ + log_fail "Too few entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" + +# Set the log flush rate high enough to clear the whole list. +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1024 +sync_pool + +# Verify there are 0 entries in the dedup log. +log_entries3=$(get_ddt_log_entries) +[[ "$log_entries3" -eq 0 ]] || \ + log_fail "Entries still present in dedup log: $log_entries3" + +# Verify there are 256 entries in the unique table. +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=256'" + +log_pass "dedup (FDT) paces out log entries appropriately" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh index 9568f2157db3..6b4937cc4a2c 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_prune.ksh @@ -48,13 +48,15 @@ log_assert "Verify DDT pruning correctly removes non-duplicate entries" # entries appear in the DDT ZAP log_must save_tunable DEDUP_LOG_TXG_MAX log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 - +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 function cleanup { if poolexists $TESTPOOL ; then destroy_pool $TESTPOOL fi log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN } function ddt_entries diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh 
b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 3702a806e709..764fbfa4cd6a 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -55,6 +55,8 @@ POOL="dedup_pool" # where things appear on-disk log_must save_tunable DEDUP_LOG_TXG_MAX log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 function cleanup { @@ -63,6 +65,7 @@ function cleanup fi log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN } diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh index af2803c95396..597bad253ec3 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_zap_shrink.ksh @@ -44,6 +44,8 @@ log_assert "Create a large number of entries in the DDT. " \ # entries appear in the DDT ZAP log_must save_tunable DEDUP_LOG_TXG_MAX log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 function cleanup { @@ -51,6 +53,7 @@ function cleanup destroy_pool $TESTPOOL fi log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN } log_onexit cleanup From a4e775d2ca36bccf0e260506a5b3def7a6021ee4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 18 Mar 2025 21:25:50 -0400 Subject: [PATCH 03/61] Some arc_release() cleanup - Don't drop L2ARC header if we have more buffers in this header. Since we leave them the header, leave them the L2ARC header also. Honestly we are not required to drop it even if there are no other buffers, but then we'd need to allocate it a separate header, which we might drop soon if the old block is really deleted. 
Multiple buffers in a header likely mean active snapshots or dedup, so we know that the block in L2ARC will remain valid. It might be rare, but why not? - Remove some impossible assertions and conditions. Reviewed-by: Tony Hutter Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17126 --- module/zfs/arc.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 75be2b02a7e5..1f350b3fe97c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6627,27 +6627,11 @@ arc_release(arc_buf_t *buf, const void *tag) arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); + ASSERT3P(state, !=, arc_l2c_only); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); - if (HDR_HAS_L2HDR(hdr)) { - mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); - - /* - * We have to recheck this conditional again now that - * we're holding the l2ad_mtx to prevent a race with - * another thread which might be concurrently calling - * l2arc_evict(). In that case, l2arc_evict() might have - * destroyed the header's L2 portion as we were waiting - * to acquire the l2ad_mtx. - */ - if (HDR_HAS_L2HDR(hdr)) - arc_hdr_l2hdr_destroy(hdr); - - mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); - } - /* * Do we have more than one buf? 
*/ @@ -6659,10 +6643,6 @@ arc_release(arc_buf_t *buf, const void *tag) boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); - VERIFY3U(hdr->b_type, ==, type); - - ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); - VERIFY3S(remove_reference(hdr, tag), >, 0); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); @@ -6670,10 +6650,10 @@ arc_release(arc_buf_t *buf, const void *tag) } /* - * Pull the data off of this hdr and attach it to - * a new anonymous hdr. Also find the last buffer + * Pull the buffer off of this hdr and find the last buffer * in the hdr's buffer list. */ + VERIFY3S(remove_reference(hdr, tag), >, 0); arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); @@ -6682,7 +6662,6 @@ arc_release(arc_buf_t *buf, const void *tag) * buffer, then we must stop sharing that block. */ if (ARC_BUF_SHARED(buf)) { - ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(!arc_buf_is_shared(lastbuf)); /* @@ -6704,7 +6683,6 @@ arc_release(arc_buf_t *buf, const void *tag) abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } - VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end @@ -6720,18 +6698,10 @@ arc_release(arc_buf_t *buf, const void *tag) } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); - ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); - if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { - ASSERT3P(state, !=, arc_l2c_only); - (void) zfs_refcount_remove_many( - &state->arcs_esize[type], - arc_buf_size(buf), buf); - } - arc_cksum_verify(buf); arc_buf_unwatch(buf); @@ -6759,6 +6729,15 @@ arc_release(arc_buf_t *buf, const void *tag) /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 
ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + if (HDR_HAS_L2HDR(hdr)) { + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + /* Recheck to prevent race with l2arc_evict(). */ + if (HDR_HAS_L2HDR(hdr)) + arc_hdr_l2hdr_destroy(hdr); + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + } + hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; From 954894ee5373e9400d05b29b8a5499fd66e4e868 Mon Sep 17 00:00:00 2001 From: Mariusz Zaborski Date: Sat, 7 Jun 2025 04:43:10 +0200 Subject: [PATCH 04/61] scrub: generate scrub_finish event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `scn_min_txg` can now be used not only with resilver. Instead of checking `scn_min_txg` to determine whether it’s a resilver or a scrub, simply check which function is defined. Thanks to this change, a scrub_finish event is generated when performing a scrub from the saved txg. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Signed-off-by: Mariusz Zaborski Closes #17432 --- module/zfs/dsl_scan.c | 5 +- tests/zfs-tests/tests/Makefile.am | 1 + ...ol_events_scrub_txg_continue_from_last.ksh | 71 +++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index e10b1a879204..06f0c848a6eb 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -235,6 +235,9 @@ static uint_t zfs_resilver_defer_percent = 10; #define DSL_SCAN_IS_SCRUB(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB) +#define DSL_SCAN_IS_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + /* * Enable/disable the processing of the free_bpobj object. 
*/ @@ -1169,7 +1172,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); - if (scn->scn_phys.scn_min_txg) { + if (DSL_SCAN_IS_RESILVER(scn)) { nvlist_t *aux = fnvlist_alloc(); fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "healing"); diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3783fad7e59c..37a3bac6ca70 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1085,6 +1085,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ + functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh new file mode 100755 index 000000000000..3bcbe9d49ccd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2025, Klara Inc. +# +# This software was developed by +# Mariusz Zaborski +# under sponsorship from Wasabi Technology, Inc. and Klara Inc. + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_events/zpool_events.kshlib + +# +# DESCRIPTION: +# Verify that using “zpool scrub -C” correctly generates events. +# +# STRATEGY: +# 1. Run an initial “zpool scrub” on the test pool to generate a txg. +# 2. Clear existing pool events. +# 3. Run “zpool scrub -C” to scrub from the last txg. +# 4. Capture the event log and confirm it contains both “scrub_start” and +# “scrub_finish” entries. +# + +verify_runnable "global" + +function cleanup +{ + rm -f $EVENTS_FILE +} + +EVENTS_FILE="$TESTDIR/zpool_events.$$" +log_onexit cleanup + +log_assert "Verify scrub -C events." + +# Run an initial “zpool scrub” +log_must zpool scrub -w $TESTPOOL + +# Clear existing pool events. +log_must zpool events -c + +# Generate new scrub events. +log_must zpool scrub -Cw $TESTPOOL + +# Verify events. +log_must eval "zpool events -H > $EVENTS_FILE" +log_must grep "scrub_start" $EVENTS_FILE +log_must grep "scrub_finish" $EVENTS_FILE + +log_pass "Verified scrub -C generate correct events." From 85ce6b8ab29b801b018f2eda0930ffc9d52baff8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 11 Jun 2025 14:13:48 -0400 Subject: [PATCH 05/61] Polish db_rwlock scope dbuf_verify(): Don't need the lock, since we only compare pointers. dbuf_findbp(): Don't need the lock, since aside of unneeded assert we only produce the pointer, but don't de-reference it. 
dnode_next_offset_level(): When working on top level indirection should lock dnode buffer's db_rwlock, since it is our parent. If dnode has no buffer, then it is meta-dnode or one of quotas and we should lock the dataset's ds_bp_rwlock instead. Reviewed-by: Alan Somers Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17441 --- module/zfs/dbuf.c | 17 +++-------------- module/zfs/dnode.c | 11 +++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 51082b57f893..44d6ff639e98 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1185,16 +1185,9 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); - /* - * dnode_grow_indblksz() can make this fail if we don't - * have the parent's rwlock. XXX indblksz no longer - * grows. safe to do this now? - */ - if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { - ASSERT3P(db->db_blkptr, ==, - ((blkptr_t *)db->db_parent->db.db_data + - db->db_blkid % epb)); - } + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && @@ -3391,12 +3384,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } - rw_enter(&(*parentp)->db_rwlock, RW_READER); *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); - if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) - ASSERT(BP_IS_HOLE(*bpp)); - rw_exit(&(*parentp)->db_rwlock); return (0); } else { /* the block is referenced from the dnode */ diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 6f68e76561bc..99f3ca72ab87 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2559,6 +2559,11 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = 0; epb = dn->dn_phys->dn_nblkptr; data = 
dn->dn_phys->dn_blkptr; + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_READER); + else if (dmu_objset_ds(dn->dn_objset) != NULL) + rrw_enter(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock, + RW_READER, FTAG); } else { uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); @@ -2663,6 +2668,12 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (db != NULL) { rw_exit(&db->db_rwlock); dbuf_rele(db, FTAG); + } else { + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + else if (dmu_objset_ds(dn->dn_objset) != NULL) + rrw_exit(&dmu_objset_ds(dn->dn_objset)->ds_bp_rwlock, + FTAG); } return (error); From 7e945a5b3f0184575284877d5a579abfe80df16c Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 19 Jun 2025 06:25:58 -0700 Subject: [PATCH 06/61] Fix other nonrot bugs There are still a variety of bugs involving the vdev_nonrot property that will cause problems if you try to run the test suite with segment-based weighting disabled, and with other things in the weighting code. Parents' nonrot property need to be updated when children are added. When vdevs are expanded and more metaslabs are added, the weights have to be recalculated (since the number of metaslabs is an input to the lba bias function). When opening, faulted or unopenable children should not be considered for whether a vdev is nonrot or not (since the nonrot property is determined during a successful open, this can cause false negatives). And draid spares need to have the nonrot property set correctly. Sponsored-by: Eshtek, creators of HexOS Sponsored-by: Klara, Inc. 
Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Paul Dagnelie Closes #17469 --- module/zfs/vdev.c | 18 ++++++++++++++++++ module/zfs/vdev_draid.c | 1 + 2 files changed, 19 insertions(+) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 8407e17eab03..89e87277967b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -540,6 +540,7 @@ vdev_add_child(vdev_t *pvd, vdev_t *cvd) pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; + pvd->vdev_nonrot &= cvd->vdev_nonrot; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); @@ -1361,6 +1362,7 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; + mvd->vdev_nonrot = cvd->vdev_nonrot; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -1567,6 +1569,18 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) vd->vdev_ms = mspp; vd->vdev_ms_count = newc; + /* + * Weighting algorithms can depend on the number of metaslabs in the + * vdev. In order to ensure that all weights are correct at all times, + * we need to recalculate here. 
+ */ + for (uint64_t m = 0; m < oldc; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + metaslab_recalculate_weight_and_sort(msp); + mutex_exit(&msp->ms_lock); + } + for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* @@ -1948,6 +1962,10 @@ vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) taskq_wait(tq); for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE || + cvd->vdev_state <= VDEV_STATE_FAULTED) + continue; vd->vdev_nonrot &= cvd->vdev_nonrot; } diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index d39a05458fe7..bdb69bb43fe0 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -2482,6 +2482,7 @@ vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vds->vds_draid_vdev = tvd; + vd->vdev_nonrot = tvd->vdev_nonrot; return (0); } From 21d5f257249e69803def77d8fad3b02918b1ad90 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 9 Jul 2025 07:10:00 +0500 Subject: [PATCH 07/61] Validate mountpoint on path-based unmount using statx Use statx to verify that path-based unmounts proceed only if the mountpoint reported by statx matches the MNTTAB entry reported by libzfs, aborting the operation if they differ. Align `zfs umount /path` behavior with `zfs umount dataset`. 
Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #17481 --- cmd/zfs/zfs_main.c | 19 ++++++++++++++ config/user-statx.m4 | 34 ++++++++++++++++++++++++ config/user.m4 | 1 + lib/libspl/include/os/linux/sys/stat.h | 5 ++++ lib/libspl/os/linux/getmntany.c | 36 +++++++++++++++++++++----- 5 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 config/user-statx.m4 diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index c9ebae575aeb..7db2273cd570 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7716,6 +7716,7 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) struct extmnttab entry; const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; ino_t path_inode; + char *zfs_mntpnt, *entry_mntpnt; /* * Search for the given (major,minor) pair in the mount table. @@ -7757,6 +7758,24 @@ unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) goto out; } + /* + * If the filesystem is mounted, check that the mountpoint matches + * the one in the mnttab entry w.r.t. provided path. If it doesn't, + * then we should not proceed further. 
+ */ + entry_mntpnt = strdup(entry.mnt_mountp); + if (zfs_is_mounted(zhp, &zfs_mntpnt)) { + if (strcmp(zfs_mntpnt, entry_mntpnt) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': " + "not an original mountpoint\n"), cmdname, path); + free(zfs_mntpnt); + free(entry_mntpnt); + goto out; + } + free(zfs_mntpnt); + } + free(entry_mntpnt); + if (op == OP_SHARE) { char nfs_mnt_prop[ZFS_MAXPROPLEN]; char smbshare_prop[ZFS_MAXPROPLEN]; diff --git a/config/user-statx.m4 b/config/user-statx.m4 new file mode 100644 index 000000000000..0315f93e0c20 --- /dev/null +++ b/config/user-statx.m4 @@ -0,0 +1,34 @@ +dnl # +dnl # Check for statx() function and STATX_MNT_ID availability +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [ + AC_CHECK_HEADERS([linux/stat.h], + [have_stat_headers=yes], + [have_stat_headers=no]) + + AS_IF([test "x$have_stat_headers" = "xyes"], [ + AC_CHECK_FUNC([statx], [ + AC_DEFINE([HAVE_STATX], [1], [statx() is available]) + + dnl Check for STATX_MNT_ID availability + AC_MSG_CHECKING([for STATX_MNT_ID]) + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([[ + #include + ]], [[ + struct statx stx; + int mask = STATX_MNT_ID; + (void)mask; + (void)stx.stx_mnt_id; + ]]) + ], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available]) + ], [ + AC_MSG_RESULT([no]) + ]) + ]) + ], [ + AC_MSG_WARN([linux/stat.h not found; skipping statx support]) + ]) +]) dnl end AC_DEFUN diff --git a/config/user.m4 b/config/user.m4 index badd920d2b8a..62e59ed94437 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -17,6 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBUUID ZFS_AC_CONFIG_USER_LIBBLKID + ZFS_AC_CONFIG_USER_STATX ]) ZFS_AC_CONFIG_USER_LIBTIRPC ZFS_AC_CONFIG_USER_LIBCRYPTO diff --git a/lib/libspl/include/os/linux/sys/stat.h b/lib/libspl/include/os/linux/sys/stat.h index 488554f4e844..a605af962a6d 100644 --- a/lib/libspl/include/os/linux/sys/stat.h +++ b/lib/libspl/include/os/linux/sys/stat.h @@ -31,6 
+31,11 @@ #include /* for BLKGETSIZE64 */ +#ifdef HAVE_STATX +#include +#include +#endif + /* * Emulate Solaris' behavior of returning the block device size in fstat64(). */ diff --git a/lib/libspl/os/linux/getmntany.c b/lib/libspl/os/linux/getmntany.c index dcdf7b3d6fc9..ee1cdf59b9e5 100644 --- a/lib/libspl/os/linux/getmntany.c +++ b/lib/libspl/os/linux/getmntany.c @@ -85,13 +85,21 @@ _sol_getmntent(FILE *fp, struct mnttab *mgetp) } static int -getextmntent_impl(FILE *fp, struct extmnttab *mp) +getextmntent_impl(FILE *fp, struct extmnttab *mp, uint64_t *mnt_id) { int ret; struct stat64 st; + *mnt_id = 0; ret = _sol_getmntent(fp, (struct mnttab *)mp); if (ret == 0) { +#ifdef HAVE_STATX_MNT_ID + struct statx stx; + if (statx(AT_FDCWD, mp->mnt_mountp, + AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) + *mnt_id = stx.stx_mnt_id; +#endif if (stat64(mp->mnt_mountp, &st) != 0) { mp->mnt_major = 0; mp->mnt_minor = 0; @@ -110,6 +118,12 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) struct stat64 st; FILE *fp; int match; + boolean_t have_mnt_id = B_FALSE; + uint64_t target_mnt_id = 0; + uint64_t entry_mnt_id; +#ifdef HAVE_STATX_MNT_ID + struct statx stx; +#endif if (strlen(path) >= MAXPATHLEN) { (void) fprintf(stderr, "invalid object; pathname too long\n"); @@ -128,6 +142,13 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) return (-1); } +#ifdef HAVE_STATX_MNT_ID + if (statx(AT_FDCWD, path, AT_STATX_SYNC_AS_STAT | AT_SYMLINK_NOFOLLOW, + STATX_MNT_ID, &stx) == 0 && (stx.stx_mask & STATX_MNT_ID)) { + have_mnt_id = B_TRUE; + target_mnt_id = stx.stx_mnt_id; + } +#endif if ((fp = fopen(MNTTAB, "re")) == NULL) { (void) fprintf(stderr, "cannot open %s\n", MNTTAB); @@ -139,12 +160,15 @@ getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) */ match = 0; - while (getextmntent_impl(fp, entry) == 0) { - if (makedev(entry->mnt_major, 
entry->mnt_minor) == - statbuf->st_dev) { - match = 1; - break; + while (getextmntent_impl(fp, entry, &entry_mnt_id) == 0) { + if (have_mnt_id) { + match = (entry_mnt_id == target_mnt_id); + } else { + match = makedev(entry->mnt_major, entry->mnt_minor) == + statbuf->st_dev; } + if (match) + break; } (void) fclose(fp); From acf3871ef8bbe6be9a0bc87f7b00f41f4d8e9b40 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 16 Jul 2025 10:20:57 -0700 Subject: [PATCH 08/61] Correct weight recalculation of space-based metaslabs Currently, after a failed allocation, the metaslab code recalculates the weight for a metaslab. However, for space-based metaslabs, it uses the maximum free segment size instead of the normal weighting algorithm. This is presumably because the normal metaslab weight is (roughly) intended to estimate the size of the largest free segment, but it doesn't do that reliably at most fragmentation levels. This means that recalculated metaslabs are forced to a weight that isn't really using the same units as the rest of them, resulting in undesirable behaviors. We switch this to use the normal space-weighting function. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Sponsored-by: Wasabi Technology, Inc. Sponsored-by: Klara, Inc. Closes #17531 --- module/zfs/metaslab.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 93bbc02f9452..4befa60346bb 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5073,29 +5073,16 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, /* * We were unable to allocate from this metaslab so determine - * a new weight for this metaslab. Now that we have loaded - * the metaslab we can provide a better hint to the metaslab - * selector. - * - * For space-based metaslabs, we use the maximum block size. 
- * This information is only available when the metaslab - * is loaded and is more accurate than the generic free - * space weight that was calculated by metaslab_weight(). - * This information allows us to quickly compare the maximum - * available allocation in the metaslab to the allocation - * size being requested. - * - * For segment-based metaslabs, determine the new weight - * based on the highest bucket in the range tree. We - * explicitly use the loaded segment weight (i.e. the range - * tree histogram) since it contains the space that is - * currently available for allocation and is accurate - * even within a sync pass. + * a new weight for this metaslab. The weight was last + * recalculated either when we loaded it (if this is the first + * TXG it's been loaded in), or the last time a txg was synced + * out. */ uint64_t weight; if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { - weight = metaslab_largest_allocatable(msp); - WEIGHT_SET_SPACEBASED(weight); + metaslab_set_fragmentation(msp, B_TRUE); + weight = metaslab_space_weight(msp) & + ~METASLAB_ACTIVE_MASK; } else { weight = metaslab_weight_from_range_tree(msp); } @@ -5107,13 +5094,6 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, * For the case where we use the metaslab that is * active for another allocator we want to make * sure that we retain the activation mask. - * - * Note that we could attempt to use something like - * metaslab_recalculate_weight_and_sort() that - * retains the activation mask here. That function - * uses metaslab_weight() to set the weight though - * which is not as accurate as the calculations - * above. 
*/ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; metaslab_group_sort(mg, msp, weight); From 347d68048a2a6ce7aba77875bda8afba417d20c8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 17 Jul 2025 18:31:19 -0400 Subject: [PATCH 09/61] ZIL: Force writing of open LWB on suspend Under parallel workloads ZIL may delay writes of open LWBs that are not full enough. On suspend we do not expect anything new to appear since zil_get_commit_list() will not let it pass, only returning TXG number to wait for. But I suspect that waiting for the TXG commit without having the last LWB issued may not wait for its completion, resulting in panic described in #17509. Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17521 --- module/zfs/zil.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 88567ce30cd3..c2d0f45753b6 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2901,19 +2901,14 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); - /* - * Return if there's nothing to commit before we dirty the fs by - * calling zil_create(). - */ - if (list_is_empty(&zilog->zl_itx_commit_list)) - return; - - list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); - list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), - offsetof(zil_commit_waiter_t, zcw_node)); - lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { + /* + * Return if there's nothing to commit before we dirty the fs. 
+ */ + if (list_is_empty(&zilog->zl_itx_commit_list)) + return; + lwb = zil_create(zilog); } else { /* @@ -2941,6 +2936,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } } + list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -3110,7 +3109,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * possible, without significantly impacting the latency * of each individual itx. */ - if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + if (lwb->lwb_state == LWB_STATE_OPENED && + (!zilog->zl_parallel || zilog->zl_suspend > 0)) { zil_burst_done(zilog); list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); From c79d5e4f33d95070f20ac950f8131fe930ae802b Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Fri, 18 Jul 2025 08:45:13 -0700 Subject: [PATCH 10/61] Define sops->free_inode() to prevent use-after-free during lookup On Linux, when doing path lookup with LOOKUP_RCU, dentry and inode can be dereferenced without refcounts and locks. For this reason, dentry and inode must only be freed after RCU grace period. However, zfs currently frees inode in zfs_inode_destroy synchronously and we can't use GPL-only call_rcu() in zfs directly. Fortunately, on Linux 5.2 and after, if we define sops->free_inode(), the kernel will do call_rcu() for us. This issue may be triggered more easily with init_on_free=1 boot parameter: BUG: kernel NULL pointer dereference, address: 0000000000000020 RIP: 0010:selinux_inode_permission+0x10e/0x1c0 Call Trace: ? show_trace_log_lvl+0x1be/0x2d9 ? show_trace_log_lvl+0x1be/0x2d9 ? show_trace_log_lvl+0x1be/0x2d9 ? security_inode_permission+0x37/0x60 ? __die_body.cold+0x8/0xd ? 
no_context+0x113/0x220 ? exc_page_fault+0x6d/0x130 ? asm_exc_page_fault+0x1e/0x30 ? selinux_inode_permission+0x10e/0x1c0 security_inode_permission+0x37/0x60 link_path_walk.part.0.constprop.0+0xb5/0x360 ? path_init+0x27d/0x3c0 path_lookupat+0x3e/0x1a0 filename_lookup+0xc0/0x1d0 ? __check_object_size.part.0+0x123/0x150 ? strncpy_from_user+0x4e/0x130 ? getname_flags.part.0+0x4b/0x1c0 vfs_statx+0x72/0x120 ? ioctl_has_perm.constprop.0.isra.0+0xbd/0x120 __do_sys_newlstat+0x39/0x70 ? __x64_sys_ioctl+0x8d/0xd0 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x62/0xc7 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Chunwei Chen Co-authored-by: Chunwei Chen Closes #17546 --- config/kernel-free-inode.m4 | 24 +++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/zfs/sys/zfs_znode_impl.h | 1 + module/os/linux/zfs/zfs_znode_os.c | 17 ++++++++++++++-- module/os/linux/zfs/zpl_super.c | 12 ++++++++++++ 5 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 config/kernel-free-inode.m4 diff --git a/config/kernel-free-inode.m4 b/config/kernel-free-inode.m4 new file mode 100644 index 000000000000..baa1c34845bb --- /dev/null +++ b/config/kernel-free-inode.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # Linux 5.2 API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE], [ + ZFS_LINUX_TEST_SRC([super_operations_free_inode], [ + #include + + static void free_inode(struct inode *) { } + + static struct super_operations sops __attribute__ ((unused)) = { + .free_inode = free_inode, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SOPS_FREE_INODE], [ + AC_MSG_CHECKING([whether sops->free_inode() exists]) + ZFS_LINUX_TEST_RESULT([super_operations_free_inode], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SOPS_FREE_INODE, 1, [sops->free_inode() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index c99aed357fb7..c5482da6425f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -132,6 
+132,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_PIN_USER_PAGES ZFS_AC_KERNEL_SRC_TIMER ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR + ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -248,6 +249,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_PIN_USER_PAGES ZFS_AC_KERNEL_TIMER ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR + ZFS_AC_KERNEL_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h index b38847b20462..6a77e40abe10 100644 --- a/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -157,6 +157,7 @@ struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_inode_alloc(struct super_block *, struct inode **ip); +extern void zfs_inode_free(struct inode *); extern void zfs_inode_destroy(struct inode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index 607b3995cb60..7b28f2640188 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -371,6 +371,12 @@ zfs_inode_alloc(struct super_block *sb, struct inode **ip) return (0); } +void +zfs_inode_free(struct inode *ip) +{ + kmem_cache_free(znode_cache, ITOZ(ip)); +} + /* * Called in multiple places when an inode should be destroyed. */ @@ -395,8 +401,15 @@ zfs_inode_destroy(struct inode *ip) nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } - - kmem_cache_free(znode_cache, zp); +#ifndef HAVE_SOPS_FREE_INODE + /* + * inode needs to be freed in RCU callback. If we have + * super_operations->free_inode, Linux kernel will do call_rcu + * for us. But if we don't have it, since call_rcu is GPL-only + * symbol, we can only free synchronously and accept the risk. 
+ */ + zfs_inode_free(ip); +#endif } static void diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index a682bfd33c38..94dcdd0b887d 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -45,6 +45,15 @@ zpl_inode_alloc(struct super_block *sb) return (ip); } +#ifdef HAVE_SOPS_FREE_INODE +static void +zpl_inode_free(struct inode *ip) +{ + ASSERT(atomic_read(&ip->i_count) == 0); + zfs_inode_free(ip); +} +#endif + static void zpl_inode_destroy(struct inode *ip) { @@ -455,6 +464,9 @@ zpl_prune_sb(uint64_t nr_to_scan, void *arg) const struct super_operations zpl_super_operations = { .alloc_inode = zpl_inode_alloc, +#ifdef HAVE_SOPS_FREE_INODE + .free_inode = zpl_inode_free, +#endif .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, From 0c928f7a37bd2834659972df30aaeebbc4bc70c9 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 25 Jul 2025 03:47:46 +0500 Subject: [PATCH 11/61] ZED: Fix device type detection and pool iteration logic During hotplug REMOVED events, devid matching fails for partition-based spares because devid information is not stored in pool config for partitioned devices. However, when devid is populated by the hotplug event, the original code skipped the search logic entirely, skipping vdev_guid matching and resulting in wrong device type detection that caused spares to be incorrectly identified as l2arc devices. Additionally, fix zfs_agent_iter_pool() to use the return value from zfs_agent_iter_vdev() instead of relying on search parameters, which was previously ignored. Also add pool_guid optimization to enable targeted pool searching when pool_guid is available. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #17545 --- cmd/zed/agents/zfs_agents.c | 67 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index 8718dbde03b6..c0590edc7516 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -134,11 +134,13 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) * of blkid cache and L2ARC VDEV does not contain pool guid in its * blkid, so this is a special case for L2ARC VDEV. */ - else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL && + else if (gsp->gs_vdev_guid != 0 && nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && gsp->gs_vdev_guid == vdev_guid) { - (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, - &gsp->gs_devid); + if (gsp->gs_devid == NULL) { + (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, + &gsp->gs_devid); + } (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, &gsp->gs_vdev_expandtime); return (B_TRUE); @@ -156,22 +158,28 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) /* * For each vdev in this pool, look for a match by devid */ - if ((config = zpool_get_config(zhp, NULL)) != NULL) { - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvl) == 0) { - (void) zfs_agent_iter_vdev(zhp, nvl, gsp); - } - } - /* - * if a match was found then grab the pool guid - */ - if (gsp->gs_vdev_guid && gsp->gs_devid) { - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, - &gsp->gs_pool_guid); - } + boolean_t found = B_FALSE; + uint64_t pool_guid; + /* Get pool configuration and extract pool GUID */ + if ((config = zpool_get_config(zhp, NULL)) == NULL || + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0) + goto out; + + /* Skip this pool if we're looking for a specific pool */ + if (gsp->gs_pool_guid != 0 && pool_guid != gsp->gs_pool_guid) + goto out; + 
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) == 0) + found = zfs_agent_iter_vdev(zhp, nvl, gsp); + + if (found && gsp->gs_pool_guid == 0) + gsp->gs_pool_guid = pool_guid; + +out: zpool_close(zhp); - return (gsp->gs_devid != NULL && gsp->gs_vdev_guid != 0); + return (found); } void @@ -233,20 +241,17 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or * ZFS_EV_POOL_GUID may be missing so find them. */ - if (devid == NULL || pool_guid == 0 || vdev_guid == 0) { - if (devid == NULL) - search.gs_vdev_guid = vdev_guid; - else - search.gs_devid = devid; - zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); - if (devid == NULL) - devid = search.gs_devid; - if (pool_guid == 0) - pool_guid = search.gs_pool_guid; - if (vdev_guid == 0) - vdev_guid = search.gs_vdev_guid; - devtype = search.gs_vdev_type; - } + search.gs_devid = devid; + search.gs_vdev_guid = vdev_guid; + search.gs_pool_guid = pool_guid; + zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + if (devid == NULL) + devid = search.gs_devid; + if (pool_guid == 0) + pool_guid = search.gs_pool_guid; + if (vdev_guid == 0) + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; /* * We want to avoid reporting "remove" events coming from From 6d378564b49774e4ecc90c4c2db3bad775562a60 Mon Sep 17 00:00:00 2001 From: Andriy Tkachuk Date: Fri, 25 Jul 2025 02:24:15 +0100 Subject: [PATCH 12/61] zdb: fix checksum calculation for decompressed blocks Currently, when reading compressed blocks with -R and decompressing them with :d option and specifying lsize, which is normally bigger than psize for compressed blocks, the checksum is calculated on decompressed data. But it makes no sense since zfs always calculates checksum on physical, i.e. compressed data. So reading the same block produces different checksum results depending on how we read it, whether we decompress it or not, which, again, makes no sense. 
Fix: use psize instead of lsize when calculating the checksum so that it is always calculated on the physical block size, no matter was it compressed or not. Signed-off-by: Andriy Tkachuk Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Closes #17547 --- cmd/zdb/zdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1ca97d5c153e..a50dd2d62bd8 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -9118,7 +9118,7 @@ zdb_read_block(char *thing, spa_t *spa) ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; - zio_checksum_compute(ck_zio, ck, pabd, lsize); + zio_checksum_compute(ck_zio, ck, pabd, psize); printf( "%12s\t" "cksum=%016llx:%016llx:%016llx:%016llx\n", From 582e7847f6ef47283c1c15850df261a52f4834d8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 25 Jul 2025 07:42:23 -0700 Subject: [PATCH 13/61] Default to zfs_bclone_wait_dirty=1 Update the default FICLONE and FICLONERANGE ioctl behavior to wait on dirty blocks. While this does remove some control from the application, in practice ZFS is better positioned to the optimial thing and immediately force a TXG sync. Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #17455 --- man/man4/zfs.4 | 15 ++++++------- module/zfs/zfs_vnops.c | 13 ++++++------ ...loning_copyfilerange_fallback_same_txg.ksh | 21 +++++++++++++++++++ .../functional/cp_files/cp_files_002_pos.ksh | 4 +++- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 730236481b55..f0c15bd75860 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1384,14 +1384,15 @@ If this setting is 0, then even if feature@block_cloning is enabled, using functions and system calls that attempt to clone blocks will act as though the feature is disabled. . 
-.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int -When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be -written to disk. -This allows the clone operation to reliably succeed when a file is +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty +data to be written to disk before proceeding. +This ensures that the clone operation reliably succeeds, even if a file is modified and then immediately cloned. -For small files this may be slower than making a copy of the file. -Therefore, this setting defaults to 0 which causes a clone operation to -immediately fail when encountering a dirty block. +Note that for small files this may be slower than simply copying the file. +When set to 0 the clone operation will immediately fail if it encounters +any dirty blocks. +By default waiting is enabled. . .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7bf630b55f7e..4ef391ed7729 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -67,13 +67,14 @@ int zfs_bclone_enabled = 1; /* - * When set zfs_clone_range() waits for dirty data to be written to disk. - * This allows the clone operation to reliably succeed when a file is modified - * and then immediately cloned. For small files this may be slower than making - * a copy of the file and is therefore not the default. However, in certain - * scenarios this behavior may be desirable so a tunable is provided. + * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty + * data to be written to disk before proceeding. This ensures that the clone + * operation reliably succeeds, even if a file is modified and then immediately + * cloned. Note that for small files this may be slower than simply copying + * the file. 
When set to 0 the clone operation will immediately fail if it + * encounters any dirty blocks. By default waiting is enabled. */ -int zfs_bclone_wait_dirty = 0; +int zfs_bclone_wait_dirty = 1; /* * Enable Direct I/O. If this setting is 0, then all I/O requests will be diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index 54ffdc75669a..4cede26b913a 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -41,16 +41,22 @@ function cleanup { datasetexists $TESTPOOL && destroy_pool $TESTPOOL set_tunable64 TXG_TIMEOUT $timeout + log_must restore_tunable BCLONE_WAIT_DIRTY } log_onexit cleanup +log_must save_tunable BCLONE_WAIT_DIRTY + log_must set_tunable64 TXG_TIMEOUT 5000 log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS log_must sync_pool $TESTPOOL true +# Verify fallback to copy when there are dirty blocks +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 @@ -61,5 +67,20 @@ log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) log_must [ "$blocks" = "" ] +log_must rm /$TESTPOOL/file /$TESTPOOL/clone + +# Verify blocks are cloned even when there are dirty blocks +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 + +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "0 1 2 3" ] + 
log_pass $claim diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh index 8f3e6d12e53d..449dedacb307 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -56,7 +56,7 @@ function cleanup { datasetexists $TESTPOOL/cp-reflink && \ destroy_dataset $$TESTPOOL/cp-reflink -f - log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + log_must restore_tunable BCLONE_WAIT_DIRTY } function verify_copy @@ -81,6 +81,8 @@ SRC_SIZE=$((1024 + $RANDOM % 1024)) # A smaller recordsize is used merely to speed up the test. RECORDSIZE=4096 +log_must save_tunable BCLONE_WAIT_DIRTY + log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) From 271b9797c5176ba6ecfcf6fbc494a4deb49d2b1c Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 29 Jul 2025 14:28:01 -0700 Subject: [PATCH 14/61] Don't use wrong weight when passivating group When we're passivating a metaslab group we start by passivating the metaslabs that have been activated for each of the allocators. To do that, we need to provide a weight. However, currently this erroneously always uses a segment-based weight, even if segment-based weighting is disabled. Use the normal weight function, which will decide which type of weight to use. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Closes #17566 --- module/zfs/metaslab.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 4befa60346bb..3a831697a408 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -969,14 +969,16 @@ metaslab_group_passivate(metaslab_group_t *mg) if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + metaslab_weight(msp, B_TRUE) & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); } msp = mga->mga_secondary; if (msp != NULL) { mutex_enter(&msp->ms_lock); metaslab_passivate(msp, - metaslab_weight_from_range_tree(msp)); + metaslab_weight(msp, B_TRUE) & + ~METASLAB_ACTIVE_MASK); mutex_exit(&msp->ms_lock); } } From fc658b99357b813a54769023ef663868edb9225f Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 30 Jul 2025 02:09:48 +0200 Subject: [PATCH 15/61] Faster checksum benchmark on system boot While booting, only the needed 256KiB benchmarks are done now. 
The delay for checking all checksums occurs when requested via: - Linux: cat /proc/spl/kstat/zfs/chksum_bench - FreeBSD: sysctl kstat.zfs.misc.chksum_bench Reported by: Lahiru Gunathilake Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Co-authored-by: Colin Percival Closes #17563 Closes #17560 --- module/zfs/zfs_chksum.c | 69 ++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c index 5c92be21c0c8..21852bf3d865 100644 --- a/module/zfs/zfs_chksum.c +++ b/module/zfs/zfs_chksum.c @@ -32,9 +32,6 @@ #include #include -/* limit benchmarking to max 256KiB, when EdonR is slower then this: */ -#define LIMIT_PERF_MBS 300 - typedef struct { const char *name; const char *impl; @@ -52,9 +49,15 @@ typedef struct { zio_checksum_tmpl_free_t *(free); } chksum_stat_t; +#define AT_STARTUP 0 +#define AT_BENCHMARK 1 +#define AT_DONE 2 + static chksum_stat_t *chksum_stat_data = 0; -static int chksum_stat_cnt = 0; static kstat_t *chksum_kstat = NULL; +static int chksum_stat_limit = AT_STARTUP; +static int chksum_stat_cnt = 0; +static void chksum_benchmark(void); /* * Sample output on i3-1005G1 System: @@ -129,6 +132,9 @@ chksum_kstat_data(char *buf, size_t size, void *data) static void * chksum_kstat_addr(kstat_t *ksp, loff_t n) { + /* full benchmark */ + chksum_benchmark(); + if (n < chksum_stat_cnt) ksp->ks_private = (void *)(chksum_stat_data + n); else @@ -176,47 +182,36 @@ chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, kpreempt_enable(); run_bw = size * run_count * NANOSEC; - run_bw /= run_time_ns; /* B/s */ + run_bw /= run_time_ns; /* B/s */ *result = run_bw/1024/1024; /* MiB/s */ } -#define LIMIT_INIT 0 -#define LIMIT_NEEDED 1 -#define LIMIT_NOLIMIT 2 - static void chksum_benchit(chksum_stat_t *cs) { abd_t *abd; void *ctx = 0; void *salt = &cs->salt.zcs_bytes; - static int chksum_stat_limit = LIMIT_INIT; memset(salt, 0, 
sizeof (cs->salt.zcs_bytes)); if (cs->init) ctx = cs->init(&cs->salt); + /* benchmarks in startup mode */ + if (chksum_stat_limit == AT_STARTUP) { + abd = abd_alloc_linear(1<<18, B_FALSE); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + goto done; + } + /* allocate test memory via abd linear interface */ abd = abd_alloc_linear(1<<20, B_FALSE); + + /* benchmarks when requested */ chksum_run(cs, abd, ctx, 1, &cs->bs1k); chksum_run(cs, abd, ctx, 2, &cs->bs4k); chksum_run(cs, abd, ctx, 3, &cs->bs16k); chksum_run(cs, abd, ctx, 4, &cs->bs64k); - chksum_run(cs, abd, ctx, 5, &cs->bs256k); - - /* check if we ran on a slow cpu */ - if (chksum_stat_limit == LIMIT_INIT) { - if (cs->bs1k < LIMIT_PERF_MBS) { - chksum_stat_limit = LIMIT_NEEDED; - } else { - chksum_stat_limit = LIMIT_NOLIMIT; - } - } - - /* skip benchmarks >= 1MiB when the CPU is to slow */ - if (chksum_stat_limit == LIMIT_NEEDED) - goto abort; - chksum_run(cs, abd, ctx, 6, &cs->bs1m); abd_free(abd); @@ -225,7 +220,7 @@ chksum_benchit(chksum_stat_t *cs) chksum_run(cs, abd, ctx, 7, &cs->bs4m); chksum_run(cs, abd, ctx, 8, &cs->bs16m); -abort: +done: abd_free(abd); /* free up temp memory */ @@ -243,7 +238,6 @@ chksum_benchmark(void) /* we need the benchmark only for the kernel module */ return; #endif - chksum_stat_t *cs; uint64_t max; uint32_t id, cbid = 0, id_save; @@ -251,8 +245,14 @@ chksum_benchmark(void) const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256"); const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512"); + /* benchmarks are done */ + if (chksum_stat_limit == AT_DONE) + return; + + /* count implementations */ - chksum_stat_cnt = 2; + chksum_stat_cnt = 1; /* edonr */ + chksum_stat_cnt += 1; /* skein */ chksum_stat_cnt += sha256->getcnt(); chksum_stat_cnt += sha512->getcnt(); chksum_stat_cnt += blake3->getcnt(); @@ -332,6 +332,17 @@ chksum_benchmark(void) } } blake3->setid(id_save); + + switch (chksum_stat_limit) { + case AT_STARTUP: + /* next time we want a full benchmark */ + chksum_stat_limit = 
AT_BENCHMARK; + break; + case AT_BENCHMARK: + /* no further benchmarks */ + chksum_stat_limit = AT_DONE; + break; + } } void @@ -341,7 +352,7 @@ chksum_init(void) blake3_per_cpu_ctx_init(); #endif - /* Benchmark supported implementations */ + /* 256KiB benchmark */ chksum_benchmark(); /* Install kstats for all implementations */ From 95abbc71c39360d9d57da756a46af05716999d17 Mon Sep 17 00:00:00 2001 From: Igor Ostapenko Date: Thu, 31 Jul 2025 17:44:42 +0300 Subject: [PATCH 16/61] range_tree: Provide more debug details upon unexpected add/remove Sponsored-by: Klara, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Igor Ostapenko Closes #17581 --- cmd/zdb/zdb.c | 15 +++--- include/sys/metaslab_impl.h | 2 + include/sys/range_tree.h | 9 ++++ include/sys/vdev_impl.h | 1 + module/zfs/dnode.c | 19 ++++++- module/zfs/metaslab.c | 66 ++++++++++++++++++------- module/zfs/range_tree.c | 96 +++++++++++++++++++++++++----------- module/zfs/vdev.c | 36 +++++++++++--- module/zfs/vdev_initialize.c | 5 +- module/zfs/vdev_raidz.c | 6 ++- module/zfs/vdev_rebuild.c | 5 +- module/zfs/vdev_removal.c | 30 ++++++----- module/zfs/vdev_trim.c | 24 ++++++--- 13 files changed, 227 insertions(+), 87 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index a50dd2d62bd8..1f4f3bfe2d70 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -619,8 +619,9 @@ livelist_metaslab_validate(spa_t *spa) metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; - mv.mv_allocated = zfs_range_tree_create(NULL, - type, NULL, start, shift); + mv.mv_allocated = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + 0, "livelist_metaslab_validate:mv_allocated"); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; @@ -6320,8 +6321,9 @@ zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); - zfs_range_tree_t *allocs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + 
zfs_range_tree_t *allocs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + 0, "zdb_claim_removing:allocs"); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; @@ -8449,8 +8451,9 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; - mos_refd_objs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + mos_refd_objs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + 0, "dump_zpool:mos_refd_objs"); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index 5f999c02b8ac..2d45694f304f 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -568,6 +568,8 @@ typedef struct metaslab_unflushed_phys { uint64_t msp_unflushed_txg; } metaslab_unflushed_phys_t; +char *metaslab_rt_name(metaslab_group_t *, metaslab_t *, const char *); + #ifdef __cplusplus } #endif diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h index 23e80f64284b..0f6884682459 100644 --- a/include/sys/range_tree.h +++ b/include/sys/range_tree.h @@ -49,6 +49,9 @@ typedef enum zfs_range_seg_type { ZFS_RANGE_SEG_NUM_TYPES, } zfs_range_seg_type_t; +#define ZFS_RT_NAME(rt) (((rt)->rt_name != NULL) ? (rt)->rt_name : "") +#define ZFS_RT_F_DYN_NAME (1ULL << 0) /* if rt_name must be freed */ + /* * Note: the range_tree may not be accessed concurrently; consumers * must provide external locking if required. @@ -68,6 +71,9 @@ typedef struct zfs_range_tree { void *rt_arg; uint64_t rt_gap; /* allowable inter-segment gap */ + uint64_t rt_flags; + const char *rt_name; /* details for debugging */ + /* * The rt_histogram maintains a histogram of ranges. 
Each bucket, * rt_histogram[i], contains the number of ranges whose size is: @@ -281,6 +287,9 @@ zfs_range_tree_t *zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, uint64_t gap); zfs_range_tree_t *zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift); +zfs_range_tree_t *zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t flags, const char *name); void zfs_range_tree_destroy(zfs_range_tree_t *rt); boolean_t zfs_range_tree_contains(zfs_range_tree_t *rt, uint64_t start, uint64_t size); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 58a6cdcdc3ea..53e811ef3a35 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -651,6 +651,7 @@ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); int param_get_raidz_impl(char *buf, zfs_kernel_param_t *kp); #endif int param_set_raidz_impl(ZFS_MODULE_PARAM_ARGS); +char *vdev_rt_name(vdev_t *vd, const char *name); /* * Vdev ashift optimization tunables diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 99f3ca72ab87..5a067fd59499 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -86,6 +86,19 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT; static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif /* _KERNEL */ +static char * +rt_name(dnode_t *dn, const char *name) +{ + struct objset *os = dn->dn_objset; + + return (kmem_asprintf("{spa=%s objset=%llu obj=%llu %s}", + spa_name(os->os_spa), + (u_longlong_t)(os->os_dsl_dataset ? 
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET), + (u_longlong_t)dn->dn_object, + name)); +} + static int dbuf_compare(const void *x1, const void *x2) { @@ -2436,8 +2449,10 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { - dn->dn_free_ranges[txgoff] = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + dn->dn_free_ranges[txgoff] = + zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, rt_name(dn, "dn_free_ranges")); } zfs_range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); zfs_range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 3a831697a408..cca8d7eaf2e6 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -370,6 +370,16 @@ static metaslab_stats_t metaslab_stats = { #define METASLABSTAT_BUMP(stat) \ atomic_inc_64(&metaslab_stats.stat.value.ui64); +char * +metaslab_rt_name(metaslab_group_t *mg, metaslab_t *ms, const char *name) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu ms_id=%llu %s}", + spa_name(mg->mg_vd->vdev_spa), + (u_longlong_t)mg->mg_vd->vdev_guid, + (u_longlong_t)ms->ms_id, + name)); +} + static kstat_t *metaslab_ksp; @@ -2757,30 +2767,43 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift); - ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, - shift); + ms->ms_allocatable = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_allocatable")); for (int t = 0; t < TXG_SIZE; t++) { - ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, - NULL, start, shift); - } - ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); - ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_allocating[t] = 
zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(mg, ms, "ms_allocating")); + } + ms->ms_freeing = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freeing")); + ms->ms_freed = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_freed")); for (int t = 0; t < TXG_DEFER_SIZE; t++) { - ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, - start, shift); + ms->ms_defer[t] = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_defer")); } - ms->ms_checkpointing = - zfs_range_tree_create(NULL, type, NULL, start, shift); - ms->ms_unflushed_allocs = - zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_checkpointing = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_checkpointing")); + ms->ms_unflushed_allocs = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_allocs")); metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); mrap->mra_bt = &ms->ms_unflushed_frees_by_size; mrap->mra_floor_shift = metaslab_by_size_min_shift; - ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops, - type, mrap, start, shift); + ms->ms_unflushed_frees = zfs_range_tree_create_flags( + &metaslab_rt_ops, type, mrap, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_unflushed_frees")); - ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); + ms->ms_trim = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, metaslab_rt_name(mg, ms, "ms_trim")); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms, B_FALSE); @@ -3754,7 +3777,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) type = 
metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, &start, &shift); - condense_tree = zfs_range_tree_create(NULL, type, NULL, start, shift); + condense_tree = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, "condense_tree")); for (int t = 0; t < TXG_DEFER_SIZE; t++) { zfs_range_tree_walk(msp->ms_defer[t], @@ -3811,8 +3837,10 @@ metaslab_condense(metaslab_t *msp, dmu_tx_t *tx) * followed by FREES (due to space_map_write() in metaslab_sync()) for * sync pass 1. */ - zfs_range_tree_t *tmp_tree = zfs_range_tree_create(NULL, type, NULL, - start, shift); + zfs_range_tree_t *tmp_tree = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, + ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, "tmp_tree")); zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx); space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 373636c69254..fc2b17606bd2 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -201,10 +201,10 @@ ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg64_find_in_buf, zfs_range_seg64_t, ZFS_BTREE_FIND_IN_BUF_FUNC(zfs_range_tree_seg_gap_find_in_buf, zfs_range_seg_gap_t, zfs_range_tree_seg_gap_compare) -zfs_range_tree_t * -zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, +static zfs_range_tree_t * +zfs_range_tree_create_impl(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, - uint64_t gap) + uint64_t gap, uint64_t flags, const char *name) { zfs_range_tree_t *rt = kmem_zalloc(sizeof (zfs_range_tree_t), KM_SLEEP); @@ -236,6 +236,8 @@ zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, rt->rt_ops = ops; rt->rt_gap = gap; + rt->rt_flags = flags; + rt->rt_name = name; rt->rt_arg = arg; rt->rt_type = type; rt->rt_start = start; @@ -247,11 +249,30 @@ 
zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, return (rt); } +zfs_range_tree_t * +zfs_range_tree_create_gap(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t gap) +{ + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, gap, + 0, NULL)); +} + zfs_range_tree_t * zfs_range_tree_create(const zfs_range_tree_ops_t *ops, zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift) { - return (zfs_range_tree_create_gap(ops, type, arg, start, shift, 0)); + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0, + 0, NULL)); +} + +zfs_range_tree_t * +zfs_range_tree_create_flags(const zfs_range_tree_ops_t *ops, + zfs_range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + uint64_t flags, const char *name) +{ + return (zfs_range_tree_create_impl(ops, type, arg, start, shift, 0, + flags, name)); } void @@ -262,6 +283,9 @@ zfs_range_tree_destroy(zfs_range_tree_t *rt) if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); + if (rt->rt_name != NULL && (rt->rt_flags & ZFS_RT_F_DYN_NAME)) + kmem_strfree((char *)(uintptr_t)rt->rt_name); + zfs_btree_destroy(&rt->rt_root); kmem_free(rt, sizeof (*rt)); } @@ -271,15 +295,17 @@ zfs_range_tree_adjust_fill(zfs_range_tree_t *rt, zfs_range_seg_t *rs, int64_t delta) { if (delta < 0 && delta * -1 >= zfs_rs_get_fill(rs, rt)) { - zfs_panic_recover("zfs: attempting to decrease fill to or " - "below 0; probable double remove in segment [%llx:%llx]", + zfs_panic_recover("zfs: rt=%s: attempting to decrease fill to " + "or below 0; probable double remove in segment [%llx:%llx]", + ZFS_RT_NAME(rt), (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } if (zfs_rs_get_fill(rs, rt) + delta > zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) { - zfs_panic_recover("zfs: attempting to increase fill beyond " - "max; probable double add in segment [%llx:%llx]", + 
zfs_panic_recover("zfs: rt=%s: attempting to increase fill " + "beyond max; probable double add in segment [%llx:%llx]", + ZFS_RT_NAME(rt), (longlong_t)zfs_rs_get_start(rs, rt), (longlong_t)zfs_rs_get_end(rs, rt)); } @@ -319,14 +345,17 @@ zfs_range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) * the normal code paths. */ if (rs != NULL) { + uint64_t rstart = zfs_rs_get_start(rs, rt); + uint64_t rend = zfs_rs_get_end(rs, rt); if (gap == 0) { - zfs_panic_recover("zfs: adding existent segment to " - "range tree (offset=%llx size=%llx)", - (longlong_t)start, (longlong_t)size); + zfs_panic_recover("zfs: rt=%s: adding segment " + "(offset=%llx size=%llx) overlapping with existing " + "one (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), + (longlong_t)start, (longlong_t)size, + (longlong_t)rstart, (longlong_t)(rend - rstart)); return; } - uint64_t rstart = zfs_rs_get_start(rs, rt); - uint64_t rend = zfs_rs_get_end(rs, rt); if (rstart <= start && rend >= end) { zfs_range_tree_adjust_fill(rt, rs, fill); return; @@ -451,6 +480,7 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, zfs_range_seg_t *rs; zfs_range_seg_max_t rsearch, rs_tmp; uint64_t end = start + size; + uint64_t rstart, rend; boolean_t left_over, right_over; VERIFY3U(size, !=, 0); @@ -464,12 +494,15 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, /* Make sure we completely overlap with someone */ if (rs == NULL) { - zfs_panic_recover("zfs: removing nonexistent segment from " - "range tree (offset=%llx size=%llx)", - (longlong_t)start, (longlong_t)size); + zfs_panic_recover("zfs: rt=%s: removing nonexistent segment " + "from range tree (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size); return; } + rstart = zfs_rs_get_start(rs, rt); + rend = zfs_rs_get_end(rs, rt); + /* * Range trees with gap support must only remove complete segments * from the tree. 
This allows us to maintain accurate fill accounting @@ -479,31 +512,36 @@ zfs_range_tree_remove_impl(zfs_range_tree_t *rt, uint64_t start, uint64_t size, if (rt->rt_gap != 0) { if (do_fill) { if (zfs_rs_get_fill(rs, rt) == size) { - start = zfs_rs_get_start(rs, rt); - end = zfs_rs_get_end(rs, rt); + start = rstart; + end = rend; size = end - start; } else { zfs_range_tree_adjust_fill(rt, rs, -size); return; } - } else if (zfs_rs_get_start(rs, rt) != start || - zfs_rs_get_end(rs, rt) != end) { - zfs_panic_recover("zfs: freeing partial segment of " - "gap tree (offset=%llx size=%llx) of " + } else if (rstart != start || rend != end) { + zfs_panic_recover("zfs: rt=%s: freeing partial segment " + "of gap tree (offset=%llx size=%llx) of " "(offset=%llx size=%llx)", + ZFS_RT_NAME(rt), (longlong_t)start, (longlong_t)size, - (longlong_t)zfs_rs_get_start(rs, rt), - (longlong_t)zfs_rs_get_end(rs, rt) - - zfs_rs_get_start(rs, rt)); + (longlong_t)rstart, (longlong_t)(rend - rstart)); return; } } - VERIFY3U(zfs_rs_get_start(rs, rt), <=, start); - VERIFY3U(zfs_rs_get_end(rs, rt), >=, end); + if (!(rstart <= start && rend >= end)) { + panic("zfs: rt=%s: removing segment " + "(offset=%llx size=%llx) not completely overlapped by " + "existing one (offset=%llx size=%llx)", + ZFS_RT_NAME(rt), + (longlong_t)start, (longlong_t)size, + (longlong_t)rstart, (longlong_t)(rend - rstart)); + return; + } - left_over = (zfs_rs_get_start(rs, rt) != start); - right_over = (zfs_rs_get_end(rs, rt) != end); + left_over = (rstart != start); + right_over = (rend != end); zfs_range_tree_stat_decr(rt, rs); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 89e87277967b..bada0aada27d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -243,6 +243,25 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } +char * +vdev_rt_name(vdev_t *vd, const char *name) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", + spa_name(vd->vdev_spa), + 
(u_longlong_t)vd->vdev_guid, + name)); +} + +static char * +vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) +{ + return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", + spa_name(vd->vdev_spa), + (u_longlong_t)vd->vdev_guid, + name, + dtl_type)); +} + /* * Virtual device management. */ @@ -679,8 +698,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); - vd->vdev_obsolete_segments = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + vd->vdev_obsolete_segments = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments")); /* * Initialize rate limit structs for events. We rate limit ZIO delay @@ -734,8 +754,9 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { - vd->vdev_dtl[t] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_dtl[t] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t)); } txg_list_create(&vd->vdev_ms_list, spa, @@ -3437,7 +3458,9 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - rt = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rt = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); @@ -3585,7 +3608,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) ASSERT(vd->vdev_dtl_sm != NULL); } - rtsync = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); 
mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 8ff38889b797..c932175479fc 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -541,8 +541,9 @@ vdev_initialize_thread(void *arg) abd_t *deadbeef = vdev_initialize_block_alloc(); - vd->vdev_initialize_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + vd->vdev_initialize_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_initialize_tree")); for (uint64_t i = 0; !vd->vdev_detached && i < vd->vdev_top->vdev_ms_count; i++) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index b62dc6b0b91c..0a89ea6bb525 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4556,8 +4556,10 @@ spa_raidz_expand_thread(void *arg, zthr_t *zthr) uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type( raidvd, msp, &start, &shift); - zfs_range_tree_t *rt = zfs_range_tree_create(NULL, type, NULL, - start, shift); + zfs_range_tree_t *rt = zfs_range_tree_create_flags( + NULL, type, NULL, start, shift, ZFS_RT_F_DYN_NAME, + metaslab_rt_name(msp->ms_group, msp, + "spa_raidz_expand_thread:rt")); zfs_range_tree_add(rt, msp->ms_start, msp->ms_size); zfs_range_tree_walk(msp->ms_allocatable, zfs_range_tree_remove, rt); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index ea6f86993088..99a1828b7cdd 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -787,8 +787,9 @@ vdev_rebuild_thread(void *arg) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; - vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, - 0, 0); + vr->vr_scan_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vr_scan_tree")); 
mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index bc3a1e84255a..13dde35f6116 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -364,13 +364,15 @@ spa_vdev_removal_create(vdev_t *vd) spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); - svr->svr_allocd_segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_allocd_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs")); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { - svr->svr_frees[i] = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + svr->svr_frees[i] = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees")); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); @@ -1179,8 +1181,9 @@ spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ - zfs_range_tree_t *obsolete_segs = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs")); zfs_btree_index_t where; zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); @@ -1448,8 +1451,9 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). 
*/ - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs")); for (;;) { zfs_range_tree_t *rt = svr->svr_allocd_segs; zfs_range_seg_t *rs = zfs_range_tree_first(rt); @@ -1610,8 +1614,9 @@ spa_vdev_remove_thread(void *arg) vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs")); mutex_enter(&svr->svr_lock); @@ -1894,8 +1899,9 @@ spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) vdev_indirect_mapping_max_offset(vim)); } - zfs_range_tree_t *segs = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + zfs_range_tree_t *segs = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs")); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 43998577c0ad..574671c9c19f 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -902,7 +902,9 @@ vdev_trim_thread(void *arg) ta.trim_vdev = vd; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_flags = 0; @@ -1305,8 +1307,10 @@ vdev_autotrim_thread(void *arg) * Allocate an empty range tree which is swapped in * for the existing ms_trim tree while it is processed. 
*/ - trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, - NULL, 0, 0); + trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_swap(&msp->ms_trim, &trim_tree); ASSERT(zfs_range_tree_is_empty(msp->ms_trim)); @@ -1360,8 +1364,10 @@ vdev_autotrim_thread(void *arg) if (!cvd->vdev_ops->vdev_op_leaf) continue; - ta->trim_tree = zfs_range_tree_create(NULL, - ZFS_RANGE_SEG64, NULL, 0, 0); + ta->trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, + vdev_rt_name(vd, "autotrim_tree")); zfs_range_tree_walk(trim_tree, vdev_trim_range_add, ta); } @@ -1600,7 +1606,9 @@ vdev_trim_l2arc_thread(void *arg) vd->vdev_trim_secure = 0; ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_MANUAL; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; @@ -1735,7 +1743,9 @@ vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) ASSERT(!vd->vdev_top->vdev_rz_expanding); ta.trim_vdev = vd; - ta.trim_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL, 0, 0); + ta.trim_tree = zfs_range_tree_create_flags( + NULL, ZFS_RANGE_SEG64, NULL, 0, 0, + ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "trim_tree")); ta.trim_type = TRIM_TYPE_SIMPLE; ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; From 90790955a62b1cb81a218bca8f4b109190d8a8d0 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 20 Jun 2025 20:03:55 +0500 Subject: [PATCH 17/61] SPDX: Add missing CDDL-1.0 license Signed-off-by: Ameer Hamza --- tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh index 8cbc93d6eb74..8028e4f0884e 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh @@ -1,4 +1,5 @@ #!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 # CDDL HEADER START # # The contents of this file are subject to the terms of the From a46ce73ca815bc70dc1372f777797f5660f713db Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 19 Mar 2025 15:58:29 -0700 Subject: [PATCH 18/61] Make ganging redundancy respect redundant_metadata property (#17073) The redundant_metadata setting in ZFS allows users to trade resilience for performance and space savings. This applies to all data and metadata blocks in zfs, with one exception: gang blocks. Gang blocks currently just take the copies property of the IO being ganged and, if it's 1, sets it to 2. This means that we always make at least two copies of a gang header, which is good for resilience. However, if the users care more about performance than resilience, their gang blocks will be even more of a penalty than usual. We add logic to calculate the number of gang headers copies directly, and store it as a separate IO property. This is stored in the IO properties and not calculated when we decide to gang because by that point we may not have easy access to the relevant information about what kind of block is being stored. We also check the redundant_metadata property when doing so, and use that to decide whether to store an extra copy of the gang headers, compared to the underlying blocks. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Paul Dagnelie Co-authored-by: Paul Dagnelie Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- cmd/zdb/zdb.c | 12 +- include/sys/dbuf.h | 1 + include/sys/zio.h | 3 +- module/zfs/arc.c | 2 + module/zfs/dbuf.c | 4 +- module/zfs/dmu.c | 21 ++- module/zfs/dmu_recv.c | 3 + module/zfs/zio.c | 23 ++-- tests/runfiles/common.run | 4 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 4 + .../tests/functional/gang_blocks/cleanup.ksh | 31 +++++ .../functional/gang_blocks/gang_blocks.kshlib | 120 ++++++++++++++++++ .../gang_blocks/gang_blocks_redundant.ksh | 88 +++++++++++++ .../tests/functional/gang_blocks/setup.ksh | 30 +++++ 15 files changed, 327 insertions(+), 20 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh create mode 100755 tests/zfs-tests/tests/functional/gang_blocks/setup.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 1f4f3bfe2d70..565c078bb195 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2546,12 +2546,14 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, blkbuf[0] = '\0'; - for (i = 0; i < ndvas; i++) + for (i = 0; i < ndvas; i++) { (void) snprintf(blkbuf + strlen(blkbuf), - buflen - strlen(blkbuf), "%llu:%llx:%llx ", + buflen - strlen(blkbuf), "%llu:%llx:%llx%s ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), - (u_longlong_t)DVA_GET_ASIZE(&dva[i])); + (u_longlong_t)DVA_GET_ASIZE(&dva[i]), + (DVA_GET_GANG(&dva[i]) ? 
"G" : "")); + } if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), @@ -8984,7 +8986,7 @@ zdb_read_block(char *thing, spa_t *spa) DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); - DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_GANG(&dva[0], 0); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); @@ -8999,7 +9001,7 @@ zdb_read_block(char *thing, spa_t *spa) BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - zio = zio_root(spa, NULL, NULL, 0); + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); if (vd == vd->vdev_top) { /* diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index f7542cf5daef..78e41b572795 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -174,6 +174,7 @@ typedef struct dbuf_dirty_record { arc_buf_t *dr_data; override_states_t dr_override_state; uint8_t dr_copies; + uint8_t dr_gang_copies; boolean_t dr_nopwrite; boolean_t dr_brtwrite; boolean_t dr_diowrite; diff --git a/include/sys/zio.h b/include/sys/zio.h index af47d6f87a41..78adca4d7d00 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -350,6 +350,7 @@ typedef struct zio_prop { uint8_t zp_complevel; uint8_t zp_level; uint8_t zp_copies; + uint8_t zp_gang_copies; dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; @@ -575,7 +576,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, - boolean_t nopwrite, boolean_t brtwrite); + int gang_copies, boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 1f350b3fe97c..998bb7cf6f4c 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7065,6 +7065,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 
localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); + localprop.zp_gang_copies = + MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 44d6ff639e98..9a7ef5c1e8f9 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -5364,8 +5364,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, - dr->dt.dl.dr_brtwrite); + dr->dt.dl.dr_copies, dr->dt.dl.dr_gang_copies, + dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 9b3c7a53c813..f9df4ddcfacb 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1916,6 +1916,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies; /* * Old style holes are filled with all zeros, whereas @@ -2322,6 +2323,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; + int gang_copies = os->os_copies; /* * We maintain different write policies for each of the following @@ -2354,15 +2356,24 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) switch (os->os_redundant_metadata) { case ZFS_REDUNDANT_METADATA_ALL: copies++; + gang_copies++; break; case ZFS_REDUNDANT_METADATA_MOST: if (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) copies++; 
+ if (level + 1 >= + zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) + gang_copies++; break; case ZFS_REDUNDANT_METADATA_SOME: - if (DMU_OT_IS_CRITICAL(type)) + if (DMU_OT_IS_CRITICAL(type)) { copies++; + gang_copies++; + } else if (DMU_OT_IS_METADATA(type)) { + gang_copies++; + } break; case ZFS_REDUNDANT_METADATA_NONE: break; @@ -2445,6 +2456,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); + + if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || + (os->os_redundant_metadata == + ZFS_REDUNDANT_METADATA_MOST && + zfs_redundant_metadata_most_ditto_level <= 1)) + gang_copies++; } /* @@ -2461,6 +2478,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); + gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; @@ -2478,6 +2496,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); + zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 1a10ac156b23..6d27dabc2e56 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2310,6 +2310,9 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) zp.zp_nopwrite = B_FALSE; zp.zp_copies = MIN(zp.zp_copies, SPA_DVAS_PER_BP - 1); + zp.zp_gang_copies = + MIN(zp.zp_gang_copies, + SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (DRR_WRITE_COMPRESSED(drrw)) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 126b3fb54db7..feadf28ada72 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1415,8 +1415,8 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, } void -zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, - boolean_t brtwrite) +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies, + boolean_t nopwrite, boolean_t brtwrite) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); @@ -1433,6 +1433,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite, zio->io_prop.zp_nopwrite = nopwrite; zio->io_prop.zp_brtwrite = brtwrite; zio->io_prop.zp_copies = copies; + zio->io_prop.zp_gang_copies = gang_copies; zio->io_bp_override = bp; } @@ -3144,15 +3145,13 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* - * If one copy was requested, store 2 copies of the GBH, so that we - * can still traverse all the data (e.g. to free or scrub) even if a - * block is damaged. Note that we can't store 3 copies of the GBH in - * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. 
+ * Store multiple copies of the GBH, so that we can still traverse + * all the data (e.g. to free or scrub) even if a block is damaged. + * This value respects the redundant_metadata property. */ - int gbh_copies = copies; - if (gbh_copies == 1) { - gbh_copies = MIN(2, spa_max_replication(spa)); - } + int gbh_copies = gio->io_prop.zp_gang_copies; + ASSERT3S(gbh_copies, >, 0); + ASSERT3S(gbh_copies, <=, SPA_DVAS_PER_BP); ASSERT(ZIO_HAS_ALLOCATOR(pio)); int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; @@ -3172,6 +3171,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) * since metaslab_class_throttle_reserve() always allows * additional reservations for gang blocks. */ + ASSERT3U(gbh_copies, >=, copies); VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, pio->io_allocator, pio, flags)); } @@ -3234,6 +3234,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_type = zp.zp_storage_type = DMU_OT_NONE; zp.zp_level = 0; zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_gang_copies = gio->io_prop.zp_gang_copies; zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; @@ -3954,7 +3955,7 @@ zio_ddt_write(zio_t *zio) * grow the DDT entry by to satisfy the request. 
*/ zio_prop_t czp = *zp; - czp.zp_copies = need_dvas; + czp.zp_copies = czp.zp_gang_copies = need_dvas; zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, &czp, zio_ddt_child_write_ready, NULL, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 21c2422cc7f5..f46975beac00 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -725,6 +725,10 @@ tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] tags = ['functional', 'features', 'large_dnode'] +[tests/functional/gang_blocks] +tests = ['gang_blocks_redundant'] +tags = ['functional', 'gang_blocks'] + [tests/functional/grow] pre = post = diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 0a546dd44553..79dc64ad9350 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -64,6 +64,7 @@ MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging +METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct MULTIHOST_FAIL_INTERVALS multihost.fail_intervals zfs_multihost_fail_intervals MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 37a3bac6ca70..218552b6d897 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -276,6 +276,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ + functional/gang_blocks/gang_blocks.kshlib \ functional/grow/grow.cfg 
\ functional/history/history.cfg \ functional/history/history_common.kshlib \ @@ -1562,6 +1563,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ + functional/gang_blocks/cleanup.ksh \ + functional/gang_blocks/gang_blocks_redundant.ksh \ + functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh new file mode 100755 index 000000000000..4ae6ec16fae4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +restore_tunable METASLAB_FORCE_GANGING +restore_tunable METASLAB_FORCE_GANGING_PCT +default_cleanup diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib new file mode 100644 index 000000000000..8799a1436c56 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib @@ -0,0 +1,120 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 By Klara Inc. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# Get 0th DVA of first L0 block of file +# +# $1 filesystem +# $2 object number +# +function get_object_info +{ + typeset fs=$1 + typeset obj=$2 + + zdb -dddddd $fs $obj +} + +# +# $1 filesystem +# $2 path to file +# $3 block filter +# +function get_blocks_filter +{ + typeset fs=$1 + typeset path=$2 + + typeset full_path="$(get_prop mountpoint $fs)/$path" + typeset obj="$(ls -i $full_path | awk '{print $1}')" + + get_object_info $fs $obj | grep $3 | grep -v Dataset +} + +function get_first_block +{ + get_blocks_filter $1 $2 L0 | head -n 1 +} + +function get_first_block_dva +{ + get_first_block $1 $2 | sed 's/.*L0 \([^ ]*\).*/\1/' +} + +# Takes a zdb compressed blkptr line on stdin +function get_num_dvas +{ + sed 's/.*L[0-9] \(.*\) [a-f0-9]*L.*/\1/' | awk '{print NF}' +} + +function check_gang_dva +{ + typeset last_byte="$(echo -n $1 | tail -c 1)" + [[ "$last_byte" == "G" ]] || return 1 + return 0 +} + +function check_is_gang_dva +{ + check_gang_dva $1 || log_fail "Not a gang DVA: \"$1\"" +} + +function check_not_gang_dva +{ + check_gang_dva $1 && log_fail "Gang DVA: \"$1\"" +} + +# +# Get the gang header contents of the given dva in the given pool +# +# $1 pool +# $2 dva +# $3 size (in hexidecimal) +# +function read_gang_header +{ + typeset pool=$1 + typeset dva=$2 + typeset size=$3 + + check_is_gang_dva $dva + + zdb -R $pool "${dva%:*}:$size:g" 2>&1 | grep -v "Found vdev:" +} + +function preamble +{ + save_tunable METASLAB_FORCE_GANGING + save_tunable METASLAB_FORCE_GANGING_PCT +} + +function cleanup +{ + destroy_pool $TESTPOOL + restore_tunable METASLAB_FORCE_GANGING + restore_tunable METASLAB_FORCE_GANGING_PCT +} diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh new file mode 100755 index 000000000000..1c44a7c5e598 --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh @@ -0,0 +1,88 
@@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +# +# Description: +# Verify that the redundant_metadata setting is respected by gang headers +# +# Strategy: +# 1. Create a filesystem with redundant_metadata={all,most,some,none} +# 2. Verify that gang blocks at different levels have the right amount of redundancy +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/gang_blocks/gang_blocks.kshlib + +log_assert "Verify that gang blocks at different levels have the right amount of redundancy." + +function cleanup2 +{ + for red in all most some none; do zfs destroy $TESTPOOL/$TESTFS-$red; done + cleanup +} + +preamble +log_onexit cleanup2 + +log_must zpool create -f -o ashift=9 $TESTPOOL $DISKS +set_tunable64 METASLAB_FORCE_GANGING 1500 +set_tunable32 METASLAB_FORCE_GANGING_PCT 100 +for red in all most some none; do + log_must zfs create -o redundant_metadata=$red -o recordsize=512 \ + $TESTPOOL/$TESTFS-$red + if [[ "$red" == "all" ]]; then + log_must zfs set recordsize=8k $TESTPOOL/$TESTFS-$red + fi + mountpoint=$(get_prop mountpoint $TESTPOOL/$TESTFS-$red) + + path="${mountpoint}/file" + log_must dd if=/dev/urandom of=$path bs=1M count=1 + log_must zpool sync $TESTPOOL + num_l0_dvas=$(get_first_block $TESTPOOL/$TESTFS-$red file | get_num_dvas) + if [[ "$red" == "all" ]]; then + [[ "$num_l0_dvas" -eq 2 ]] || \ + log_fail "wrong number of DVAs for L0 in $red: $num_l0_dvas" + else + [[ "$num_l0_dvas" -eq 1 ]] || \ + log_fail "wrong number of DVAs for L0 in $red: $num_l0_dvas" + fi + + num_l1_dvas=$(get_blocks_filter 
$TESTPOOL/$TESTFS-$red file L1 | head -n 1 | get_num_dvas) + if [[ "$red" == "all" || "$red" == "most" ]]; then + [[ "$num_l1_dvas" -eq 2 ]] || \ + log_fail "wrong number of DVAs for L1 in $red: $num_l1_dvas" + else + [[ "$num_l1_dvas" -eq 1 ]] || \ + log_fail "wrong number of DVAs for L1 in $red: $num_l1_dvas" + fi + + for i in `seq 1 80`; do + dd if=/dev/urandom of=/$mountpoint/f$i bs=512 count=1 2>/dev/null || log_fail "dd failed" + done + log_must zpool sync $TESTPOOL + obj_0_gangs=$(get_object_info $TESTPOOL/$TESTFS-$red 0 L0 | grep G) + num_obj_0_dvas=$(echo "$obj_0_gangs" | head -n 1 | get_num_dvas) + if [[ "$red" != "none" ]]; then + [[ "$num_obj_0_dvas" -eq 2 ]] || \ + log_fail "wrong number of DVAs for obj 0 in $red: $num_obj_0_dvas" + else + [[ "$num_obj_0_dvas" -eq 1 ]] || \ + log_fail "wrong number of DVAs for obj 0 in $red: $num_obj_0_dvas" + fi + log_note "Level $red passed" +done + +log_pass "Gang blocks at different levels have the right amount of redundancy." diff --git a/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh new file mode 100755 index 000000000000..0d2b239a069d --- /dev/null +++ b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +set_tunable64 METASLAB_FORCE_GANGING 16777217 +set_tunable32 METASLAB_FORCE_GANGING_PCT 0 From 3ad3f439bbcfb971256d05531da701ff46cf04de Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 21 Mar 2025 03:01:11 +1100 Subject: [PATCH 19/61] zts: add spdx license tags to gang_blocks tests (#17160) Missed in #17073, probably because that PR was branched before #17001 was landed and never rebased. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh | 1 + tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib | 1 + .../tests/functional/gang_blocks/gang_blocks_redundant.ksh | 1 + tests/zfs-tests/tests/functional/gang_blocks/setup.ksh | 1 + 4 files changed, 4 insertions(+) diff --git a/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh index 4ae6ec16fae4..92a1d1aabb6a 100755 --- a/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/gang_blocks/cleanup.ksh @@ -1,4 +1,5 @@ #!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 # # CDDL HEADER START # diff --git a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib index 8799a1436c56..553533377aa4 100644 --- a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks.kshlib @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: CDDL-1.0 # # CDDL HEADER START # diff --git 
a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh index 1c44a7c5e598..504d6aa47e2f 100755 --- a/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh +++ b/tests/zfs-tests/tests/functional/gang_blocks/gang_blocks_redundant.ksh @@ -1,4 +1,5 @@ #!/bin/ksh +# SPDX-License-Identifier: CDDL-1.0 # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. diff --git a/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh index 0d2b239a069d..05bfb04709f0 100755 --- a/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh +++ b/tests/zfs-tests/tests/functional/gang_blocks/setup.ksh @@ -1,4 +1,5 @@ #!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 # # CDDL HEADER START # From fd5a27c9dbbf2491aec0518d220aface4fd27c53 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 25 Jun 2025 09:05:36 -0700 Subject: [PATCH 20/61] Ensure that gang_copies is always at least as large as copies As discussed in the comments of PR #17004, you can theoretically run into a case where a gang child has more copies than the gang header, which can lead to some odd accounting behavior (and even trip a VERIFY). While the accounting code could be changed to handle this, it fundamentally doesn't seem to make a lot of sense to allow this to happen. If the data is supposed to have a certain level of reliability, that isn't actually achieved unless the gang_copies property is set to match it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Signed-off-by: Paul Dagnelie Closes #17484 --- module/zfs/dmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index f9df4ddcfacb..9c39853e3f7c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2496,7 +2496,8 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); - zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa)); + zp->zp_gang_copies = MIN(MAX(gang_copies, copies), + spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; From 30fa92bff34775a98a90280dfda71694c1742582 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 16 May 2025 13:23:32 -0400 Subject: [PATCH 21/61] Increase meta-dnode redundancy in "some" mode Loss of one indirect block of the meta dnode likely means loss of the whole dataset. It is worse than one file that the man page promises, and in my opinion is not much better than "none" mode. This change restores redundancy of the meta-dnode indirect blocks, while same time still corrects expectations in the man page. Reviewed-by: Akash B Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #17339 --- include/sys/dmu.h | 4 ++-- man/man7/zfsprops.7 | 3 ++- module/zfs/dmu.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 2505c1d28ec5..b80fc5aa8d4d 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -144,9 +144,9 @@ typedef enum dmu_object_byteswap { #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) -#define DMU_OT_IS_CRITICAL(ot) \ +#define DMU_OT_IS_CRITICAL(ot, level) \ (DMU_OT_IS_METADATA(ot) && \ - (ot) != DMU_OT_DNODE && \ + ((ot) != DMU_OT_DNODE || (level) > 0) && \ (ot) != DMU_OT_DIRECTORY_CONTENTS && \ (ot) != DMU_OT_SA) diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index cf925af9e9b0..88ffc08dbbea 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1596,7 +1596,8 @@ When set to ZFS stores an extra copy of only critical metadata. This can improve file create performance since less metadata needs to be written. -If a single on-disk block is corrupt, at worst a single user file can be lost. +If a single on-disk block is corrupt, multiple user files or directories +can be lost. .Pp When set to .Sy none , diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 9c39853e3f7c..8216786fb5d9 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2368,7 +2368,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) gang_copies++; break; case ZFS_REDUNDANT_METADATA_SOME: - if (DMU_OT_IS_CRITICAL(type)) { + if (DMU_OT_IS_CRITICAL(type, level)) { copies++; gang_copies++; } else if (DMU_OT_IS_METADATA(type)) { From 4808641e71bbc81e45491a0d4266c9de216eaf24 Mon Sep 17 00:00:00 2001 From: shodanshok Date: Mon, 21 Jul 2025 19:32:01 +0200 Subject: [PATCH 22/61] enforce arc_dnode_limit Linux kernel shrinker in the context of null/root memcg does not scan dentry and inode caches added by a task running in non-root memcg. 
For ZFS this means that dnode cache routinely overflows, evicting valuable meta/data and putting additional memory pressure on the system. This patch restores zfs_prune_aliases as fallback when the kernel shrinker does nothing, enabling zfs to actually free dnodes. Moreover, it (indirectly) calls arc_evict when dnode_size > dnode_limit. Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Gionatan Danti Closes #17487 Closes #17542 --- include/sys/arc_impl.h | 2 +- module/os/linux/zfs/zfs_vfsops.c | 65 ++++++++++++++++++++++++++++++++ module/zfs/arc.c | 22 ++++++----- 3 files changed, 78 insertions(+), 11 deletions(-) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 1b30389107c5..b55d5da3378c 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -954,7 +954,7 @@ typedef struct arc_sums { wmsum_t arcstat_data_size; wmsum_t arcstat_metadata_size; wmsum_t arcstat_dbuf_size; - wmsum_t arcstat_dnode_size; + aggsum_t arcstat_dnode_size; wmsum_t arcstat_bonus_size; wmsum_t arcstat_l2_hits; wmsum_t arcstat_l2_misses; diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 56af4fe0a464..7961549e637b 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1176,6 +1176,63 @@ zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) return (error); } +/* + * Dentry and inode caches referenced by a task in non-root memcg are + * not going to be scanned by the kernel-provided shrinker. So, if + * kernel prunes nothing, fall back to this manual walk to free dnodes. + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. 
+ */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + vmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data @@ -1227,6 +1284,14 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = (*shrinker->scan_objects)(shrinker, &sc); #endif + /* + * Fall back to zfs_prune_aliases if kernel's shrinker did nothing + * due to dentry and inode caches being referenced by a task running + * in non-root memcg. 
+ */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); + zfs_exit(zfsvfs, FTAG); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 998bb7cf6f4c..5e70d95e510f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2631,7 +2631,7 @@ arc_space_consume(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, space); + aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); @@ -2677,7 +2677,7 @@ arc_space_return(uint64_t space, arc_space_type_t type) ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: - ARCSTAT_INCR(arcstat_dnode_size, -space); + aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); @@ -4490,7 +4490,7 @@ arc_evict(void) * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; - int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); + int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) @@ -5082,11 +5082,13 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ - int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - + int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - zfs_max_recordsize; + int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - + arc_dnode_limit; /* Always allow at least one block of overflow. */ - if (over < 0) + if (arc_over < 0 && dn_over <= 0) return (ARC_OVF_NONE); /* If we are under memory pressure, report severe overflow. 
*/ @@ -5097,7 +5099,7 @@ arc_is_overflowing(boolean_t lax, boolean_t use_reserve) int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; if (use_reserve) overflow *= 3; - return (over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); + return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * @@ -7324,7 +7326,7 @@ arc_kstat_update(kstat_t *ksp, int rw) #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + - wmsum_value(&arc_sums.arcstat_dnode_size) + + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif @@ -7366,7 +7368,7 @@ arc_kstat_update(kstat_t *ksp, int rw) &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = - wmsum_value(&arc_sums.arcstat_dnode_size); + aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = @@ -7736,7 +7738,7 @@ arc_state_init(void) wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); - wmsum_init(&arc_sums.arcstat_dnode_size, 0); + aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); @@ -7895,7 +7897,7 @@ arc_state_fini(void) wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); - wmsum_fini(&arc_sums.arcstat_dnode_size); + aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); From c405a7a35cf9ff56848a8bc2888dbed3bc51c6c4 Mon Sep 17 00:00:00 2001 From: khoang98 <43098119+khoang98@users.noreply.github.com> Date: Fri, 1 Aug 2025 19:47:41 -0400 Subject: [PATCH 23/61] Skip dbuf_evict_one() from dbuf_evict_notify() for 
reclaim thread Avoid calling dbuf_evict_one() from memory reclaim contexts (e.g. Linux kswapd, FreeBSD pagedaemon). This prevents deadlock caused by reclaim threads waiting for the dbuf hash lock in the call sequence: dbuf_evict_one -> dbuf_destroy -> arc_buf_destroy Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Kaitlin Hoang Closes #17561 --- include/os/freebsd/spl/sys/misc.h | 5 +++++ include/os/linux/spl/sys/misc.h | 6 ++++++ include/sys/zfs_context.h | 5 +++++ module/os/freebsd/spl/spl_misc.c | 9 +++++++++ module/os/linux/spl/spl-thread.c | 12 ++++++++++++ module/zfs/dbuf.c | 10 +++++++++- 6 files changed, 46 insertions(+), 1 deletion(-) diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h index 091ebe772810..acce8734b2c5 100644 --- a/include/os/freebsd/spl/sys/misc.h +++ b/include/os/freebsd/spl/sys/misc.h @@ -56,4 +56,9 @@ struct opensolaris_utsname { #define task_io_account_read(n) #define task_io_account_write(n) +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/include/os/linux/spl/sys/misc.h b/include/os/linux/spl/sys/misc.h index 0b44786f8a6e..fbaaf229bd1a 100644 --- a/include/os/linux/spl/sys/misc.h +++ b/include/os/linux/spl/sys/misc.h @@ -24,7 +24,13 @@ #define _OS_LINUX_SPL_MISC_H #include +#include extern void spl_signal_kobj_evt(struct block_device *bdev); +/* + * Check if the current thread is a memory reclaim thread. + */ +extern int current_is_reclaim_thread(void); + #endif diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index b3d48e257538..272b22174fba 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -236,6 +236,11 @@ typedef pthread_t kthread_t; #define thread_join(t) pthread_join((pthread_t)(t), NULL) #define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) +/* + * Check if the current thread is a memory reclaim thread. 
+ * Always returns false in userspace (no memory reclaim thread). + */ +#define current_is_reclaim_thread() (0) /* in libzpool, p0 exists only to have its address taken */ typedef struct proc { diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c index f9125a067cd1..3f360d167b17 100644 --- a/module/os/freebsd/spl/spl_misc.c +++ b/module/os/freebsd/spl/spl_misc.c @@ -101,6 +101,15 @@ spl_panic(const char *file, const char *func, int line, const char *fmt, ...) va_end(ap); } +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if curproc is pageproc (FreeBSD's page daemon). + */ +int +current_is_reclaim_thread(void) +{ + return (curproc == pageproc); +} SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, opensolaris_utsname_init, NULL); diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 1398483a3ac8..f42f455222de 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * Thread interfaces @@ -197,3 +198,14 @@ issig(void) } EXPORT_SYMBOL(issig); + +/* + * Check if the current thread is a memory reclaim thread. + * Returns true if current thread is kswapd. + */ +int +current_is_reclaim_thread(void) +{ + return (current_is_kswapd()); +} +EXPORT_SYMBOL(current_is_reclaim_thread); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 9a7ef5c1e8f9..63f801c59818 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -866,8 +866,16 @@ dbuf_evict_notify(uint64_t size) * and grabbing the lock results in massive lock contention. */ if (size > dbuf_cache_target_bytes()) { - if (size > dbuf_cache_hiwater_bytes()) + /* + * Avoid calling dbuf_evict_one() from memory reclaim context + * (e.g. Linux kswapd, FreeBSD pagedaemon) to prevent deadlocks. + * Memory reclaim threads can get stuck waiting for the dbuf + * hash lock. 
+ */ + if (size > dbuf_cache_hiwater_bytes() && + !current_is_reclaim_thread()) { dbuf_evict_one(); + } cv_signal(&dbuf_evict_cv); } } From abb6211e7a2837229a2307da1f743e13b743e870 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 24 Jun 2025 05:51:02 +1000 Subject: [PATCH 24/61] Linux 6.16: remove writepage and readahead_page Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #17443 --- config/kernel-pagemap-readahead-page.m4 | 23 ++++++++++++ ...l-readpages.m4 => kernel-vfs-readpages.m4} | 0 config/kernel-vfs-writepage.m4 | 24 +++++++++++++ config/kernel.m4 | 4 +++ include/os/linux/Makefile.am | 1 + .../os/linux/kernel/linux/pagemap_compat.h | 36 +++++++++++++++++++ module/os/linux/zfs/zpl_file.c | 9 ++--- 7 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 config/kernel-pagemap-readahead-page.m4 rename config/{kernel-readpages.m4 => kernel-vfs-readpages.m4} (100%) create mode 100644 config/kernel-vfs-writepage.m4 create mode 100644 include/os/linux/kernel/linux/pagemap_compat.h diff --git a/config/kernel-pagemap-readahead-page.m4 b/config/kernel-pagemap-readahead-page.m4 new file mode 100644 index 000000000000..30f3d56682fb --- /dev/null +++ b/config/kernel-pagemap-readahead-page.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # Linux 6.16 removed readahead_page +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE], [ + ZFS_LINUX_TEST_SRC([pagemap_has_readahead_page], [ + #include + ], [ + struct page *p __attribute__ ((unused)) = NULL; + struct readahead_control *ractl __attribute__ ((unused)) = NULL; + p = readahead_page(ractl); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE], [ + AC_MSG_CHECKING([whether readahead_page() exists]) + ZFS_LINUX_TEST_RESULT([pagemap_has_readahead_page], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_PAGEMAP_READAHEAD_PAGE, 1, + [readahead_page() exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-readpages.m4 b/config/kernel-vfs-readpages.m4 similarity 
index 100% rename from config/kernel-readpages.m4 rename to config/kernel-vfs-readpages.m4 diff --git a/config/kernel-vfs-writepage.m4 b/config/kernel-vfs-writepage.m4 new file mode 100644 index 000000000000..d438e85b457c --- /dev/null +++ b/config/kernel-vfs-writepage.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # Linux 6.16 removes address_space_operations ->writepage +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE], [ + ZFS_LINUX_TEST_SRC([vfs_has_writepage], [ + #include + + static const struct address_space_operations + aops __attribute__ ((unused)) = { + .writepage = NULL, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_WRITEPAGE], [ + AC_MSG_CHECKING([whether aops->writepage exists]) + ZFS_LINUX_TEST_RESULT([vfs_has_writepage], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_WRITEPAGE, 1, + [address_space_operations->writepage exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index c5482da6425f..e3e7625db7d8 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -82,6 +82,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_MIGRATEPAGE ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_READPAGES + ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE @@ -111,6 +112,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_STRLCPY ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT + ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE @@ -198,6 +200,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_MIGRATEPAGE ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_READPAGES + ZFS_AC_KERNEL_VFS_WRITEPAGE ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE @@ -227,6 +230,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ 
ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_STRLCPY ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT + ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index b7bdd892ec1d..4fe6705defe5 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -8,6 +8,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ + %D%/kernel/linux/pagemap_compat.h \ %D%/kernel/linux/simd.h \ %D%/kernel/linux/simd_aarch64.h \ %D%/kernel/linux/simd_arm.h \ diff --git a/include/os/linux/kernel/linux/pagemap_compat.h b/include/os/linux/kernel/linux/pagemap_compat.h new file mode 100644 index 000000000000..a0465ede0105 --- /dev/null +++ b/include/os/linux/kernel/linux/pagemap_compat.h @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Rob Norris + */ + +#ifndef _ZFS_PAGEMAP_COMPAT_H +#define _ZFS_PAGEMAP_COMPAT_H + +#include + +#ifndef HAVE_PAGEMAP_READAHEAD_PAGE +#define readahead_page(ractl) (&(__readahead_folio(ractl)->page)) +#endif + +#endif diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index d27e8e8e8a0e..5cc55f3ee3bd 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -36,10 +36,7 @@ #include #include #include -#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ - defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) -#include -#endif +#include #include #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include @@ -555,6 +552,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) return (result); } +#ifdef HAVE_VFS_WRITEPAGE /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). 
Mapped pages may be dirtied by memory operations @@ -571,6 +569,7 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) return (zpl_putpage(pp, wbc, &for_sync)); } +#endif /* * The flag combination which matches the behavior of zfs_space() is @@ -1040,7 +1039,9 @@ const struct address_space_operations zpl_address_space_operations = { #else .readpage = zpl_readpage, #endif +#ifdef HAVE_VFS_WRITEPAGE .writepage = zpl_writepage, +#endif .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS From 809b553940e6a4c8df38bcc3172c46022c31548b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 12 May 2025 13:22:17 -0400 Subject: [PATCH 25/61] Introduce zfs rewrite subcommand (#17246) This allows to rewrite content of specified file(s) as-is without modifications, but at a different location, compression, checksum, dedup, copies and other parameter values. It is faster than read plus write, since it does not require data copying to user-space. It is also faster for sync=always datasets, since without data modification it does not require ZIL writing. Also since it is protected by normal range range locks, it can be done under any other load. Also it does not affect file's modification time or other properties. Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Reviewed-by: Tony Hutter Reviewed-by: Rob Norris --- cmd/zfs/zfs_main.c | 204 +++++++++++++++++- contrib/debian/openzfs-zfsutils.install | 1 + include/sys/fs/zfs.h | 9 + include/sys/zfs_vnops.h | 1 + man/Makefile.am | 1 + man/man8/zfs-rewrite.8 | 76 +++++++ man/man8/zfs.8 | 8 +- module/os/freebsd/zfs/zfs_vnops_os.c | 12 ++ module/os/linux/zfs/zpl_file.c | 23 ++ module/zfs/zfs_vnops.c | 137 ++++++++++++ tests/runfiles/common.run | 4 + tests/runfiles/sanity.run | 4 + tests/zfs-tests/tests/Makefile.am | 3 + .../cli_root/zfs_rewrite/cleanup.ksh | 26 +++ .../functional/cli_root/zfs_rewrite/setup.ksh | 28 +++ .../cli_root/zfs_rewrite/zfs_rewrite.ksh | 104 +++++++++ 16 files changed, 636 insertions(+), 5 deletions(-) create mode 100644 man/man8/zfs-rewrite.8 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 7db2273cd570..a16e9148420a 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,7 @@ static int zfs_do_change_key(int argc, char **argv); static int zfs_do_project(int argc, char **argv); static int zfs_do_version(int argc, char **argv); static int zfs_do_redact(int argc, char **argv); +static int zfs_do_rewrite(int argc, char **argv); static int zfs_do_wait(int argc, char **argv); #ifdef __FreeBSD__ @@ -193,6 +195,7 @@ typedef enum { HELP_CHANGE_KEY, HELP_VERSION, HELP_REDACT, + HELP_REWRITE, HELP_JAIL, HELP_UNJAIL, HELP_WAIT, @@ -227,7 +230,7 @@ static zfs_command_t command_table[] = { { "promote", zfs_do_promote, HELP_PROMOTE }, { "rename", zfs_do_rename, HELP_RENAME }, { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, - { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, + { "diff", zfs_do_diff, 
HELP_DIFF }, { NULL }, { "list", zfs_do_list, HELP_LIST }, { NULL }, @@ -249,27 +252,31 @@ static zfs_command_t command_table[] = { { NULL }, { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, + { "redact", zfs_do_redact, HELP_REDACT }, { NULL }, { "allow", zfs_do_allow, HELP_ALLOW }, - { NULL }, { "unallow", zfs_do_unallow, HELP_UNALLOW }, { NULL }, { "hold", zfs_do_hold, HELP_HOLD }, { "holds", zfs_do_holds, HELP_HOLDS }, { "release", zfs_do_release, HELP_RELEASE }, - { "diff", zfs_do_diff, HELP_DIFF }, + { NULL }, { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, - { "redact", zfs_do_redact, HELP_REDACT }, + { NULL }, + { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, + { "rewrite", zfs_do_rewrite, HELP_REWRITE }, { "wait", zfs_do_wait, HELP_WAIT }, #ifdef __FreeBSD__ + { NULL }, { "jail", zfs_do_jail, HELP_JAIL }, { "unjail", zfs_do_unjail, HELP_UNJAIL }, #endif #ifdef __linux__ + { NULL }, { "zone", zfs_do_zone, HELP_ZONE }, { "unzone", zfs_do_unzone, HELP_UNZONE }, #endif @@ -432,6 +439,9 @@ get_usage(zfs_help_t idx) case HELP_REDACT: return (gettext("\tredact " " ...\n")); + case HELP_REWRITE: + return (gettext("\trewrite [-rvx] [-o ] [-l ] " + "\n")); case HELP_JAIL: return (gettext("\tjail \n")); case HELP_UNJAIL: @@ -9032,6 +9042,192 @@ zfs_do_project(int argc, char **argv) return (ret); } +static int +zfs_rewrite_file(const char *path, boolean_t verbose, zfs_rewrite_args_t *args) +{ + int fd, ret = 0; + + fd = open(path, O_WRONLY); + if (fd < 0) { + ret = errno; + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + path, strerror(errno)); + return (ret); + } + + if (ioctl(fd, ZFS_IOC_REWRITE, args) < 0) { + ret = errno; + (void) fprintf(stderr, gettext("failed to rewrite %s: %s\n"), + path, strerror(errno)); + } else if (verbose) { + printf("%s\n", path); + } + + close(fd); + return (ret); +} + 
+static int +zfs_rewrite_dir(const char *path, boolean_t verbose, boolean_t xdev, dev_t dev, + zfs_rewrite_args_t *args, nvlist_t *dirs) +{ + struct dirent *ent; + DIR *dir; + int ret = 0, err; + + dir = opendir(path); + if (dir == NULL) { + if (errno == ENOENT) + return (0); + ret = errno; + (void) fprintf(stderr, gettext("failed to opendir %s: %s\n"), + path, strerror(errno)); + return (ret); + } + + size_t plen = strlen(path) + 1; + while ((ent = readdir(dir)) != NULL) { + char *fullname; + struct stat st; + + if (ent->d_type != DT_REG && ent->d_type != DT_DIR) + continue; + + if (strcmp(ent->d_name, ".") == 0 || + strcmp(ent->d_name, "..") == 0) + continue; + + if (plen + strlen(ent->d_name) >= PATH_MAX) { + (void) fprintf(stderr, gettext("path too long %s/%s\n"), + path, ent->d_name); + ret = ENAMETOOLONG; + continue; + } + + if (asprintf(&fullname, "%s/%s", path, ent->d_name) == -1) { + (void) fprintf(stderr, + gettext("failed to allocate memory\n")); + ret = ENOMEM; + continue; + } + + if (xdev) { + if (lstat(fullname, &st) < 0) { + ret = errno; + (void) fprintf(stderr, + gettext("failed to stat %s: %s\n"), + fullname, strerror(errno)); + free(fullname); + continue; + } + if (st.st_dev != dev) { + free(fullname); + continue; + } + } + + if (ent->d_type == DT_REG) { + err = zfs_rewrite_file(fullname, verbose, args); + if (err) + ret = err; + } else { /* DT_DIR */ + fnvlist_add_uint64(dirs, fullname, dev); + } + + free(fullname); + } + + closedir(dir); + return (ret); +} + +static int +zfs_rewrite_path(const char *path, boolean_t verbose, boolean_t recurse, + boolean_t xdev, zfs_rewrite_args_t *args, nvlist_t *dirs) +{ + struct stat st; + int ret = 0; + + if (lstat(path, &st) < 0) { + ret = errno; + (void) fprintf(stderr, gettext("failed to stat %s: %s\n"), + path, strerror(errno)); + return (ret); + } + + if (S_ISREG(st.st_mode)) { + ret = zfs_rewrite_file(path, verbose, args); + } else if (S_ISDIR(st.st_mode) && recurse) { + ret = zfs_rewrite_dir(path, 
verbose, xdev, st.st_dev, args, + dirs); + } + return (ret); +} + +static int +zfs_do_rewrite(int argc, char **argv) +{ + int ret = 0, err, c; + boolean_t recurse = B_FALSE, verbose = B_FALSE, xdev = B_FALSE; + + if (argc < 2) + usage(B_FALSE); + + zfs_rewrite_args_t args; + memset(&args, 0, sizeof (args)); + + while ((c = getopt(argc, argv, "l:o:rvx")) != -1) { + switch (c) { + case 'l': + args.len = strtoll(optarg, NULL, 0); + break; + case 'o': + args.off = strtoll(optarg, NULL, 0); + break; + case 'r': + recurse = B_TRUE; + break; + case 'v': + verbose = B_TRUE; + break; + case 'x': + xdev = B_TRUE; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argv += optind; + argc -= optind; + if (argc == 0) { + (void) fprintf(stderr, + gettext("missing file or directory target(s)\n")); + usage(B_FALSE); + } + + nvlist_t *dirs = fnvlist_alloc(); + for (int i = 0; i < argc; i++) { + err = zfs_rewrite_path(argv[i], verbose, recurse, xdev, &args, + dirs); + if (err) + ret = err; + } + nvpair_t *dir; + while ((dir = nvlist_next_nvpair(dirs, NULL)) != NULL) { + err = zfs_rewrite_dir(nvpair_name(dir), verbose, xdev, + fnvpair_value_uint64(dir), &args, dirs); + if (err) + ret = err; + fnvlist_remove_nvpair(dirs, dir); + } + fnvlist_free(dirs); + + return (ret); +} + static int zfs_do_wait(int argc, char **argv) { diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 546745930bff..4573cc77ea74 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -73,6 +73,7 @@ usr/share/man/man8/zfs-recv.8 usr/share/man/man8/zfs-redact.8 usr/share/man/man8/zfs-release.8 usr/share/man/man8/zfs-rename.8 +usr/share/man/man8/zfs-rewrite.8 usr/share/man/man8/zfs-rollback.8 usr/share/man/man8/zfs-send.8 usr/share/man/man8/zfs-set.8 diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 2d27aee217e0..bfaea6fb3fde 100644 --- 
a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1614,6 +1614,15 @@ typedef enum zfs_ioc { #endif +typedef struct zfs_rewrite_args { + uint64_t off; + uint64_t len; + uint64_t flags; + uint64_t arg; +} zfs_rewrite_args_t; + +#define ZFS_IOC_REWRITE _IOW(0x83, 3, zfs_rewrite_args_t) + /* * ZFS-specific error codes used for returning descriptive errors * to the userland through zfs ioctls. diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 21f0da4fe6b4..08cf0e2a6e48 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -40,6 +40,7 @@ extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *, uint64_t *, cred_t *); extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t, const blkptr_t *, size_t); +extern int zfs_rewrite(znode_t *, uint64_t, uint64_t, uint64_t, uint64_t); extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); diff --git a/man/Makefile.am b/man/Makefile.am index fde704933764..6a7b2d3e46b7 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -50,6 +50,7 @@ dist_man_MANS = \ %D%/man8/zfs-redact.8 \ %D%/man8/zfs-release.8 \ %D%/man8/zfs-rename.8 \ + %D%/man8/zfs-rewrite.8 \ %D%/man8/zfs-rollback.8 \ %D%/man8/zfs-send.8 \ %D%/man8/zfs-set.8 \ diff --git a/man/man8/zfs-rewrite.8 b/man/man8/zfs-rewrite.8 new file mode 100644 index 000000000000..423d6d439e28 --- /dev/null +++ b/man/man8/zfs-rewrite.8 @@ -0,0 +1,76 @@ +.\" SPDX-License-Identifier: CDDL-1.0 +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or https://opensource.org/licenses/CDDL-1.0. 
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" Copyright (c) 2025 iXsystems, Inc. +.\" +.Dd May 6, 2025 +.Dt ZFS-REWRITE 8 +.Os +. +.Sh NAME +.Nm zfs-rewrite +.Nd rewrite specified files without modification +.Sh SYNOPSIS +.Nm zfs +.Cm rewrite +.Oo Fl rvx Ns Oc +.Op Fl l Ar length +.Op Fl o Ar offset +.Ar file Ns | Ns Ar directory Ns … +. +.Sh DESCRIPTION +Rewrite blocks of specified +.Ar file +as is without modification at a new location and possibly with new +properties, such as checksum, compression, dedup, copies, etc, +as if they were atomically read and written back. +.Bl -tag -width "-r" +.It Fl l Ar length +Rewrite at most this number of bytes. +.It Fl o Ar offset +Start at this offset in bytes. +.It Fl r +Recurse into directories. +.It Fl v +Print names of all successfully rewritten files. +.It Fl x +Don't cross file system mount points when recursing. +.El +.Sh NOTES +Rewrite of cloned blocks and blocks that are part of any snapshots, +same as some property changes may increase pool space usage. +Holes that were never written or were previously zero-compressed are +not rewritten and will remain holes even if compression is disabled. +.Pp +Rewritten blocks will be seen as modified in next snapshot and as such +included into the incremental +.Nm zfs Cm send +stream. +.Pp +If a +.Fl l +or +.Fl o +value request a rewrite to regions past the end of the file, then those +regions are silently ignored, and no error is reported. +. 
+.Sh SEE ALSO +.Xr zfsprops 7 diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 5bdeb7f9e455..e16a3a82b672 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -37,7 +37,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 12, 2022 +.Dd April 18, 2025 .Dt ZFS 8 .Os . @@ -299,6 +299,12 @@ Execute ZFS administrative operations programmatically via a Lua script-language channel program. .El . +.Ss Data rewrite +.Bl -tag -width "" +.It Xr zfs-rewrite 8 +Rewrite specified files without modification. +.El +. .Ss Jails .Bl -tag -width "" .It Xr zfs-jail 8 diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 16b0a1d4bcd6..68367f105691 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -306,6 +306,18 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, *(offset_t *)data = off; return (0); } + case ZFS_IOC_REWRITE: { + zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data; + if ((flag & FWRITE) == 0) + return (SET_ERROR(EBADF)); + error = vn_lock(vp, LK_SHARED); + if (error) + return (error); + error = zfs_rewrite(VTOZ(vp), args->off, args->len, + args->flags, args->arg); + VOP_UNLOCK(vp); + return (error); + } } return (SET_ERROR(ENOTTY)); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 5cc55f3ee3bd..1a82c13e1523 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -984,6 +984,27 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg) return (err); } +static int +zpl_ioctl_rewrite(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + zfs_rewrite_args_t args; + fstrans_cookie_t cookie; + int err; + + if (copy_from_user(&args, arg, sizeof (args))) + return (-EFAULT); + + if (unlikely(!(filp->f_mode & FMODE_WRITE))) + return (-EBADF); + + cookie = spl_fstrans_mark(); + err = -zfs_rewrite(ITOZ(ip), args.off, args.len, args.flags, 
args.arg); + spl_fstrans_unmark(cookie); + + return (err); +} + static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -1002,6 +1023,8 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); + case ZFS_IOC_REWRITE: + return (zpl_ioctl_rewrite(filp, (void *)arg)); default: return (-ENOTTY); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4ef391ed7729..a419e144cc74 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1059,6 +1059,143 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (0); } +/* + * Rewrite a range of file as-is without modification. + * + * IN: zp - znode of file to be rewritten. + * off - Offset of the range to rewrite. + * len - Length of the range to rewrite. + * flags - Random rewrite parameters. + * arg - flags-specific argument. + * + * RETURN: 0 if success + * error code if failure + */ +int +zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, + uint64_t arg) +{ + int error; + + if (flags != 0 || arg != 0) + return (SET_ERROR(EINVAL)); + + zfsvfs_t *zfsvfs = ZTOZSB(zp); + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + if (zfs_is_readonly(zfsvfs)) { + zfs_exit(zfsvfs, FTAG); + return (SET_ERROR(EROFS)); + } + + if (off >= zp->z_size) { + zfs_exit(zfsvfs, FTAG); + return (0); + } + if (len == 0 || len > zp->z_size - off) + len = zp->z_size - off; + + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(zp, off, off + len - 1)) + zn_flush_cached_data(zp, B_TRUE); + + zfs_locked_range_t *lr; + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); + const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); + const uint64_t projid = zp->z_projid; + + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + 
DB_DNODE_ENTER(db); + dnode_t *dn = DB_DNODE(db); + + uint64_t n, noff = off, nr = 0, nw = 0; + while (len > 0) { + /* + * Rewrite only actual data, skipping any holes. This might + * be inaccurate for dirty files, but we don't really care. + */ + if (noff == off) { + /* Find next data in the file. */ + error = dnode_next_offset(dn, 0, &noff, 1, 1, 0); + if (error || noff >= off + len) { + if (error == ESRCH) /* No more data. */ + error = 0; + break; + } + ASSERT3U(noff, >=, off); + len -= noff - off; + off = noff; + + /* Find where the data end. */ + error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff, + 1, 1, 0); + if (error != 0) + noff = off + len; + } + ASSERT3U(noff, >, off); + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + n = MIN(MIN(len, noff - off), + DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz)); + + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write_by_dnode(tx, dn, off, n); + error = dmu_tx_assign(tx, DMU_TX_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + + /* Mark all dbufs within range as dirty to trigger rewrite. 
*/ + dmu_buf_t **dbp; + int numbufs; + error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG, + &numbufs, &dbp, DMU_READ_PREFETCH); + if (error) { + dmu_tx_abort(tx); + break; + } + for (int i = 0; i < numbufs; i++) { + nr += dbp[i]->db_size; + if (dmu_buf_is_dirty(dbp[i], tx)) + continue; + nw += dbp[i]->db_size; + dmu_buf_will_dirty(dbp[i], tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + dmu_tx_commit(tx); + + len -= n; + off += n; + + if (issig()) { + error = SET_ERROR(EINTR); + break; + } + } + + DB_DNODE_EXIT(db); + + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr); + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw); + + zfs_rangelock_exit(lr); + zfs_exit(zfsvfs, FTAG); + return (error); +} + int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f46975beac00..bbe17b073cbf 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -306,6 +306,10 @@ tags = ['functional', 'cli_root', 'zfs_rename'] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] +[tests/functional/cli_root/zfs_rewrite] +tests = ['zfs_rewrite'] +tags = ['functional', 'cli_root', 'zfs_rewrite'] + [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] diff --git a/tests/runfiles/sanity.run b/tests/runfiles/sanity.run index 64018e93eb6d..9664e445ae16 100644 --- a/tests/runfiles/sanity.run +++ b/tests/runfiles/sanity.run @@ -194,6 +194,10 @@ tags = ['functional', 'cli_root', 'zfs_rename'] tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] tags = ['functional', 'cli_root', 'zfs_reservation'] +[tests/functional/cli_root/zfs_rewrite] +tests = ['zfs_rewrite'] +tags = ['functional', 'cli_root', 'zfs_rewrite'] + [tests/functional/cli_root/zfs_rollback] tests = ['zfs_rollback_003_neg', 
'zfs_rollback_004_neg'] tags = ['functional', 'cli_root', 'zfs_rollback'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 218552b6d897..64c54815b6c9 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -863,6 +863,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ + functional/cli_root/zfs_rewrite/cleanup.ksh \ + functional/cli_root/zfs_rewrite/setup.ksh \ + functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/cleanup.ksh new file mode 100755 index 000000000000..5e73dd34936e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/cleanup.ksh @@ -0,0 +1,26 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/setup.ksh new file mode 100755 index 000000000000..dddfdf8a4679 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/setup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. 
$STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite.ksh new file mode 100755 index 000000000000..d1c0b3c64c27 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_rewrite/zfs_rewrite.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, iXsystems, Inc. +# + +# DESCRIPTION: +# Verify zfs rewrite rewrites specified files blocks. +# +# STRATEGY: +# 1. Create two files, one of which is in a directory. +# 2. Save the checksums and block pointers. +# 3. Rewrite part of the files. +# 4. Verify checksums are the same. +# 5. Verify block pointers of the rewritten part have changed. +# 6. Rewrite all the files. +# 7. Verify checksums are the same. +# 8. Verify all block pointers have changed. + +. 
$STF_SUITE/include/libtest.shlib + +typeset tmp=$(mktemp) +typeset bps=$(mktemp) +typeset bps1=$(mktemp) +typeset bps2=$(mktemp) + +function cleanup +{ + rm -rf $tmp $bps $bps1 $bps2 $TESTDIR/* +} + +log_assert "zfs rewrite rewrites specified files blocks" + +log_onexit cleanup + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +log_must mkdir $TESTDIR/dir +log_must dd if=/dev/urandom of=$TESTDIR/file1 bs=128k count=8 +log_must dd if=$TESTDIR/file1 of=$TESTDIR/dir/file2 bs=128k +log_must sync_pool $TESTPOOL +typeset orig_hash1=$(xxh128digest $TESTDIR/file1) +typeset orig_hash2=$(xxh128digest $TESTDIR/dir/file2) + +log_must [ "$orig_hash1" = "$orig_hash2" ] +log_must eval "zdb -Ovv $TESTPOOL/$TESTFS file1 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps1" +log_must eval "zdb -Ovv $TESTPOOL/$TESTFS dir/file2 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps2" + +log_must zfs rewrite -o 327680 -l 262144 -r -x $TESTDIR/file1 $TESTDIR/dir/file2 +log_must sync_pool $TESTPOOL +typeset new_hash1=$(xxh128digest $TESTDIR/file1) +typeset new_hash2=$(xxh128digest $TESTDIR/dir/file2) +log_must [ "$orig_hash1" = "$new_hash1" ] +log_must [ "$orig_hash2" = "$new_hash2" ] + +log_must eval "zdb -Ovv $TESTPOOL/$TESTFS file1 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps" +typeset same=$(echo $(sort -n $bps $bps1 | uniq -d | cut -f1 -d' ')) +log_must [ "$same" = "0 1 5 6 7" ] +log_must eval "zdb -Ovv $TESTPOOL/$TESTFS dir/file2 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps" +typeset same=$(echo $(sort -n $bps $bps2 | uniq -d | cut -f1 -d' ')) +log_must [ "$same" = "0 1 5 6 7" ] + +log_must zfs rewrite -r $TESTDIR/file1 $TESTDIR/dir/file2 +log_must sync_pool $TESTPOOL +typeset new_hash1=$(xxh128digest $TESTDIR/file1) +typeset new_hash2=$(xxh128digest $TESTDIR/dir/file2) +log_must [ "$orig_hash1" = "$new_hash1" ] +log_must [ "$orig_hash2" = "$new_hash2" ] + +log_must eval 
"zdb -Ovv $TESTPOOL/$TESTFS file1 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps" +typeset same=$(echo $(sort -n $bps $bps1 | uniq -d | cut -f1 -d' ')) +log_must [ -z "$same" ] +log_must eval "zdb -Ovv $TESTPOOL/$TESTFS dir/file2 > $tmp" +log_must eval "awk '/ L0 / { print l++ \" \" \$3 }' < $tmp > $bps" +typeset same=$(echo $(sort -n $bps $bps2 | uniq -d | cut -f1 -d' ')) +log_must [ -z "$same" ] + +log_pass From 22eb2bdce31fa0f5688f48cbfcdd4edbf3c795ee Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 10 Jun 2025 12:30:06 -0400 Subject: [PATCH 26/61] Make TX abort after assign safer It is not right, but there are few examples when TX is aborted after being assigned in case of error. To handle it better on production systems add extra cleanup steps. While here, replace couple dmu_tx_abort() in simple cases. Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Reviewed-by: Igor Kozhukhov Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #17438 --- module/zfs/zfs_vnops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a419e144cc74..9489890702c5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1163,7 +1163,7 @@ zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (error) { - dmu_tx_abort(tx); + dmu_tx_commit(tx); break; } for (int i = 0; i < numbufs; i++) { From 1d293b377a33205a6c07da799b903e89a319d1e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Mon, 16 Jun 2025 17:12:09 +0200 Subject: [PATCH 27/61] Linux build: handle CONFIG_OBJTOOL_WERROR=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Linux 5.16 by default fails the build on objtool warnings. 
We have known and understood objtool warnings we can't fix without involving Linux maintainers. To work around this we introduce an objtool wrapper script which removes the `--Werror` flag. Reviewed-by: Brian Behlendorf Signed-off-by: Attila Fülöp Closes #17456 --- config/kernel-objtool.m4 | 17 ++++++++++++++++ config/zfs-build.m4 | 40 ++++++++++++++++++++++++++++++++++++++ configure.ac | 2 ++ module/Makefile.in | 1 + scripts/.gitignore | 1 + scripts/objtool-wrapper.in | 36 ++++++++++++++++++++++++++++++++++ scripts/spdxcheck.pl | 1 + 7 files changed, 98 insertions(+) create mode 100644 scripts/objtool-wrapper.in diff --git a/config/kernel-objtool.m4 b/config/kernel-objtool.m4 index e616ccebcbc0..3020440eb388 100644 --- a/config/kernel-objtool.m4 +++ b/config/kernel-objtool.m4 @@ -49,6 +49,15 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [ #error "STACK_FRAME_NON_STANDARD is not defined." #endif ]) + + dnl # 6.15 made CONFIG_OBJTOOL_WERROR=y the default. We need to handle + dnl # this or our build will fail. + ZFS_LINUX_TEST_SRC([config_objtool_werror], [ + #if !defined(CONFIG_OBJTOOL_WERROR) + #error "CONFIG_OBJTOOL_WERROR is not defined." + #endif + ]) + ]) AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ @@ -84,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [ ],[ AC_MSG_RESULT(no) ]) + + AC_MSG_CHECKING([whether CONFIG_OBJTOOL_WERROR is defined]) + ZFS_LINUX_TEST_RESULT([config_objtool_werror],[ + AC_MSG_RESULT(yes) + CONFIG_OBJTOOL_WERROR_DEFINED=yes + ],[ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 55fc029f0847..7cf1b02d8757 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -205,6 +205,46 @@ AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ AC_MSG_RESULT([$enable_invariants]) ]) +dnl # Disabled by default. If enabled allows a configured "turn objtools +dnl # warnings into errors" (CONFIG_OBJTOOL_WERROR) behavior to take effect. +dnl # If disabled, objtool warnings are never turned into errors. 
It can't +dnl # be enabled if the kernel wasn't compiled with CONFIG_OBJTOOL_WERROR=y. +dnl # +AC_DEFUN([ZFS_AC_OBJTOOL_WERROR], [ + AC_MSG_CHECKING([whether objtool error on warning behavior is enabled]) + AC_ARG_ENABLE([objtool-werror], + [AS_HELP_STRING([--enable-objtool-werror], + [Enable objtool's error on warning behaviour if present @<:@default=no@:>@])], + [enable_objtool_werror=$enableval], + [enable_objtool_werror=no]) + AC_MSG_RESULT([$enable_objtool_werror]) + + AS_IF([test x$CONFIG_OBJTOOL_WERROR_DEFINED = xyes],[ + AS_IF([test x$enable_objtool_werror = xyes],[ + AC_MSG_NOTICE([enable-objtool-werror defined, keeping -Werror ]) + ],[ + AC_MSG_NOTICE([enable-objtool-werror undefined, disabling -Werror ]) + OBJTOOL_DISABLE_WERROR=y + abs_objtool_binary=$kernelsrc/tools/objtool/objtool + AS_IF([test -x $abs_objtool_binary],[],[ + AC_MSG_ERROR([*** objtool binary $abs_objtool_binary not found]) + ]) + dnl # The path to the wrapper is defined in modules/Makefile.in. + ]) + ],[ + dnl # We can't enable --Werror if it's not there. + AS_IF([test x$enable_objtool_werror = xyes],[ + AC_MSG_ERROR([ + *** Cannot enable objtool-werror, + *** a kernel built with CONFIG_OBJTOOL_WERROR=y is required. 
+ ]) + ],[]) + ]) + + AC_SUBST(OBJTOOL_DISABLE_WERROR) + AC_SUBST(abs_objtool_binary) +]) + AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AX_COUNT_CPUS([]) AC_SUBST(CPU_COUNT) diff --git a/configure.ac b/configure.ac index c05c874affc9..f4b52e1f7abc 100644 --- a/configure.ac +++ b/configure.ac @@ -65,6 +65,7 @@ ZFS_AC_DEBUGINFO ZFS_AC_DEBUG_KMEM ZFS_AC_DEBUG_KMEM_TRACKING ZFS_AC_DEBUG_INVARIANTS +ZFS_AC_OBJTOOL_WERROR AC_CONFIG_FILES([ contrib/debian/rules @@ -86,6 +87,7 @@ AC_CONFIG_FILES([ zfs.release ]) +AC_CONFIG_FILES([scripts/objtool-wrapper], [chmod +x scripts/objtool-wrapper]) AC_OUTPUT diff --git a/module/Makefile.in b/module/Makefile.in index a65cbfce1a90..e9a268121762 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -57,6 +57,7 @@ modules-Linux: $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ + $(if @OBJTOOL_DISABLE_WERROR@,objtool=@top_builddir@/scripts/objtool-wrapper) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: diff --git a/scripts/.gitignore b/scripts/.gitignore index 5621a6e147a0..443cb7b8484e 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1 +1,2 @@ common.sh +objtool-wrapper diff --git a/scripts/objtool-wrapper.in b/scripts/objtool-wrapper.in new file mode 100644 index 000000000000..0451f8718233 --- /dev/null +++ b/scripts/objtool-wrapper.in @@ -0,0 +1,36 @@ +#!/bin/sh + +# SPDX-License-Identifier: MIT +# +# Copyright (c) 2025 Attila Fülöp +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following 
conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Filter out objtools '--Werror' flag. + +objtool="@abs_objtool_binary@" +args=$(echo "$*" | sed s/--Werror//) + +if [ -z "$objtool" ]; then + echo "$(basename "$0"): No objtool binary configured" 1>&2 + exit 1; +fi + +# shellcheck disable=SC2086 +exec "$objtool" $args diff --git a/scripts/spdxcheck.pl b/scripts/spdxcheck.pl index bddda22334a8..47128402f7bc 100755 --- a/scripts/spdxcheck.pl +++ b/scripts/spdxcheck.pl @@ -94,6 +94,7 @@ etc/init.d/zfs-share.in etc/init.d/zfs-zed.in etc/zfs/zfs-functions.in + scripts/objtool-wrapper.in # Misc items that have clear licensing info but aren't easily matched, # or are the first of a class that we aren't ready to match yet. From 86bf73c1ebb2844b883aaca4782f5f41cd5b3f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Tue, 15 Jul 2025 00:10:02 +0200 Subject: [PATCH 28/61] objtool wrapper: use absolute path to call the wrapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Older kernel versions run make outside of the build directory. This works since all paths are absolute. Relative paths will fail in such a scenario. Use an absolute path to the objtool wrapper as well, since the relative path breaks the build on older kernels. 
Reviewed-by: Brian Behlendorf Signed-off-by: Attila Fülöp Closes #17541 --- module/Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/Makefile.in b/module/Makefile.in index e9a268121762..859ba8649dd7 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -57,7 +57,7 @@ modules-Linux: $(if @KERNEL_LD@,LD=@KERNEL_LD@) $(if @KERNEL_LLVM@,LLVM=@KERNEL_LLVM@) \ $(if @KERNEL_CROSS_COMPILE@,CROSS_COMPILE=@KERNEL_CROSS_COMPILE@) \ $(if @KERNEL_ARCH@,ARCH=@KERNEL_ARCH@) \ - $(if @OBJTOOL_DISABLE_WERROR@,objtool=@top_builddir@/scripts/objtool-wrapper) \ + $(if @OBJTOOL_DISABLE_WERROR@,objtool=@abs_top_builddir@/scripts/objtool-wrapper) \ M="$$PWD" @KERNEL_MAKE@ CONFIG_ZFS=m modules modules-FreeBSD: From a826f7a993a6db10a1f6596c60df6a590ff2753d Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 18 Jun 2025 16:19:21 +0200 Subject: [PATCH 29/61] ZTS: Use FreeBSD cloudinit images FreeBSD provides CI-IMAGES since some time. These images are based on nuageinit, which does not support fqdn and sudo for example. So we need currently some workarounds to get it working. The FreeBSD images will be more compatible with cloud-init in some near future. Then we can remove the workaround things. 
These versions are used for testing: - freebsd13-4r (RELEASE) - freebsd14-3s (STABLE) - freebsd15-0c (CURRENT) Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Closes #17462 --- .github/workflows/scripts/qemu-2-start.sh | 143 ++++++++++++++-------- .github/workflows/zfs-qemu.yml | 10 +- 2 files changed, 99 insertions(+), 54 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 28da6700e541..7e20a98c2faf 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -12,10 +12,10 @@ OS="$1" # OS variant (virt-install --os-variant list) OSv=$OS -# compressed with .zst extension -REPO="https://github.com/mcmilk/openzfs-freebsd-images" -FREEBSD="$REPO/releases/download/v2025-04-13" -URLzs="" +# FreeBSD urls's +FREEBSD_REL="https://download.freebsd.org/releases/CI-IMAGES" +FREEBSD_SNAP="https://download.freebsd.org/snapshots/CI-IMAGES" +URLxz="" # Ubuntu mirrors UBMIRROR="https://cloud-images.ubuntu.com" @@ -72,49 +72,56 @@ case "$OS" in URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2" ;; freebsd13-4r) - OSNAME="FreeBSD 13.4-RELEASE" + FreeBSD="13.4-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.4-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" NIC="rtl8139" ;; freebsd13-5r) - OSNAME="FreeBSD 13.5-RELEASE" + FreeBSD="13.5-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.5-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" NIC="rtl8139" ;; - freebsd14-1r) - OSNAME="FreeBSD 14.1-RELEASE" + 
freebsd14-2r) + FreeBSD="14.2-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.1-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" ;; - freebsd14-2r) - OSNAME="FreeBSD 14.2-RELEASE" + freebsd14-3r) + FreeBSD="14.3-RELEASE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.2-RELEASE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" ;; freebsd13-5s) - OSNAME="FreeBSD 13.5-STABLE" + FreeBSD="13.5-STABLE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd13.0" - URLzs="$FREEBSD/amd64-freebsd-13.5-STABLE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" NIC="rtl8139" ;; - freebsd14-2s) - OSNAME="FreeBSD 14.2-STABLE" + freebsd14-3s) + FreeBSD="14.3-STABLE" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-14.2-STABLE.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - OSNAME="FreeBSD 15.0-CURRENT" + FreeBSD="15.0-CURRENT" + OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" - URLzs="$FREEBSD/amd64-freebsd-15.0-CURRENT.qcow2.zst" - BASH="/usr/local/bin/bash" + URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" + KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; tumbleweed) OSNAME="openSUSE Tumbleweed" @@ -168,31 +175,37 @@ echo "CPU=\"$CPU\"" >> $ENV sudo mkdir -p "/mnt/tests" sudo chown -R $(whoami) /mnt/tests +DISK="/dev/zvol/zpool/openzfs" +sudo zfs create -ps -b 64k -V 80g zpool/openzfs +while true; do test -b $DISK && break; sleep 1; done 
+ # we are downloading via axel, curl and wget are mostly slower and # require more return value checking -IMG="/mnt/tests/cloudimg.qcow2" -if [ ! -z "$URLzs" ]; then - echo "Loading image $URLzs ..." - time axel -q -o "$IMG.zst" "$URLzs" - zstd -q -d --rm "$IMG.zst" +IMG="/mnt/tests/cloud-image" +if [ ! -z "$URLxz" ]; then + echo "Loading $URLxz ..." + time axel -q -o "$IMG" "$URLxz" + echo "Loading $KSRC ..." + time axel -q -o ~/src.txz $KSRC else - echo "Loading image $URL ..." + echo "Loading $URL ..." time axel -q -o "$IMG" "$URL" fi -DISK="/dev/zvol/zpool/openzfs" -FORMAT="raw" -sudo zfs create -ps -b 64k -V 80g zpool/openzfs -while true; do test -b $DISK && break; sleep 1; done echo "Importing VM image to zvol..." -sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M +if [ ! -z "$URLxz" ]; then + xzcat -T0 $IMG | sudo dd of=$DISK bs=4M +else + sudo qemu-img dd -f qcow2 -O raw if=$IMG of=$DISK bs=4M +fi rm -f $IMG PUBKEY=$(cat ~/.ssh/id_ed25519.pub) -cat < /tmp/user-data +if [ ${OS:0:7} != "freebsd" ]; then + cat < /tmp/user-data #cloud-config -fqdn: $OS +hostname: $OS users: - name: root @@ -208,6 +221,19 @@ growpart: devices: ['/'] ignore_growroot_disabled: false EOF +else + cat < /tmp/user-data +#cloud-config + +hostname: $OS + +# minimized config without sudo for nuageinit of FreeBSD +growpart: + mode: auto + devices: ['/'] + ignore_growroot_disabled: false +EOF +fi sudo virsh net-update default add ip-dhcp-host \ "" --live --config @@ -223,16 +249,9 @@ sudo virt-install \ --graphics none \ --network bridge=virbr0,model=$NIC,mac='52:54:00:83:79:00' \ --cloud-init user-data=/tmp/user-data \ - --disk $DISK,bus=virtio,cache=none,format=$FORMAT,driver.discard=unmap \ + --disk $DISK,bus=virtio,cache=none,format=raw,driver.discard=unmap \ --import --noautoconsole >/dev/null -# enable KSM on Linux -if [ ${OS:0:7} != "freebsd" ]; then - sudo virsh dommemstat --domain "openzfs" --period 5 - sudo virsh node-memory-tune 100 50 1 - echo 1 | sudo tee 
/sys/kernel/mm/ksm/run > /dev/null -fi - # Give the VMs hostnames so we don't have to refer to them with # hardcoded IP addresses. # @@ -252,3 +271,29 @@ StrictHostKeyChecking no # small timeout, used in while loops later ConnectTimeout 1 EOF + +if [ ${OS:0:7} != "freebsd" ]; then + # enable KSM on Linux + sudo virsh dommemstat --domain "openzfs" --period 5 + sudo virsh node-memory-tune 100 50 1 + echo 1 | sudo tee /sys/kernel/mm/ksm/run > /dev/null +else + # on FreeBSD we need some more init stuff, because of nuageinit + BASH="/usr/local/bin/bash" + while pidof /usr/bin/qemu-system-x86_64 >/dev/null; do + ssh 2>/dev/null root@vm0 "uname -a" && break + done + ssh root@vm0 "pkg install -y bash ca_root_nss git qemu-guest-agent python3 py311-cloud-init" + ssh root@vm0 "chsh -s $BASH root" + ssh root@vm0 'sysrc qemu_guest_agent_enable="YES"' + ssh root@vm0 'sysrc cloudinit_enable="YES"' + ssh root@vm0 "pw add user zfs -w no -s $BASH" + ssh root@vm0 'mkdir -p ~zfs/.ssh' + ssh root@vm0 'echo "zfs ALL=(ALL:ALL) NOPASSWD: ALL" >> /usr/local/etc/sudoers' + ssh root@vm0 'echo "PubkeyAuthentication yes" >> /etc/ssh/sshd_config' + scp ~/.ssh/id_ed25519.pub "root@vm0:~zfs/.ssh/authorized_keys" + ssh root@vm0 'chown -R zfs ~zfs' + ssh root@vm0 'service sshd restart' + scp ~/src.txz "root@vm0:/tmp/src.txz" + ssh root@vm0 'tar -C / -zxf /tmp/src.txz' +fi diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 1d9899ae895f..035d8be7e227 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -39,8 +39,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", "freebsd14-2s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-2r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", 
"fedora41", "fedora42", "freebsd13-4r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -84,9 +84,9 @@ jobs: # rhl: almalinux8, almalinux9, centos-stream9, fedora41 # debian: debian11, debian12, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed - # FreeBSD variants of 2024-12: - # FreeBSD Release: freebsd13-4r, freebsd14-2r - # FreeBSD Stable: freebsd13-4s, freebsd14-2s + # FreeBSD variants of 2025-06: + # FreeBSD Release: freebsd13-4r, freebsd13-5r, freebsd14-1r, freebsd14-2r, freebsd14-3r + # FreeBSD Stable: freebsd13-5s, freebsd14-3s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} runs-on: ubuntu-24.04 From 094305c93739459892240a51df5b2a2a29f1b488 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 19 Jun 2025 14:41:31 -0700 Subject: [PATCH 30/61] Fix TestGroup warning due to missing tags Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Paul Dagnelie Co-authored-by: Paul Dagnelie Closes #17473 --- tests/runfiles/linux.run | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 14c0fcbbb63a..c1ac11eaa886 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -213,6 +213,8 @@ tags = ['functional', 'snapshot'] [tests/functional/syncfs:Linux] tests = ['syncfs_suspend'] tags = ['functional', 'syncfs'] +pre = +post = [tests/functional/tmpfile:Linux] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', From 5289f6f961557bf504c4ef7009d0e5342f10f65a Mon Sep 17 00:00:00 2001 From: Olivier Certner Date: Mon, 30 Jun 2025 16:24:23 +0200 Subject: [PATCH 31/61] spa: ZIO_TASKQ_ISSUE: Use symbolic priority This allows to change the meaning of priority differences in FreeBSD without 
requiring code changes in ZFS. This upstreams commit fd141584cf89d7d2 from FreeBSD src. Sponsored-by: The FreeBSD Foundation Reviewed-by: Alexander Motin Signed-off-by: Olivier Certner Closes #17489 --- include/os/freebsd/spl/sys/proc.h | 4 +++- include/os/linux/spl/sys/sysmacros.h | 4 +++- include/sys/zfs_context.h | 4 +++- module/zfs/spa.c | 21 +++------------------ 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h index a03b815a22a6..c6bc10d6babe 100644 --- a/include/os/freebsd/spl/sys/proc.h +++ b/include/os/freebsd/spl/sys/proc.h @@ -45,7 +45,9 @@ #ifdef _KERNEL #define CPU curcpu #define minclsyspri PRIBIO -#define defclsyspri minclsyspri +#define defclsyspri minclsyspri +/* Write issue taskq priority. */ +#define wtqclsyspri ((PVM + PRIBIO) / 2) #define maxclsyspri PVM #define max_ncpus (mp_maxid + 1) #define boot_max_ncpus (mp_maxid + 1) diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h index e932ea72f1be..db48222b712a 100644 --- a/include/os/linux/spl/sys/sysmacros.h +++ b/include/os/linux/spl/sys/sysmacros.h @@ -92,8 +92,10 @@ * Treat shim tasks as SCHED_NORMAL tasks */ #define minclsyspri (MAX_PRIO-1) -#define maxclsyspri (MAX_RT_PRIO) #define defclsyspri (DEFAULT_PRIO) +/* Write issue taskq priority. */ +#define wtqclsyspri (MAX_RT_PRIO + 1) +#define maxclsyspri (MAX_RT_PRIO) #ifndef NICE_TO_PRIO #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 272b22174fba..c06443b4b71f 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -628,8 +628,10 @@ extern void delay(clock_t ticks); * Process priorities as defined by setpriority(2) and getpriority(2). */ #define minclsyspri 19 -#define maxclsyspri -20 #define defclsyspri 0 +/* Write issue taskq priority. 
*/ +#define wtqclsyspri -19 +#define maxclsyspri -20 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) #define CPU_SEQID_UNSTABLE CPU_SEQID diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 01363181b27e..74a943ef8eb7 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1231,29 +1231,14 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) spa->spa_proc, zio_taskq_basedc, flags); } else { #endif - pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly less important * priority than the other taskqs. - * - * Under Linux and FreeBSD this means incrementing - * the priority value as opposed to platforms like - * illumos where it should be decremented. - * - * On FreeBSD, if priorities divided by four (RQ_PPQ) - * are equal then a difference between them is - * insignificant. */ - if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { -#if defined(__linux__) - pri++; -#elif defined(__FreeBSD__) - pri += 4; -#else -#error "unknown OS" -#endif - } + const pri_t pri = (t == ZIO_TYPE_WRITE && + q == ZIO_TASKQ_ISSUE) ? + wtqclsyspri : maxclsyspri; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef HAVE_SYSDC From 024e60b927d75d263d682d1f35fca9ea00e79fb2 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Mon, 30 Jun 2025 16:16:27 -0700 Subject: [PATCH 32/61] Missing tests in make pkg ``` Warning: TestGroup '/var/tmp/tests/functional/ctime' not added to this run. Auxiliary script '/var/tmp/tests/functional/ctime/setup' failed verification. 
``` Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Chunwei Chen Closes #17491 --- tests/zfs-tests/tests/Makefile.am | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 64c54815b6c9..dc33258b1f7c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1440,6 +1440,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/crtime/setup.ksh \ functional/crypto/icp_aes_ccm.ksh \ functional/crypto/icp_aes_gcm.ksh \ + functional/ctime/cleanup.ksh \ + functional/ctime/ctime_001_pos.ksh \ + functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ From 74b539d3dcb2929ed3aef6c55bf6692761c08d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20R=C3=BCegg?= Date: Sat, 21 Jun 2025 16:20:16 +0300 Subject: [PATCH 33/61] pyzfs: Update ax_python_devel.m4 to serial 37 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an obvious typo, where a variable was missing the required leading dollar sign ($) Reviewed-by: Brian Behlendorf Signed-off-by: Martin Rüegg Closes #17480 --- config/ax_python_devel.m4 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index 1f480db6d233..935056cc4c0a 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -72,7 +72,7 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 36 +#serial 37 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ @@ -316,7 +316,7 @@ EOD` PYTHON_LIBS="-L$ac_python_libdir -lpython$ac_python_version" fi - if test -z "PYTHON_LIBS"; then + if test -z "$PYTHON_LIBS"; then AC_MSG_WARN([ Cannot determine location of your Python DSO. 
Please check it was installed with dynamic libraries enabled, or try setting PYTHON_LIBS by hand. From 6c1130a730842cbabcb744c342bc89337cb94b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20R=C3=BCegg?= Date: Sat, 21 Jun 2025 18:55:19 +0300 Subject: [PATCH 34/61] pyzfs: Adapt python lib directory evaluation from ax_python_devel.m4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 71216b91d281e7e58f5e29ca4d4553945e080fe9 introduced a regression on debian/ubuntu systems during build. The reason being, that building the RPM for pyzfs was using a different library path than building the library itself. This is now harmonized. Reviewed-by: Brian Behlendorf Signed-off-by: Martin Rüegg Closes #16155 Closes #17480 --- rpm/generic/zfs.spec.in | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 8cf13023f537..47313a6b5fbb 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -87,7 +87,19 @@ %define __python %{__use_python} %define __python_pkg_version %{__use_python_pkg_version} %endif -%define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())" 2>/dev/null || %{__python} -Esc "import sysconfig; print(sysconfig.get_path('purelib'))") +%define __python_sitelib %(%{__python} -Esc " +import sysconfig; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + scheme = 'posix_prefix' +prefix = '%{_prefix}' +if prefix == 'NONE': + prefix = '%{ac_default_prefix}' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir);" 2>/dev/null || %{__python} -Esc "from distutils import sysconfig; print(sysconfig.get_python_lib(0,0))") Name: @PACKAGE@ Version: @VERSION@ From f7698f47e867a4f59e371284f1dfdb363475bedb Mon Sep 17 00:00:00 2001 From: Brian 
Behlendorf Date: Thu, 3 Jul 2025 10:27:05 -0700 Subject: [PATCH 35/61] CI: run ztest on compressed zpool When running ztest under the CI a common failure mode is for the underlying filesystem to run out of available free space. Since the storage associated with a GitHub-hosted running is fixed, we instead create a pool and use a compressed ZFS dataset to store the ztest vdev files. This significantly increases the available capacity since the data written by ztest is highly compressible. A compression ratio of over 40:1 is conservatively achieved using the default lz4 compression. Autotrimming is enabled to ensure freed blocks are discarded from the backing cipool vdev file. Reviewed-by: Tino Reichardt Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #17501 --- .github/workflows/zloop.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/zloop.yml b/.github/workflows/zloop.yml index 7b3bf49d90d5..4ae3ccdc5484 100644 --- a/.github/workflows/zloop.yml +++ b/.github/workflows/zloop.yml @@ -12,7 +12,8 @@ jobs: zloop: runs-on: ubuntu-24.04 env: - TEST_DIR: /var/tmp/zloop + WORK_DIR: /mnt/zloop + CORE_DIR: /mnt/zloop/cores steps: - uses: actions/checkout@v4 with: @@ -40,38 +41,37 @@ jobs: sudo modprobe zfs - name: Tests run: | - sudo mkdir -p $TEST_DIR - # run for 10 minutes or at most 6 iterations for a maximum runner - # time of 60 minutes. 
- sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -- -T 120 -P 60 + sudo truncate -s 256G /mnt/vdev + sudo zpool create cipool -m $WORK_DIR -O compression=on -o autotrim=on /mnt/vdev + sudo /usr/share/zfs/zloop.sh -t 600 -I 6 -l -m 1 -c $CORE_DIR -f $WORK_DIR -- -T 120 -P 60 - name: Prepare artifacts if: failure() run: | - sudo chmod +r -R $TEST_DIR/ + sudo chmod +r -R $WORK_DIR/ - name: Ztest log if: failure() run: | - grep -B10 -A1000 'ASSERT' $TEST_DIR/*/ztest.out || tail -n 1000 $TEST_DIR/*/ztest.out + grep -B10 -A1000 'ASSERT' $CORE_DIR/*/ztest.out || tail -n 1000 $CORE_DIR/*/ztest.out - name: Gdb log if: failure() run: | - sed -n '/Backtraces (full)/q;p' $TEST_DIR/*/ztest.gdb + sed -n '/Backtraces (full)/q;p' $CORE_DIR/*/ztest.gdb - name: Zdb log if: failure() run: | - cat $TEST_DIR/*/ztest.zdb + cat $CORE_DIR/*/ztest.zdb - uses: actions/upload-artifact@v4 if: failure() with: name: Logs path: | - /var/tmp/zloop/*/ - !/var/tmp/zloop/*/vdev/ + /mnt/zloop/*/ + !/mnt/zloop/cores/*/vdev/ if-no-files-found: ignore - uses: actions/upload-artifact@v4 if: failure() with: name: Pool files path: | - /var/tmp/zloop/*/vdev/ + /mnt/zloop/cores/*/vdev/ if-no-files-found: ignore From 90d2c4407a34ecb8175d2dc6a747c37833d49b4b Mon Sep 17 00:00:00 2001 From: Igor Ostapenko Date: Fri, 4 Jul 2025 02:00:13 +0300 Subject: [PATCH 36/61] ztest: Fix false positive of ENOSPC handling Before running a pass zs_enospc_count is checked to free up some space by destroying a random dataset. But the space freed may still be not re-usable during the TXG_DEFER window breaking the next dataset creation in ztest_generic_run(). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Igor Ostapenko Closes #17506 --- cmd/ztest.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index d3bbef245831..8c0fc4513b71 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -7812,6 +7812,9 @@ ztest_dataset_open(int d) ztest_dataset_name(name, ztest_opts.zo_pool, d); + if (ztest_opts.zo_verbose >= 6) + (void) printf("Opening %s\n", name); + (void) pthread_rwlock_rdlock(&ztest_name_lock); error = ztest_dataset_create(name); @@ -8307,41 +8310,44 @@ static void ztest_generic_run(ztest_shared_t *zs, spa_t *spa) { kthread_t **run_threads; - int t; + int i, ndatasets; run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), UMEM_NOFAIL); /* - * Kick off all the tests that run in parallel. + * Actual number of datasets to be used. */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { - umem_free(run_threads, ztest_opts.zo_threads * - sizeof (kthread_t *)); - return; - } + ndatasets = MIN(ztest_opts.zo_datasets, ztest_opts.zo_threads); + + /* + * Prepare the datasets first. + */ + for (i = 0; i < ndatasets; i++) + VERIFY0(ztest_dataset_open(i)); - run_threads[t] = thread_create(NULL, 0, ztest_thread, - (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + /* + * Kick off all the tests that run in parallel. + */ + for (i = 0; i < ztest_opts.zo_threads; i++) { + run_threads[i] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)i, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); } /* * Wait for all of the tests to complete. */ - for (t = 0; t < ztest_opts.zo_threads; t++) - VERIFY0(thread_join(run_threads[t])); + for (i = 0; i < ztest_opts.zo_threads; i++) + VERIFY0(thread_join(run_threads[i])); /* * Close all datasets. 
This must be done after all the threads * are joined so we can be sure none of the datasets are in-use * by any of the threads. */ - for (t = 0; t < ztest_opts.zo_threads; t++) { - if (t < ztest_opts.zo_datasets) - ztest_dataset_close(t); - } + for (i = 0; i < ndatasets; i++) + ztest_dataset_close(i); txg_wait_synced(spa_get_dsl(spa), 0); @@ -8464,6 +8470,7 @@ ztest_run(ztest_shared_t *zs) int d = ztest_random(ztest_opts.zo_datasets); ztest_dataset_destroy(d); + txg_wait_synced(spa_get_dsl(spa), 0); } zs->zs_enospc_count = 0; From 2518f4b1249bc8a5ae50b338898012e64a231153 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 9 Jul 2025 14:34:02 -0700 Subject: [PATCH 37/61] Revert "Fix incorrect expected error in ztest" This reverts commit 2076011e0c4c2d8ad6a59534a4784a6aa5f4f3df. The comment which explains EINVAL should be expected for this case was wrong, not the code. The kernel will return ENOTSUP when attaching a distributed spare to the wrong top-level dRAID vdev. See the check for this in spa_vdev_attach(). Reviewed-by: Paul Dagnelie Signed-off-by: Brian Behlendorf Closes #17503 --- cmd/ztest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/ztest.c b/cmd/ztest.c index 8c0fc4513b71..2c8250405296 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -3881,7 +3881,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is too small, it should fail with EOVERFLOW. * * If newvd is a distributed spare and it's being attached to a - * dRAID which is not its parent it should fail with EINVAL. + * dRAID which is not its parent it should fail with ENOTSUP. 
*/ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3900,7 +3900,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) - expected_error = EINVAL; + expected_error = ENOTSUP; else expected_error = 0; From 80b6457fcd1ba3fc396dc96e07256168fdb7cc91 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 9 Jul 2025 17:38:32 -0400 Subject: [PATCH 38/61] CI: Switch from FreeBSD 13.4 to 13.5 FreeBSD 13.4 is EOL since June 30, 2025. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Closes #17519 --- .github/workflows/scripts/qemu-2-start.sh | 8 -------- .github/workflows/zfs-qemu.yml | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 7e20a98c2faf..885a64037f89 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -71,14 +71,6 @@ case "$OS" in OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/42/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-42-1.1.x86_64.qcow2" ;; - freebsd13-4r) - FreeBSD="13.4-RELEASE" - OSNAME="FreeBSD $FreeBSD" - OSv="freebsd13.0" - URLxz="$FREEBSD_REL/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI.raw.xz" - KSRC="$FREEBSD_REL/../amd64/$FreeBSD/src.txz" - NIC="rtl8139" - ;; freebsd13-5r) FreeBSD="13.5-RELEASE" OSNAME="FreeBSD $FreeBSD" diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index 035d8be7e227..ea17014a117f 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -39,7 +39,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-4r", 
"freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -85,7 +85,7 @@ jobs: # debian: debian11, debian12, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed # FreeBSD variants of 2025-06: - # FreeBSD Release: freebsd13-4r, freebsd13-5r, freebsd14-1r, freebsd14-2r, freebsd14-3r + # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r # FreeBSD Stable: freebsd13-5s, freebsd14-3s # FreeBSD Current: freebsd15-0c os: ${{ fromJson(needs.test-config.outputs.test_os) }} From 6b38d0f7ffb9769190014c1b06b3f4d058ee7ab7 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 9 Jul 2025 23:40:32 +0200 Subject: [PATCH 39/61] ZTS: Fix FreeBSD 15.0 ksh errors The package ksh93 is replaced by ksh now. This works for FreeBSD 13 and 14 also. 
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Alexander Motin Signed-off-by: Tino Reichardt Closes #17523 --- .github/workflows/scripts/qemu-3-deps-vm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh index a581b13c2f58..904fbfbf1e1e 100755 --- a/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -51,7 +51,7 @@ function freebsd() { echo "##[group]Install Development Tools" sudo pkg install -y autoconf automake autotools base64 checkbashisms fio \ - gdb gettext gettext-runtime git gmake gsed jq ksh93 lcov libtool lscpu \ + gdb gettext gettext-runtime git gmake gsed jq ksh lcov libtool lscpu \ pkgconf python python3 pamtester pamtester qemu-guest-agent rsync xxhash sudo pkg install -xy \ '^samba4[[:digit:]]+$' \ From 7882e85a9b811a6bf878c96707162de024bbacd7 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Fri, 11 Jul 2025 17:49:06 +0200 Subject: [PATCH 40/61] Delete unused .cirrus.yml The Cirrus_CI was planned for testing FreeBSD, but never really used I think. Currently it's not needed anymore, so remove it. 
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Tino Reichardt Closes #17155 Closes #17535 --- .cirrus.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml deleted file mode 100644 index 366bb87fbb14..000000000000 --- a/.cirrus.yml +++ /dev/null @@ -1,21 +0,0 @@ -env: - CIRRUS_CLONE_DEPTH: 1 - ARCH: amd64 - -build_task: - matrix: - freebsd_instance: - image_family: freebsd-13-5 - freebsd_instance: - image_family: freebsd-14-2 - freebsd_instance: - image_family: freebsd-15-0-snap - prepare_script: - - pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py311-packaging py311-cffi py311-sysctl - configure_script: - - env MAKE=gmake ./autogen.sh - - env MAKE=gmake ./configure --with-config="user" --with-python=3.11 - build_script: - - gmake -j `sysctl -n kern.smp.cpus` - install_script: - - gmake install From 8c4f625c1253cacf943390da8e03da2af04d84b0 Mon Sep 17 00:00:00 2001 From: Carl George Date: Tue, 15 Jul 2025 12:00:35 -0500 Subject: [PATCH 41/61] CI: Add CentOS Stream 9/10 to the FULL_OS runner list Testing on CentOS Stream provides several months advance notice of changes coming to the RHEL kernel. This should help OpenZFS be proactive instead of reactive to new RHEL minor versions. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Carl George ZFS-CI-Type: full Closes #16904 Closes #17526 --- .github/workflows/zfs-qemu.yml | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index ea17014a117f..cda620313189 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -5,16 +5,6 @@ on: pull_request: workflow_dispatch: inputs: - include_stream9: - type: boolean - required: false - default: false - description: 'Test on CentOS 9 stream' - include_stream10: - type: boolean - required: false - default: false - description: 'Test on CentOS 10 stream' fedora_kernel_ver: type: string required: false @@ -39,7 +29,7 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "almalinux10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian11", "debian12", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]' QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]' # determine CI type when running on PR ci_type="full" @@ -63,14 +53,6 @@ jobs: os_json=$(echo ${os_selection} | jq -c) fi - # Add optional runners - if [ "${{ github.event.inputs.include_stream9 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. += ["centos-stream9"]') - fi - if [ "${{ github.event.inputs.include_stream10 }}" == 'true' ]; then - os_json=$(echo $os_json | jq -c '. 
+= ["centos-stream10"]') - fi - echo $os_json echo "os=$os_json" >> $GITHUB_OUTPUT echo "ci_type=$ci_type" >> $GITHUB_OUTPUT From 6af1f61ad4c6064c05c5f7eefc160fea57ea058d Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 15 Jul 2025 17:01:49 -0700 Subject: [PATCH 42/61] Fix zdb pool/ with -k When examining the root dataset with zdb -k, we get into a mismatched state. main() knows we are not examining the whole pool, but it strips off the trailing slash. import_checkpointed_state() then thinks we are examining the whole pool, and does not update the target path appropriately. The fix is to directly inform import_checkpointed_state that we are examining a filesystem, and not the whole pool. Sponsored-by: Klara, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Paul Dagnelie Co-authored-by: Paul Dagnelie Closes #17536 --- cmd/zdb/zdb.c | 20 ++++++++++--------- .../pool_checkpoint/checkpoint_zdb.ksh | 2 ++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 565c078bb195..8685109db1c2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7708,7 +7708,8 @@ zdb_set_skip_mmp(char *target) * applies to the new_path parameter if allocated. 
*/ static char * -import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) +import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, + char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; @@ -7716,11 +7717,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); - if (path_start != NULL) { + if (target_is_spa || path_start == NULL) { + poolname = target; + } else { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); - } else { - poolname = target; } if (cfg == NULL) { @@ -7751,10 +7752,11 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) "with error %d\n", bogus_name, error); } - if (new_path != NULL && path_start != NULL) { - if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) { + if (new_path != NULL && !target_is_spa) { + if (asprintf(new_path, "%s%s", bogus_name, + path_start != NULL ? path_start : "") == -1) { free(bogus_name); - if (path_start != NULL) + if (!target_is_spa && path_start != NULL) free(poolname); return (NULL); } @@ -7983,7 +7985,7 @@ verify_checkpoint_blocks(spa_t *spa) * name) so we can do verification on it against the current state * of the pool. 
*/ - checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, + checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); @@ -9700,7 +9702,7 @@ main(int argc, char **argv) char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, - &checkpoint_target); + target_is_spa, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh index cd4573b2e4d1..b364a5cb4bdc 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh @@ -63,6 +63,7 @@ log_must eval "zdb $TESTPOOL | grep -q \"Checkpointed uberblock found\"" log_mustnot eval "zdb -k $TESTPOOL | grep -q \"Checkpointed uberblock found\"" log_mustnot eval "zdb $TESTPOOL | grep \"Dataset $FS1\"" log_must eval "zdb -k $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\"" +log_must eval "zdb -k $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\"" log_must zpool export $TESTPOOL @@ -70,6 +71,7 @@ log_must eval "zdb -e $TESTPOOL | grep \"Checkpointed uberblock found\"" log_mustnot eval "zdb -k -e $TESTPOOL | grep \"Checkpointed uberblock found\"" log_mustnot eval "zdb -e $TESTPOOL | grep \"Dataset $FS1\"" log_must eval "zdb -k -e $TESTPOOL | grep \"Dataset $CHECKPOINTED_FS1\"" +log_must eval "zdb -k -e $TESTPOOL/ | grep \"$TESTPOOL$BOGUS_SUFFIX\"" log_must zpool import $TESTPOOL From e7e0bb3b6180769f5d82c944a3dcc59cd41ab0d6 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Thu, 24 Jul 2025 18:38:58 -0400 Subject: [PATCH 43/61] linux: Fix out-of-src builds The linux kernel modules haven't been building successfully when the build occurs in a separate directory than the source code, which is a common build pattern in Linux. 
Was not able to determine the root cause, but the %.o targets in subdirectories are no longer being matched by the pattern targets in the Linux Kbuild system. This change fixes the issue by dynamically creating the missing ones inside our Kbuild. Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #17517 --- module/Kbuild.in | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/module/Kbuild.in b/module/Kbuild.in index 35ef15ff4d8e..c537aced88ad 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -494,3 +494,34 @@ UBSAN_SANITIZE_zfs/sa.o := n ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif + +# The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the +# subdir %.c/%.S -> %.o targets will work as expected. The in-kernel pattern targets do not seem to +# be working on subdirs since about ~6.10 +zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs)) \ + $(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64))) \ + $(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC))) + +z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) +z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ + $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) + +define ZKMOD_C_O_MAKE_TARGET +$1%.o: $(src)/$1%.c FORCE + $$(call if_changed_rule,cc_o_c) + $$(call cmd,force_checksrc) +endef + +define ZKMOD_S_O_MAKE_TARGET +$1%.o: $(src)/$1%.S FORCE + $$(call if_changed_rule,as_o_S) + $$(call cmd,force_checksrc) +endef + 
+$(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target)))) +$(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target)))) From 82a0868ce4d503a8f7c40e2709ebe67518fbf532 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 25 Jul 2025 15:47:21 -0700 Subject: [PATCH 44/61] CI: Remove Debian backports The latest Debian 11 image includes bullseye-backports as a default repository in the /etc/apt/sources.list. However, this repository has gone end of life which effectively breaks the default install. We shouldn't need anything in backports so lets unconditionally remove backports on all Debian builders to resolve the issue. Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #17569 --- .github/workflows/scripts/qemu-3-deps-vm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/scripts/qemu-3-deps-vm.sh b/.github/workflows/scripts/qemu-3-deps-vm.sh index 904fbfbf1e1e..c41ecd09d52e 100755 --- a/.github/workflows/scripts/qemu-3-deps-vm.sh +++ b/.github/workflows/scripts/qemu-3-deps-vm.sh @@ -28,6 +28,7 @@ function debian() { export DEBIAN_FRONTEND="noninteractive" echo "##[group]Running apt-get update+upgrade" + sudo sed -i '/[[:alpha:]]-backports/d' /etc/apt/sources.list sudo apt-get update -y sudo apt-get upgrade -y echo "##[endgroup]" From 245adb6a4f6c0ed32e78f1960ff7a446d74afff6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 5 Feb 2025 00:47:50 +1100 Subject: [PATCH 45/61] ZTS: include microsecond timestamps on all output When reviewing test output after a failure, it's often quite difficult to work out the order and timing of events, and to correlate test suite output with kernel logs. This adds timestamps to ZTS output to help with this, in three places: - all of the standard log_XXX functions ultimately end up in _printline, which now prefixes output with a timestamp. 
An escape hatch environment variable is provided for user_cmd, which often calls the logging functions while also depending on the captured output. - the test runner logging function log() also now prefixes its output with a timestamp. - on failure, when capturing the kernel log in zfs_dmesg.ksh, the "iso" time format is requested. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #17045 --- tests/test-runner/bin/test-runner.py.in | 5 ++++- tests/test-runner/bin/zts-report.py.in | 4 +++- tests/test-runner/include/logapi.shlib | 7 ++++++- tests/zfs-tests/callbacks/zfs_dmesg.ksh | 7 ++++++- tests/zfs-tests/include/libtest.shlib | 4 +++- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 5bf13f5c08af..2158208be6e5 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -15,6 +15,7 @@ # # Copyright (c) 2012, 2018 by Delphix. All rights reserved. # Copyright (c) 2019 Datto Inc. +# Copyright (c) 2025, Klara, Inc. # # This script must remain compatible with Python 3.6+. # @@ -372,6 +373,8 @@ User: %s stdout/stderr/merged in its own file. """ + timeprefix = datetime.now().strftime('[%FT%T.%f] ') + logname = getpwuid(os.getuid()).pw_name rer = '' if self.reran is True: @@ -383,7 +386,7 @@ User: %s msga = 'Test: %s%s ' % (self.pathname, user) msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer) pad = ' ' * (80 - (len(msga) + len(msgb))) - result_line = msga + pad + msgb + result_line = timeprefix + msga + pad + msgb # The result line is always written to the log file. 
If -q was # specified only failures are written to the console, otherwise diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 40f5083d1294..001970120148 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -15,6 +15,7 @@ # # Copyright (c) 2017 by Delphix. All rights reserved. # Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2025, Klara, Inc. # # This script must remain compatible with Python 3.6+. # @@ -381,7 +382,8 @@ def process_results(pathname): prefix = '/zfs-tests/tests/(?:functional|perf/regression)/' pattern = \ - r'^Test(?:\s+\(\S+\))?:' + \ + r'^(?:\[[0-9\-T:\.]+\]\s+)?' + \ + r'Test(?:\s+\(\S+\))?:' + \ rf'\s*\S*{prefix}(\S+)' + \ r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' pattern_log = r'^\s*Log directory:\s*(\S*)' diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib index 670ecfefb986..29e0c7f1c9ca 100644 --- a/tests/test-runner/include/logapi.shlib +++ b/tests/test-runner/include/logapi.shlib @@ -25,6 +25,7 @@ # Use is subject to license terms. # # Copyright (c) 2012, 2020 by Delphix. All rights reserved. +# Copyright (c) 2025, Klara, Inc. # STF_PASS=0 @@ -465,7 +466,11 @@ function _endlog function _printline { - echo "$@" + if [[ -n "$ZTS_LOG_SUPPRESS_TIMESTAMP" ]] ; then + printf '[%(%FT%T.%6N)T] %s\n' now "$*" + else + echo "$@" + fi } # Output an error message diff --git a/tests/zfs-tests/callbacks/zfs_dmesg.ksh b/tests/zfs-tests/callbacks/zfs_dmesg.ksh index 73c654125319..de31765a52e4 100755 --- a/tests/zfs-tests/callbacks/zfs_dmesg.ksh +++ b/tests/zfs-tests/callbacks/zfs_dmesg.ksh @@ -15,6 +15,7 @@ # # Copyright (c) 2016 by Delphix. All rights reserved. # Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# Copyright (c) 2025, Klara, Inc. 
# # $1: number of lines to output (default: 200) @@ -25,7 +26,11 @@ echo " Tailing last $lines lines of dmesg log" echo "=================================================================" # report and reset afterwards -sudo dmesg -c | tail -n $lines +dmesg_args="-c" +if [[ $(uname) = "Linux" ]] ; then + dmesg_args="$dmesg_args --time-format=iso" +fi +sudo dmesg $dmesg_args | tail -n $lines echo "=================================================================" echo " End of dmesg log" diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8bffe9d8240c..d8e7bb73e6e5 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -2884,7 +2884,9 @@ function user_run typeset out=$TEST_BASE_DIR/out typeset err=$TEST_BASE_DIR/err - sudo -Eu $user env PATH="$PATH" ksh <<<"$*" >$out 2>$err + sudo -Eu $user \ + env PATH="$PATH" ZTS_LOG_SUPPRESS_TIMESTAMP=1 \ + ksh <<<"$*" >$out 2>$err typeset res=$? log_note "out: $(<$out)" log_note "err: $(<$err)" From df5e02d253c604bff1c8b8d1d5702be4a13c173b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 5 Aug 2025 11:18:06 +1000 Subject: [PATCH 46/61] CI: match and trim out internal timestamp for test prefix Adjust the regexes to match the test line with timestamps, then remove them for the summary. The internal timestamp is still in the full logs. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #17045 --- .github/workflows/scripts/qemu-6-tests.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-6-tests.sh b/.github/workflows/scripts/qemu-6-tests.sh index e8e6adecd62f..5ab822f4f076 100755 --- a/.github/workflows/scripts/qemu-6-tests.sh +++ b/.github/workflows/scripts/qemu-6-tests.sh @@ -21,11 +21,13 @@ function prefix() { S=$((DIFF-(M*60))) CTR=$(cat /tmp/ctr) - echo $LINE| grep -q "^Test[: ]" && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr + echo $LINE| grep -q '^\[.*] Test[: ]' && CTR=$((CTR+1)) && echo $CTR > /tmp/ctr BASE="$HOME/work/zfs/zfs" COLOR="$BASE/scripts/zfs-tests-color.sh" - CLINE=$(echo $LINE| grep "^Test[ :]" | sed -e 's|/usr/local|/usr|g' \ + CLINE=$(echo $LINE| grep '^\[.*] Test[: ]' \ + | sed -e 's|^\[.*] Test|Test|g' \ + | sed -e 's|/usr/local|/usr|g' \ | sed -e 's| /usr/share/zfs/zfs-tests/tests/| |g' | $COLOR) if [ -z "$CLINE" ]; then printf "vm${ID}: %s\n" "$LINE" From 97fe86837c9a75048494273ed7ec7272e7f76730 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 1 Aug 2025 13:51:46 +1000 Subject: [PATCH 47/61] ZTS: mmap_ftruncate test to confirm async writeback behaviour Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17584 --- tests/runfiles/common.run | 2 +- tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 4 +- tests/zfs-tests/cmd/mmap_ftruncate.c | 85 +++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/mmap/mmap_ftruncate.ksh | 80 +++++++++++++++++ 7 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 tests/zfs-tests/cmd/mmap_ftruncate.c create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index bbe17b073cbf..6d96ec435841 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -788,7 +788,7 @@ tags = ['functional', 'migration'] [tests/functional/mmap] tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos', - 'mmap_sync_001_pos', 'mmap_write_001_pos'] + 'mmap_sync_001_pos', 'mmap_write_001_pos', 'mmap_ftruncate'] tags = ['functional', 'mmap'] [tests/functional/mount] diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index e9a6f8f0ac17..1cd90024e94d 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -23,6 +23,7 @@ /mkfiles /mktree /mmap_exec +/mmap_ftruncate /mmap_libaio /mmap_seek /mmap_sync diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 4498c9a73711..12107278cb09 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -72,7 +72,9 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree %C%_mkfile_LDADD = $(LTLIBINTL) -scripts_zfs_tests_bin_PROGRAMS += %D%/mmap_exec %D%/mmap_seek %D%/mmap_sync %D%/mmapwrite %D%/readmmap +scripts_zfs_tests_bin_PROGRAMS += \ + %D%/mmap_exec %D%/mmap_ftruncate %D%/mmap_seek \ + %D%/mmap_sync %D%/mmapwrite %D%/readmmap %C%_mmapwrite_LDADD = -lpthread if WANT_MMAP_LIBAIO diff --git 
a/tests/zfs-tests/cmd/mmap_ftruncate.c b/tests/zfs-tests/cmd/mmap_ftruncate.c new file mode 100644 index 000000000000..91cdfe3715e6 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_ftruncate.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: CDDL-1.0 +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2025, Klara, Inc. + */ + +/* + * Tests async writeback behaviour. Creates a file, maps it into memory, and + * dirties every page within it. Then, calls ftruncate() to collapse the file + * back down to 0. This causes the kernel to begin writeback on the dirty + * pages so they can be freed, before it can complete the ftruncate() call. + * None of these are sync operations, so they should avoid the various "force + * flush" codepaths. 
+ */ + +#include +#include +#include +#include +#include +#include + +#define _pdfail(f, l, s) \ + do { perror("[" f "#" #l "] " s); exit(2); } while (0) +#define pdfail(str) _pdfail(__FILE__, __LINE__, str) + +int +main(int argc, char **argv) { + if (argc != 3) { + printf("usage: mmap_ftruncate \n"); + exit(2); + } + + const char *file = argv[1]; + + char *end; + off_t sz = strtoull(argv[2], &end, 0); + if (end == argv[2] || *end != '\0' || sz == 0) { + fprintf(stderr, "E: invalid size"); + exit(2); + } + + int fd = open(file, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR); + if (fd < 0) + pdfail("open"); + + if (ftruncate(fd, sz) < 0) + pdfail("ftruncate"); + + char *p = mmap(NULL, sz, PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + pdfail("mmap"); + + for (off_t off = 0; off < sz; off += 4096) + p[off] = 1; + + if (ftruncate(fd, 0) < 0) + pdfail("ftruncate"); + + if (munmap(p, sz) < 0) + pdfail("munmap"); + + close(fd); + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 1c7e42a06e05..bbaa8665ecc8 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -205,6 +205,7 @@ export ZFSTEST_FILES='badsend mkfiles mktree mmap_exec + mmap_ftruncate mmap_libaio mmap_seek mmap_sync diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index dc33258b1f7c..d9ea50ebe923 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1660,6 +1660,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmap/mmap_seek_001_pos.ksh \ functional/mmap/mmap_sync_001_pos.ksh \ functional/mmap/mmap_write_001_pos.ksh \ + functional/mmap/mmap_ftruncate.ksh \ functional/mmap/setup.ksh \ functional/mmp/cleanup.ksh \ functional/mmp/mmp_active_import.ksh \ diff --git a/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh b/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh new file mode 100755 index 
000000000000..63ebf95de7f0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmap/mmap_ftruncate.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# This verifies that async writeback of dirty mmap()'d pages completes quickly. +# ftruncate() is an operation that will trigger async writeback, but is not +# itself a syncing operation, making it a useful proxy for any way the kernel +# might trigger async writeback. +# +# The guts of this test is in the mmap_ftruncate program. This driver sets a +# larger zfs_txg_timeout. Test failure occurs if ftruncate() blocks waiting for +# the writeback until the txg timeout is reached and the changes are forcibly +# written out. Success means the DMU has accepted the changes and cleared the +# page dirty flags. 
+# + +TIMEOUT=180 +TESTFILE=/$TESTPOOL/truncfile +TESTSIZE=$((2*1024*1024*1024)) # 2G + +verify_runnable "global" + +typeset claim="async writeback of dirty mmap()'d pages completes quickly" + +log_assert $claim + +log_must save_tunable TXG_TIMEOUT + +function cleanup +{ + log_must restore_tunable TXG_TIMEOUT + rm -f $TESTFILE +} +log_onexit cleanup + +log_must set_tunable32 TXG_TIMEOUT $TIMEOUT +log_must zpool sync -f + +# run mmap_ftruncate and record the run time +typeset -i start=$(date +%s) +log_must mmap_ftruncate $TESTFILE $TESTSIZE +typeset -i end=$(date +%s) +typeset -i delta=$((end - start)) + +# in practice, mmap_ftruncate needs a few seconds to dirty all the pages, and +# when this test passes, the ftruncate() call itself should be near-instant. +# when it fails, then it's only the txg sync that allows ftruncate() to +# complete; in that case, the run time will be extremely close to the timeout, +# so to avoid any confusion at the edges, we require that it completes within +# half the transaction time. for any timeout higher than ~30s that should be a +# very bright line down the middle. +log_must test $delta -lt $((TIMEOUT / 2)) + +log_pass $claim From f72226a75cc05e9e013642add3ceab50f789953e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 28 Jul 2025 10:33:40 +1000 Subject: [PATCH 48/61] Linux: sync: remove async/sync accounting All this machinery is there to try to understand when there is an async writeback waiting to complete because the intent log callbacks are still outstanding, and force them with a timely zil_commit(). The next commit fixes this properly, so there's no need for all this extra housekeeping. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17584 --- include/os/linux/zfs/sys/trace_acl.h | 7 +---- include/sys/zfs_znode.h | 2 -- module/os/freebsd/zfs/zfs_znode_os.c | 7 ----- module/os/linux/zfs/zfs_ctldir.c | 2 -- module/os/linux/zfs/zfs_vnops_os.c | 25 ----------------- module/os/linux/zfs/zfs_znode_os.c | 7 ----- module/os/linux/zfs/zpl_file.c | 41 ---------------------------- module/zfs/zfs_vnops.c | 2 -- 8 files changed, 1 insertion(+), 92 deletions(-) diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h index 8923657daf02..d88b4937ef08 100644 --- a/include/os/linux/zfs/sys/trace_acl.h +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint64_t, z_size) __field(uint64_t, z_pflags) __field(uint32_t, z_sync_cnt) - __field(uint32_t, z_sync_writes_cnt) - __field(uint32_t, z_async_writes_cnt) __field(mode_t, z_mode) __field(boolean_t, z_is_sa) __field(boolean_t, z_is_ctldir) @@ -92,8 +90,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_size = zn->z_size; __entry->z_pflags = zn->z_pflags; __entry->z_sync_cnt = zn->z_sync_cnt; - __entry->z_sync_writes_cnt = zn->z_sync_writes_cnt; - __entry->z_async_writes_cnt = zn->z_async_writes_cnt; __entry->z_mode = zn->z_mode; __entry->z_is_sa = zn->z_is_sa; __entry->z_is_ctldir = zn->z_is_ctldir; @@ -117,7 +113,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, TP_printk("zn { id %llu unlinked %u atime_dirty %u " "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " - "sync_cnt %u sync_writes_cnt %u async_writes_cnt %u " + "sync_cnt %u " "mode 0x%x is_sa %d is_ctldir %d " "inode { uid %u gid %u ino %lu nlink %u size %lli " "blkbits %u bytes %u mode 0x%x generation %x } } " @@ -126,7 +122,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, - 
__entry->z_sync_writes_cnt, __entry->z_async_writes_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_ctldir, __entry->i_uid, __entry->i_gid, __entry->i_ino, __entry->i_nlink, __entry->i_size, __entry->i_blkbits, diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index b3a267e16f3e..fa3c7b5b39c8 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -201,8 +201,6 @@ typedef struct znode { uint64_t z_size; /* file size (cached) */ uint64_t z_pflags; /* pflags (cached) */ uint32_t z_sync_cnt; /* synchronous open count */ - uint32_t z_sync_writes_cnt; /* synchronous write count */ - uint32_t z_async_writes_cnt; /* asynchronous write count */ mode_t z_mode; /* mode (cached) */ kmutex_t z_acl_lock; /* acl data lock */ zfs_acl_t *z_acl_cached; /* cached acl */ diff --git a/module/os/freebsd/zfs/zfs_znode_os.c b/module/os/freebsd/zfs/zfs_znode_os.c index e97a0dd84040..0d3e36c1e034 100644 --- a/module/os/freebsd/zfs/zfs_znode_os.c +++ b/module/os/freebsd/zfs/zfs_znode_os.c @@ -150,8 +150,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_vnode = NULL; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; return (0); } @@ -172,9 +170,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); - - ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); - ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } @@ -455,8 +450,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; atomic_store_ptr(&zp->z_cached_symlink, NULL); zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 84b25cb2c5ac..6552a933ce0a 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -511,8 
+511,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_pflags = 0; zp->z_mode = 0; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; ip->i_generation = 0; ip->i_ino = id; ip->i_mode = (S_IFDIR | S_IRWXUGO); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9ceb6cb8dbdd..1d8f76d86bfe 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3694,11 +3694,9 @@ static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; - znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); - atomic_dec_32(&zp->z_async_writes_cnt); } /* @@ -3818,15 +3816,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Speed up any non-sync page writebacks since - * they may take several seconds to complete. - * Refer to the comment in zpl_fsync() for details. - */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - zil_commit(zfsvfs->z_log, zp->z_id); - } - if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); @@ -3852,8 +3841,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; - if (!for_sync) - atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); @@ -3872,8 +3859,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, #endif ClearPageError(pp); end_page_writeback(pp); - if (!for_sync) - atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); @@ -3907,16 +3892,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, * performance reasons. 
*/ commit = B_TRUE; - } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { - /* - * If the caller does not intend to wait synchronously - * for this page writeback to complete and there are active - * synchronous calls on this file, do a commit so that - * the latter don't accidentally end up waiting for - * our writeback to complete. Refer to the comment in - * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. - */ - commit = B_TRUE; } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index 7b28f2640188..cbeb18580d20 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -126,8 +126,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; return (0); } @@ -149,9 +147,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); - - ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); - ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int @@ -548,8 +543,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; - zp->z_sync_writes_cnt = 0; - zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 1a82c13e1523..ef7bd7352084 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -111,52 +111,11 @@ zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); - zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; - /* - * The 
variables z_sync_writes_cnt and z_async_writes_cnt work in - * tandem so that sync writes can detect if there are any non-sync - * writes going on and vice-versa. The "vice-versa" part to this logic - * is located in zfs_putpage() where non-sync writes check if there are - * any ongoing sync writes. If any sync and non-sync writes overlap, - * we do a commit to complete the non-sync writes since the latter can - * potentially take several seconds to complete and thus block sync - * writes in the upcoming call to filemap_write_and_wait_range(). - */ - atomic_inc_32(&zp->z_sync_writes_cnt); - /* - * If the following check does not detect an overlapping non-sync write - * (say because it's just about to start), then it is guaranteed that - * the non-sync write will detect this sync write. This is because we - * always increment z_sync_writes_cnt / z_async_writes_cnt before doing - * the check on z_async_writes_cnt / z_sync_writes_cnt here and in - * zfs_putpage() respectively. - */ - if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { - if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { - atomic_dec_32(&zp->z_sync_writes_cnt); - return (error); - } - zil_commit(zfsvfs->z_log, zp->z_id); - zpl_exit(zfsvfs, FTAG); - } - error = filemap_write_and_wait_range(inode->i_mapping, start, end); - - /* - * The sync write is not complete yet but we decrement - * z_sync_writes_cnt since zfs_fsync() increments and decrements - * it internally. If a non-sync write starts just after the decrement - * operation but before we call zfs_fsync(), it may not detect this - * overlapping sync write but it does not matter since we have already - * gone past filemap_write_and_wait_range() and we won't block due to - * the non-sync write. 
- */ - atomic_dec_32(&zp->z_sync_writes_cnt); - if (error) return (error); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 9489890702c5..b6d76a548e40 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -109,9 +109,7 @@ zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); - atomic_dec_32(&zp->z_sync_writes_cnt); zfs_exit(zfsvfs, FTAG); } return (error); From b9c45fe68cc4976f525a3b844a3d867b120bae7e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 28 Jul 2025 10:51:00 +1000 Subject: [PATCH 49/61] Linux: zfs_putpage: complete async page writeback immediately For async page writeback, we do not need to wait for the page to be on disk before returning to the caller; it's enough that the data from the dirty page be on the DMU and in the in-memory ZIL, just like any other write. So, if this is not a syncing write, don't add a callback to the itx, and instead just unlock the page immediately. (This is effectively the same concept used for FreeBSD in d323fbf49c). Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17584 Closes #14290 --- module/os/linux/zfs/zfs_vnops_os.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 1d8f76d86bfe..7107012d8168 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3682,16 +3682,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, } static void -zfs_putpage_sync_commit_cb(void *arg) -{ - struct page *pp = arg; - - ClearPageError(pp); - end_page_writeback(pp); -} - -static void -zfs_putpage_async_commit_cb(void *arg) +zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; @@ -3895,8 +3886,12 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : - zfs_putpage_async_commit_cb, pp); + B_FALSE, for_sync ? zfs_putpage_commit_cb : NULL, pp); + + if (!for_sync) { + ClearPageError(pp); + end_page_writeback(pp); + } dmu_tx_commit(tx); From 0c7d6e20e66ea74f51b51a9d60e77bd21acecada Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 1 Aug 2025 16:05:07 +1000 Subject: [PATCH 50/61] Linux: zfs_putpage: document (and fix!) confusing sync/commit modes The structure of zfs_putpage() and its callers is tricky to follow. There's a lot more we could do to improve it, but at least now we have some description of one of the trickier bits. Writing this exposed a very subtle bug: most async pages pushed out through zpl_putpages() would go to the ZIL with commit=false, which can yield a less-efficient write policy. So this commit updates that too. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #17584 --- module/os/linux/zfs/zfs_vnops_os.c | 55 ++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 7107012d8168..b13075b9a9d9 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -3875,17 +3876,49 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); - boolean_t commit = B_FALSE; - if (wbc->sync_mode != WB_SYNC_NONE) { - /* - * Note that this is rarely called under writepages(), because - * writepages() normally handles the entire commit for - * performance reasons. - */ - commit = B_TRUE; - } + /* + * A note about for_sync vs wbc->sync_mode. + * + * for_sync indicates that this is a syncing writeback, that is, the + * kernel caller expects the data to be durably stored before being notified. + * Often, but not always, the call was triggered by a userspace syncing + * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE + * means that that page should remain "locked" (in the writeback state) + * until it is definitely on disk (ie zil_commit() or spa_sync()). + * Otherwise, we can unlock and return as soon as it is on the + * in-memory ZIL. + * + * wbc->sync_mode has similar meaning. 
wbc is passed from the kernel to + * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE + * indicates this is a regular async writeback (eg a cache eviction) and + * so does not need a durability guarantee, while WB_SYNC_ALL indicates + * a syncing op that must be waited on (by convention, we test for + * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over + * performance should there ever be a new mode that we have not yet + * added support for). + * + * So, why a separate for_sync field? This is because zpl_writepages() + * calls zfs_putpage() multiple times for a single "logical" operation. + * It wants all the individual pages to be for_sync==TRUE ie only + * unlocked once durably stored, but it only wants one call to + * zil_commit() at the very end, once all the pages are synced. So, + * it repurposes sync_mode slightly to indicate who issues and waits for + * the IO: for NONE, the caller to zfs_putpage() will do it, while for + * ALL, zfs_putpage should do it. + * + * Summary: + * for_sync: 0=unlock immediately; 1=unlock once on disk + * sync_mode: NONE=caller will commit; ALL=we will commit + */ + boolean_t need_commit = (wbc->sync_mode != WB_SYNC_NONE); - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, + /* + * We use for_sync as the "commit" arg to zfs_log_write() (arg 7) + * because it is a policy flag that indicates "someone will call + * zil_commit() soon". for_sync=TRUE means exactly that; the only + * question is whether it will be us, or zpl_writepages(). + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, for_sync, B_FALSE, for_sync ? 
zfs_putpage_commit_cb : NULL, pp); if (!for_sync) { @@ -3897,7 +3930,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, zfs_rangelock_exit(lr); - if (commit) + if (need_commit) zil_commit(zfsvfs->z_log, zp->z_id); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); From 57b614e0258dc6192d01f2bd9b34d6b810cb04a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Mon, 4 Aug 2025 01:30:58 +0200 Subject: [PATCH 51/61] SIMD: Don't require definition of `HAVE_XSAVE` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we fail the compilation via the #error directive if `HAVE_XSAVE` isn't defined. This breaks i586 builds since we check the toolchains SIMD support only on i686 and onward. Remove the requirement to fix the build on i586. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Attila Fülöp Closes #13303 Closes #17590 --- include/os/linux/kernel/linux/simd_x86.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h index cd245a5f0135..e8004e18c4a4 100644 --- a/include/os/linux/kernel/linux/simd_x86.h +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -139,15 +139,6 @@ */ #if defined(HAVE_KERNEL_FPU_INTERNAL) -/* - * For kernels not exporting *kfpu_{begin,end} we have to use inline assembly - * with the XSAVE{,OPT,S} instructions, so we need the toolchain to support at - * least XSAVE. 
- */ -#if !defined(HAVE_XSAVE) -#error "Toolchain needs to support the XSAVE assembler instruction" -#endif - #ifndef XFEATURE_MASK_XTILE /* * For kernels where this doesn't exist yet, we still don't want to break @@ -335,9 +326,13 @@ kfpu_begin(void) return; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xsave("xsave", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + return; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_save_fxsr(state); } else { kfpu_save_fsave(state); @@ -390,9 +385,13 @@ kfpu_end(void) goto out; } #endif +#if defined(HAVE_XSAVE) if (static_cpu_has(X86_FEATURE_XSAVE)) { kfpu_do_xrstor("xrstor", state, ~XFEATURE_MASK_XTILE); - } else if (static_cpu_has(X86_FEATURE_FXSR)) { + goto out; + } +#endif + if (static_cpu_has(X86_FEATURE_FXSR)) { kfpu_restore_fxsr(state); } else { kfpu_restore_fsave(state); From 11f844175ebae45a317ad7bd7721e28ccbc622bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Attila=20F=C3=BCl=C3=B6p?= Date: Mon, 4 Aug 2025 02:11:48 +0200 Subject: [PATCH 52/61] config: Avoid `void main()` in toolchain-simd.m4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Be standard-compliant by using `int main()`. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Attila Fülöp Closes #13303 Closes #17590 --- config/toolchain-simd.m4 | 69 ++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 index 061576fd94e3..344807fc830c 100644 --- a/config/toolchain-simd.m4 +++ b/config/toolchain-simd.m4 @@ -38,9 +38,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [ AC_MSG_CHECKING([whether host toolchain supports SSE]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("xorps %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE]) @@ -57,9 +58,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [ AC_MSG_CHECKING([whether host toolchain supports SSE2]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pxor %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2]) @@ -76,10 +78,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [ AC_MSG_CHECKING([whether host toolchain supports SSE3]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { char v[16]; __asm__ __volatile__("lddqu %0,%%xmm0" :: "m"(v[0])); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3]) @@ -96,9 +99,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [ AC_MSG_CHECKING([whether host toolchain supports SSSE3]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pshufb %xmm0,%xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3]) @@ -115,9 +119,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [ AC_MSG_CHECKING([whether host toolchain supports SSE4.1]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pmaxsb %xmm0,%xmm1"); + return (0); } ]])], 
[ AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1]) @@ -134,9 +139,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [ AC_MSG_CHECKING([whether host toolchain supports SSE4.2]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { __asm__ __volatile__("pcmpgtq %xmm0, %xmm1"); + return (0); } ]])], [ AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2]) @@ -153,10 +159,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [ AC_MSG_CHECKING([whether host toolchain supports AVX]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ - void main() + int main() { char v[32]; __asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0])); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -174,9 +181,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -194,9 +202,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -214,9 +223,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vplzcntd %zmm0,%zmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -234,9 +244,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -254,9 +265,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -274,9 +286,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [ 
AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -294,9 +307,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -314,9 +328,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -334,9 +349,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vexp2pd %zmm0,%zmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -354,9 +370,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("vpabsq %zmm0,%zmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -374,9 +391,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("aesenc %xmm0, %xmm1"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -394,9 +412,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0)); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -414,9 +433,10 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { __asm__ __volatile__("movbe 0(%eax), %eax"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -434,10 +454,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVE], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned 
(64))); __asm__ __volatile__("xsave %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -455,10 +476,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVEOPT], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned (64))); __asm__ __volatile__("xsaveopt %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) @@ -476,10 +498,11 @@ AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_XSAVES], [ AC_LINK_IFELSE([AC_LANG_SOURCE([ [ - void main() + int main() { char b[4096] __attribute__ ((aligned (64))); __asm__ __volatile__("xsaves %[b]\n" : : [b] "m" (*b) : "memory"); + return (0); } ]])], [ AC_MSG_RESULT([yes]) From d1d706350e19721337826ae8e8de96e70d75c26e Mon Sep 17 00:00:00 2001 From: Todd Zullinger Date: Thu, 7 Aug 2025 14:39:56 -0400 Subject: [PATCH 53/61] rpm: don't list /sbin/zgenhostid twice in %files The location of zgenhostid was changed in 0ae733c7a (Install zgenhostid to sbindir, 2021-01-21). We include all files within sbindir two lines earlier, which causes rpmbuild to report: File listed twice: /sbin/zgenhostid Drop the redundant entry from the %files section. 
Reviewed-by: Brian Behlendorf Signed-off-by: Todd Zullinger Closes #17601 --- rpm/generic/zfs.spec.in | 1 - 1 file changed, 1 deletion(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 47313a6b5fbb..dd2eb3814e53 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -506,7 +506,6 @@ systemctl --system daemon-reload >/dev/null || true # Core utilities %{_sbindir}/* %{_bindir}/raidz_test -%{_sbindir}/zgenhostid %{_bindir}/zvol_wait # Optional Python 3 scripts %{_bindir}/arc_summary From a49c95729905ffa57edfb4d0ea2aba61e81aa35b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 9 Aug 2025 02:39:14 +1000 Subject: [PATCH 54/61] linux/zvol_os: fix crash with blk-mq on Linux 4.19 03987f71e3 (#16069) added a workaround to get the blk-mq hardware context for older kernels that don't cache it in the struct request. However, this workaround appears to be incomplete. In 4.19, the rq data context is optional. If its not initialised, then the cached rq->cpu will be -1, and so using it to index into mq_map causes a crash. Given that the upstream 4.19 is now in extended LTS and rarely seen, RHEL8 4.18+ has long carried "modern" blk-mq support, and the cached hardware context has been available since 5.1, I'm not going to huge lengths to get queue selection correct for the very few people that are likely to feel it. To that end, we simply call raw_smp_processor_id() to get a valid CPU id and use that instead. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Paul Dagnelie Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #17597 --- module/os/linux/zfs/zvol_os.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index c7b7bab2f117..702a6481c195 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -558,8 +558,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else - blk_mq_hw_queue = - rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; + blk_mq_hw_queue = rq->q->queue_hw_ctx[ + rq->q->mq_map[raw_smp_processor_id()]]->queue_num; #endif taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue); From 96516684577b9c215b95ebeff82c41aeea2c3844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Wirnata?= Date: Mon, 11 Aug 2025 18:44:51 +0200 Subject: [PATCH 55/61] zed: prettify slack notification message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This converts the body of a ZED slack notification from plain text to code block style to help with readability. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: René Wirnata Closes #17610 --- cmd/zed/zed.d/zed-functions.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh index 6e00f153be1c..78d8f658ddd8 100644 --- a/cmd/zed/zed.d/zed-functions.sh +++ b/cmd/zed/zed.d/zed-functions.sh @@ -441,8 +441,9 @@ zed_notify_slack_webhook() "${pathname}")" # Construct the JSON message for posting. + # shellcheck disable=SC2016 # - msg_json="$(printf '{"text": "*%s*\\n%s"}' "${subject}" "${msg_body}" )" + msg_json="$(printf '{"text": "*%s*\\n```%s```"}' "${subject}" "${msg_body}" )" # Send the POST request and check for errors. 
# From 41ca2296cd5109527de2022ced07c0898f0592b7 Mon Sep 17 00:00:00 2001 From: achill Date: Tue, 12 Aug 2025 01:30:09 +0200 Subject: [PATCH 56/61] Linux 6.16 compat: META Update the META file to reflect compatibility with the 6.16 kernel. Tested with 6.16.0-0-stable of Alpine Linux edge, see . Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Achill Gilgenast Closes #17578 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 984cc28effe1..c82cb7e7fa9b 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.15 +Linux-Maximum: 6.16 Linux-Minimum: 4.18 From 46de04d2e911d39a4a8930e44b217fe41537e710 Mon Sep 17 00:00:00 2001 From: Colin Percival Date: Tue, 12 Aug 2025 13:38:55 -0700 Subject: [PATCH 57/61] FreeBSD 15.0 is now "PRERELEASE" Chase URL change from the FreeBSD project. Reviewed-by: Brian Behlendorf Signed-off-by: Colin Percival Closes #17617 --- .github/workflows/scripts/qemu-2-start.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 885a64037f89..70a2364f1fc6 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -109,7 +109,7 @@ case "$OS" in KSRC="$FREEBSD_SNAP/../amd64/$FreeBSD/src.txz" ;; freebsd15-0c) - FreeBSD="15.0-CURRENT" + FreeBSD="15.0-PRERELEASE" OSNAME="FreeBSD $FreeBSD" OSv="freebsd14.0" URLxz="$FREEBSD_SNAP/$FreeBSD/amd64/Latest/FreeBSD-$FreeBSD-amd64-BASIC-CI-ufs.raw.xz" From 3e78905ffb32a2f61a2bd06a87ea495899391c7e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 12 Aug 2025 13:38:08 -0700 Subject: [PATCH 58/61] Silence zstd large allocation warning Allow zstd_mempool_init() to allocate using vmem_alloc() instead of kmem_alloc() to silence the large allocation warning on Linux during module load when the system has a large number of CPUs. 
It's not at all clear to me that scaling the allocation size with the number of CPUs is beneficial and that should be evaluated. But for the moment this should resolve the warning without introducing any unexpected side effects. Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Brian Behlendorf Closes #17620 Closes #11557 --- module/zstd/zfs_zstd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index b42066fdb7c3..2c0455d7c851 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -796,9 +796,9 @@ static void __init zstd_mempool_init(void) { zstd_mempool_cctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); zstd_mempool_dctx = - kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + vmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); for (int i = 0; i < ZSTD_POOL_MAX; i++) { mutex_init(&zstd_mempool_cctx[i].barrier, NULL, @@ -844,8 +844,8 @@ zstd_mempool_deinit(void) release_pool(&zstd_mempool_dctx[i]); } - kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); - kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + vmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); zstd_mempool_dctx = NULL; zstd_mempool_cctx = NULL; } From 0fe10361ba8354a8279e04691e7d8ef40f029459 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 12 Aug 2025 13:36:03 -0700 Subject: [PATCH 59/61] Allow vmem_alloc backed multilists Systems with a large number of CPU cores (192+) may trigger the large allocation warning in multilist_create() on Linux. Silence the warning by converting the allocation to vmem_alloc(). On Linux this results in a call to kvalloc() which will alloc vmem for large allocations and kmem for small allocations. 
On FreeBSD both vmem_alloc and kmem_alloc internally use the same allocator so there is no functional change. Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: Rob Norris Signed-off-by: Brian Behlendorf Closes #17616 --- module/zfs/multilist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c index 7b85d19e19ee..46fb79269310 100644 --- a/module/zfs/multilist.c +++ b/module/zfs/multilist.c @@ -81,7 +81,7 @@ multilist_create_impl(multilist_t *ml, size_t size, size_t offset, ml->ml_num_sublists = num; ml->ml_index_func = index_func; - ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_sublists = vmem_zalloc(sizeof (multilist_sublist_t) * ml->ml_num_sublists, KM_SLEEP); ASSERT3P(ml->ml_sublists, !=, NULL); @@ -134,7 +134,7 @@ multilist_destroy(multilist_t *ml) } ASSERT3P(ml->ml_sublists, !=, NULL); - kmem_free(ml->ml_sublists, + vmem_free(ml->ml_sublists, sizeof (multilist_sublist_t) * ml->ml_num_sublists); ml->ml_num_sublists = 0; From a072611eefcbc1ee641db879252f82676dcff742 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Sat, 28 Jun 2025 02:32:16 +0000 Subject: [PATCH 60/61] Revert "FreeBSD: zfs_putpages: don't undirty pages until after write completes" This causes async putpages to leave the pages sbusied for a long time, which hurts concurrency. Revert for now until we have a better approach. This reverts commit 238eab7dc16932edbe9bcc990e8e5376bfe5b2ba. 
Reported by: Ihor Antonov Discussed with: Rob Norris References: freebsd/freebsd-src@738a9a7 Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Mark Johnston Ported-by: Rob Norris Signed-off-by: Rob Norris Closes #17533 --- include/os/freebsd/spl/sys/vm.h | 1 - module/os/freebsd/spl/spl_vm.c | 1 - module/os/freebsd/zfs/zfs_vnops_os.c | 60 +++++++--------------------- 3 files changed, 15 insertions(+), 47 deletions(-) diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h index d36bee881d0b..454078f0fe79 100644 --- a/include/os/freebsd/spl/sys/vm.h +++ b/include/os/freebsd/spl/sys/vm.h @@ -35,7 +35,6 @@ extern const int zfs_vm_pagerret_bad; extern const int zfs_vm_pagerret_error; extern const int zfs_vm_pagerret_ok; -extern const int zfs_vm_pagerret_pend; extern const int zfs_vm_pagerput_sync; extern const int zfs_vm_pagerput_inval; diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c index 9d5f025423a1..733c2bd07ebb 100644 --- a/module/os/freebsd/spl/spl_vm.c +++ b/module/os/freebsd/spl/spl_vm.c @@ -43,7 +43,6 @@ const int zfs_vm_pagerret_bad = VM_PAGER_BAD; const int zfs_vm_pagerret_error = VM_PAGER_ERROR; const int zfs_vm_pagerret_ok = VM_PAGER_OK; -const int zfs_vm_pagerret_pend = VM_PAGER_PEND; const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 68367f105691..0fa2003554cc 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -25,7 +25,6 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. - * Copyright (c) 2025, Klara, Inc. 
*/ /* Portions Copyright 2007 Jeremy Teo */ @@ -4085,33 +4084,6 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap) ap->a_rahead)); } -typedef struct { - uint_t pca_npages; - vm_page_t pca_pages[]; -} putpage_commit_arg_t; - -static void -zfs_putpage_commit_cb(void *arg) -{ - putpage_commit_arg_t *pca = arg; - vm_object_t object = pca->pca_pages[0]->object; - - zfs_vmobject_wlock(object); - - for (uint_t i = 0; i < pca->pca_npages; i++) { - vm_page_t pp = pca->pca_pages[i]; - vm_page_undirty(pp); - vm_page_sunbusy(pp); - } - - vm_object_pip_wakeupn(object, pca->pca_npages); - - zfs_vmobject_wunlock(object); - - kmem_free(pca, - offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages])); -} - static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) @@ -4213,12 +4185,10 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, } if (zp->z_blksz < PAGE_SIZE) { - vm_ooffset_t woff = off; - size_t wlen = len; - for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) { - tocopy = MIN(PAGE_SIZE, wlen); + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { @@ -4239,19 +4209,19 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); + /* + * XXX we should be passing a callback to undirty + * but that would make the locking messier + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, + len, commit, B_FALSE, NULL, NULL); - putpage_commit_arg_t *pca = kmem_alloc( - offsetof(putpage_commit_arg_t, pca_pages[ncount]), - KM_SLEEP); - pca->pca_npages = ncount; - memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount); - - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, - off, len, commit, B_FALSE, zfs_putpage_commit_cb, pca); - - for (i = 0; i < ncount; i++) - rtvals[i] = zfs_vm_pagerret_pend; - + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } From 3b64a9619f8f724ecac3e280a235f6b56d20ee1c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 1 Jul 2025 09:24:23 +1000 Subject: [PATCH 61/61] FreeBSD: zfs_putpages: don't undirty pages until after write completes In syncing mode, zfs_putpages() would put the entire range of pages onto the ZIL, then return VM_PAGER_OK for each page to the kernel. However, an associated zil_commit() or txg sync had not happened at this point, so the write may not actually be on disk. So, we rework that case to use a ZIL commit callback, and do the post-write work of undirtying the page and signaling completion there. We return VM_PAGER_PEND to the kernel instead so it knows that we will take care of it. 
The original version of this (238eab7dc1) copied the Linux model and did the cleanup in a ZIL callback for both async and sync. This was a mistake, as FreeBSD does not have a separate "busy for writeback" flag like Linux which keeps the page usable. The full sbusy flag locks the entire page out until the itx callback fires, which for async is after txg sync, which could be literal seconds in the future. For the async case, the data is already on the DMU and the in-memory ZIL, which is sufficient for async writeback, so the old method of logging it without a callback, undirtying the page and returning is more than sufficient and reclaims that lost performance. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Mark Johnston Signed-off-by: Rob Norris Closes #17533 --- include/os/freebsd/spl/sys/vm.h | 1 + module/os/freebsd/spl/spl_vm.c | 1 + module/os/freebsd/zfs/zfs_vnops_os.c | 87 +++++++++++++++++++++++----- 3 files changed, 75 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h index 454078f0fe79..d36bee881d0b 100644 --- a/include/os/freebsd/spl/sys/vm.h +++ b/include/os/freebsd/spl/sys/vm.h @@ -35,6 +35,7 @@ extern const int zfs_vm_pagerret_bad; extern const int zfs_vm_pagerret_error; extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerret_pend; extern const int zfs_vm_pagerput_sync; extern const int zfs_vm_pagerput_inval; diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c index 733c2bd07ebb..9d5f025423a1 100644 --- a/module/os/freebsd/spl/spl_vm.c +++ b/module/os/freebsd/spl/spl_vm.c @@ -43,6 +43,7 @@ const int zfs_vm_pagerret_bad = VM_PAGER_BAD; const int zfs_vm_pagerret_error = VM_PAGER_ERROR; const int zfs_vm_pagerret_ok = VM_PAGER_OK; +const int zfs_vm_pagerret_pend = VM_PAGER_PEND; const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; const int zfs_vm_pagerput_inval = 
VM_PAGER_PUT_INVAL; diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 0fa2003554cc..6bacda949061 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -25,6 +25,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ @@ -4084,6 +4085,33 @@ zfs_freebsd_getpages(struct vop_getpages_args *ap) ap->a_rahead)); } +typedef struct { + uint_t pca_npages; + vm_page_t pca_pages[]; +} putpage_commit_arg_t; + +static void +zfs_putpage_commit_cb(void *arg) +{ + putpage_commit_arg_t *pca = arg; + vm_object_t object = pca->pca_pages[0]->object; + + zfs_vmobject_wlock(object); + + for (uint_t i = 0; i < pca->pca_npages; i++) { + vm_page_t pp = pca->pca_pages[i]; + vm_page_undirty(pp); + vm_page_sunbusy(pp); + } + + vm_object_pip_wakeupn(object, pca->pca_npages); + + zfs_vmobject_wunlock(object); + + kmem_free(pca, + offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages])); +} + static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) @@ -4185,10 +4213,12 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, } if (zp->z_blksz < PAGE_SIZE) { - for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { - tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; + vm_ooffset_t woff = off; + size_t wlen = len; + for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) { + tocopy = MIN(PAGE_SIZE, wlen); va = zfs_map_page(ma[i], &sf); - dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx); zfs_unmap_page(sf); } } else { @@ -4209,19 +4239,48 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); - /* - * XXX we should be passing a callback to undirty - * but that would make the locking messier - */ - zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, - len, commit, B_FALSE, NULL, NULL); - zfs_vmobject_wlock(object); - for (i = 0; i < ncount; i++) { - rtvals[i] = zfs_vm_pagerret_ok; - vm_page_undirty(ma[i]); + if (commit) { + /* + * Caller requested that we commit immediately. We set + * a callback on the log entry, to be called once its + * on disk after the call to zil_commit() below. The + * pages will be undirtied and unbusied there. + */ + putpage_commit_arg_t *pca = kmem_alloc( + offsetof(putpage_commit_arg_t, pca_pages[ncount]), + KM_SLEEP); + pca->pca_npages = ncount; + memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount); + + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, + B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca); + + for (i = 0; i < ncount; i++) + rtvals[i] = zfs_vm_pagerret_pend; + } else { + /* + * Caller just wants the page written back somewhere, + * but doesn't need it committed yet. We've already + * written it back to the DMU, so we just need to put + * it on the async log, then undirty the page and + * return. + * + * We cannot use a callback here, because it would keep + * the page busy (locked) until it is eventually + * written down at txg sync. 
+ */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, + B_FALSE, B_FALSE, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); } - zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); }