25
25
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26
26
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27
27
* Copyright 2017 Nexenta Systems, Inc.
28
+ * Copyright (c) 2025, Klara, Inc.
28
29
*/
29
30
30
31
/* Portions Copyright 2007 Jeremy Teo */
@@ -3875,17 +3876,49 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3875
3876
3876
3877
err = sa_bulk_update (zp -> z_sa_hdl , bulk , cnt , tx );
3877
3878
3878
- boolean_t commit = B_FALSE ;
3879
- if (wbc -> sync_mode != WB_SYNC_NONE ) {
3880
- /*
3881
- * Note that this is rarely called under writepages(), because
3882
- * writepages() normally handles the entire commit for
3883
- * performance reasons.
3884
- */
3885
- commit = B_TRUE ;
3886
- }
3879
+ /*
3880
+ * A note about for_sync vs wbc->sync_mode.
3881
+ *
3882
+ * for_sync indicates that this is a syncing writeback, that is, kernel
3883
+ * caller expects the data to be durably stored before being notified.
3884
+ * Often, but not always, the call was triggered by a userspace syncing
3885
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
3886
+ * means that that page should remain "locked" (in the writeback state)
3887
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
3888
+ * Otherwise, we can unlock and return as soon as it is on the
3889
+ * in-memory ZIL.
3890
+ *
3891
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
3892
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
3893
+ * indicates this a regular async writeback (eg a cache eviction) and
3894
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
3895
+ * a syncing op that must be waited on (by convention, we test for
3896
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
3897
+ * performance should there ever be a new mode that we have not yet
3898
+ * added support for).
3899
+ *
3900
+ * So, why a separate for_sync field? This is because zpl_writepages()
3901
+ * calls zfs_putpage() multiple times for a single "logical" operation.
3902
+ * It wants all the individual pages to be for_sync==TRUE ie only
3903
+ * unlocked once durably stored, but it only wants one call to
3904
+ * zil_commit() at the very end, once all the pages are synced. So,
3905
+ * it repurposes sync_mode slightly to indicate who issue and wait for
3906
+ * the IO: for NONE, the caller to zfs_putpage() will do it, while for
3907
+ * ALL, zfs_putpage should do it.
3908
+ *
3909
+ * Summary:
3910
+ * for_sync: 0=unlock immediately; 1 unlock once on disk
3911
+ * sync_mode: NONE=caller will commit; ALL=we will commit
3912
+ */
3913
+ boolean_t need_commit = (wbc -> sync_mode != WB_SYNC_NONE );
3887
3914
3888
- zfs_log_write (zfsvfs -> z_log , tx , TX_WRITE , zp , pgoff , pglen , commit ,
3915
+ /*
3916
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
3917
+ * because it is a policy flag that indicates "someone will call
3918
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
3919
+ * question is whether it will be us, or zpl_writepages().
3920
+ */
3921
+ zfs_log_write (zfsvfs -> z_log , tx , TX_WRITE , zp , pgoff , pglen , for_sync ,
3889
3922
B_FALSE , for_sync ? zfs_putpage_commit_cb : NULL , pp );
3890
3923
3891
3924
if (!for_sync ) {
@@ -3897,7 +3930,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3897
3930
3898
3931
zfs_rangelock_exit (lr );
3899
3932
3900
- if (commit )
3933
+ if (need_commit )
3901
3934
zil_commit (zfsvfs -> z_log , zp -> z_id );
3902
3935
3903
3936
dataset_kstats_update_write_kstats (& zfsvfs -> z_kstat , pglen );
0 commit comments