diff --git a/config/kernel-dentry-operations.m4 b/config/kernel-dentry-operations.m4 index 6d87ad0e0710..ce0e6e5be959 100644 --- a/config/kernel-dentry-operations.m4 +++ b/config/kernel-dentry-operations.m4 @@ -46,12 +46,37 @@ AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ ]) ]) +dnl # +dnl # 6.17 API change +dnl # sb->s_d_op removed; set_default_d_op(sb, dop) added +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP], [ + ZFS_LINUX_TEST_SRC([set_default_d_op], [ + #include <linux/dcache.h> + ], [ + set_default_d_op(NULL, NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SET_DEFAULT_D_OP], [ + AC_MSG_CHECKING([whether set_default_d_op() is available]) + ZFS_LINUX_TEST_RESULT([set_default_d_op], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_DEFAULT_D_OP, 1, + [Define if set_default_d_op() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [ ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS ZFS_AC_KERNEL_SRC_D_SET_D_OP + ZFS_AC_KERNEL_SRC_SET_DEFAULT_D_OP ]) AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [ ZFS_AC_KERNEL_D_OBTAIN_ALIAS ZFS_AC_KERNEL_D_SET_D_OP + ZFS_AC_KERNEL_SET_DEFAULT_D_OP ]) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index f5a9105cd885..8994aab889fe 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -55,6 +55,7 @@ extern const struct file_operations zpl_dir_file_operations; extern void zpl_prune_sb(uint64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; +extern const struct dentry_operations zpl_dentry_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 7f1adaceb408..11bcbf430210 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -18,7 +18,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd August 14, 2025 +.Dd September 15, 2025 .Dt ZFS 4 .Os . 
@@ -2583,6 +2583,49 @@ the xattr so as to not accumulate duplicates. .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . +.It Sy zfs_delete_inode Ns = Ns Sy 0 Ns | Ns 1 Pq int +Sets whether the kernel should free an inode structure when the last reference +is released, or cache it in memory. +Intended for testing/debugging. +.Pp +A live inode structure "pins" various internal OpenZFS structures in memory, +which can result in large amounts of "unusable" memory on systems with lots of +infrequently-accessed files, until the kernel's memory pressure mechanism +asks OpenZFS to release them. +.Pp +The default value of +.Sy 0 +always caches inodes that appear to still exist on disk. +Setting it to +.Sy 1 +will immediately release unused inodes and their associated memory back to the +dbuf cache or the ARC for reuse, but may reduce performance if inodes are +frequently evicted and reloaded. +.Pp +This parameter is only available on Linux. +. +.It Sy zfs_delete_dentry Ns = Ns Sy 0 Ns | Ns 1 Pq int +Sets whether the kernel should free a dentry structure when it is no longer +required, or hold it in the dentry cache. +Intended for testing/debugging. +.Pp +Since a dentry structure holds an inode reference, a cached dentry can "pin" +an inode in memory indefinitely, along with associated OpenZFS structures (see +.Sy zfs_delete_inode ) . +.Pp +The default value of +.Sy 0 +instructs the kernel to cache entries and their associated inodes when they +are no longer directly referenced. +They will be reclaimed as part of the kernel's normal cache management +processes. +Setting it to +.Sy 1 +will instruct the kernel to release directory entries and their inodes as soon +as they are no longer referenced by the filesystem. +.Pp +This parameter is only available on Linux. +. .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. 
These workers are responsible for I/O work such as compression, encryption, diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index cd606e667bff..8a7d14ab6119 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1556,6 +1556,12 @@ zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_SET_DEFAULT_D_OP + set_default_d_op(sb, &zpl_dentry_operations); +#else + sb->s_d_op = &zpl_dentry_operations; +#endif + /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index 53819628627d..444948d03cb3 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2023, Datto Inc. All rights reserved. + * Copyright (c) 2025, Klara, Inc. */ @@ -33,6 +34,20 @@ #include #include +/* + * What to do when the last reference to an inode is released. If 0, the kernel + * will cache it on the superblock. If 1, the inode will be freed immediately. + * See zpl_drop_inode(). + */ +int zfs_delete_inode = 0; + +/* + * What to do when the last reference to a dentry is released. If 0, the kernel + * will cache it until the entry (file) is destroyed. If 1, the dentry will be + * marked for cleanup, at which time its inode reference will be released. See + * zpl_dentry_delete(). + */ +int zfs_delete_dentry = 0; static struct inode * zpl_inode_alloc(struct super_block *sb) @@ -77,11 +92,36 @@ zpl_dirty_inode(struct inode *ip, int flags) } /* - * When ->drop_inode() is called its return value indicates if the - * inode should be evicted from the inode cache. If the inode is - * unhashed and has no links the default policy is to evict it - * immediately. 
+ * ->drop_inode() is called when the last reference to an inode is released. + * Its return value indicates if the inode should be destroyed immediately, or + * cached on the superblock structure. + * + * By default (zfs_delete_inode=0), we call generic_drop_inode(), which returns + * "destroy immediately" if the inode is unhashed and has no links (roughly: no + * longer exists on disk). On datasets with millions of rarely-accessed files, + * this can cause a large amount of memory to be "pinned" by cached inodes, + * which in turn pin their associated dnodes and dbufs, until the kernel starts + * reporting memory pressure and requests OpenZFS release some memory (see + * zfs_prune()). + * + * When set to 1, we call generic_delete_inode(), which always returns "destroy + * immediately", resulting in inodes being destroyed immediately, releasing + * their associated dnodes and dbufs to the dbuf cache and the ARC to be + * evicted as normal. * + * Note that the "last reference" doesn't always mean the last _userspace_ + * reference; the dentry cache also holds a reference, so "busy" inodes will + * still be kept alive that way (subject to dcache tuning). + */ +static int +zpl_drop_inode(struct inode *ip) +{ + if (zfs_delete_inode) + return (generic_delete_inode(ip)); + return (generic_drop_inode(ip)); +} + +/* * The ->evict_inode() callback must minimally truncate the inode pages, * and call clear_inode(). For 2.6.35 and later kernels this will * simply update the inode state, with the sync occurring before the @@ -470,6 +510,7 @@ const struct super_operations zpl_super_operations = { .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, + .drop_inode = zpl_drop_inode, .evict_inode = zpl_evict_inode, .put_super = zpl_put_super, .sync_fs = zpl_sync_fs, @@ -480,6 +521,35 @@ const struct super_operations zpl_super_operations = { .show_stats = NULL, }; +/* + * ->d_delete() is called when the last reference to a dentry is released. 
Its + * return value indicates if the dentry should be destroyed immediately, or + * retained in the dentry cache. + * + * By default (zfs_delete_dentry=0) the kernel will always cache unused + * entries. Each dentry holds an inode reference, so cached dentries can hold + * the final inode reference indefinitely, leading to the inode and its related + * data being pinned (see zpl_drop_inode()). + * + * When set to 1, we signal that the dentry should be destroyed immediately and + * never cached. This reduces memory usage, at the cost of higher overheads to + * look up a file, as the inode and its underlying data (dnode/dbuf) need to be + * reloaded and reinflated. + * + * Note that userspace does not have direct control over dentry references and + * reclaim; rather, this is part of the kernel's caching and reclaim subsystems + * (e.g. vm.vfs_cache_pressure). + */ +static int +zpl_dentry_delete(const struct dentry *dentry) +{ + return (zfs_delete_dentry ? 1 : 0); +} + +const struct dentry_operations zpl_dentry_operations = { + .d_delete = zpl_dentry_delete, +}; + struct file_system_type zpl_fs_type = { .owner = THIS_MODULE, .name = ZFS_DRIVER, @@ -491,3 +561,10 @@ struct file_system_type zpl_fs_type = { .mount = zpl_mount, .kill_sb = zpl_kill_sb, }; + +ZFS_MODULE_PARAM(zfs, zfs_, delete_inode, INT, ZMOD_RW, + "Delete inodes as soon as the last reference is released."); + +ZFS_MODULE_PARAM(zfs, zfs_, delete_dentry, INT, ZMOD_RW, + "Delete dentries from dentry cache as soon as the last reference is " + "released.");