From 767b37019f1aac5d958e3b38ceecd292537df94f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 14:35:48 +1000 Subject: [PATCH 01/65] linux/zvol_os: tidy and document queue limit/config setup It gets hairier again in Linux 6.11, so I want some actual theory of operation laid out for next time. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- module/os/linux/zfs/zvol_os.c | 45 +++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 83f80f62aee7..48e49a50a9c3 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2024, Rob Norris * Copyright (c) 2024, Klara, Inc. */ @@ -1089,6 +1090,34 @@ static const struct block_device_operations zvol_ops = { #endif }; +/* + * Since 6.9, Linux has been removing queue limit setters in favour of an + * initial queue_limits struct applied when the device is open. Since 6.11, + * queue_limits is being extended to allow more things to be applied when the + * device is open. Setters are also being removed for this. + * + * For OpenZFS, this means that depending on kernel version, some options may + * be set up before the device is open, and some applied to an open device + * (queue) after the fact. + * + * We manage this complexity by having our own limits struct, + * zvol_queue_limits_t, in which we carry any queue config that we're + * interested in setting. This structure is the same on all kernels. + * + * These limits are then applied to the queue at device open time by the most + * appropriate method for the kernel. + * + * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of + * blk_alloc_disk() exists). This converts our limits struct to a proper Linux + * struct queue_limits, and passes it in. Any fields added in later kernels are + * (obviously) not set up here. + * + * zvol_queue_limits_apply() is called on all kernel versions after the queue + * is created, and applies any remaining config. Before 6.9 that will be + * everything, via setter methods. After 6.9 that will be whatever couldn't be + * put into struct queue_limits. (This implies that zvol_queue_limits_apply() + * will always be a no-op on the latest kernel we support). 
+ */ typedef struct zvol_queue_limits { unsigned int zql_max_hw_sectors; unsigned short zql_max_segments; @@ -1175,17 +1204,18 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; } -#else +#endif + static void zvol_queue_limits_apply(zvol_queue_limits_t *limits, struct request_queue *queue) { +#ifndef HAVE_BLK_ALLOC_DISK_2ARG blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); } -#endif static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) @@ -1223,7 +1253,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) } zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); @@ -1237,8 +1266,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) } zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + + zvol_queue_limits_apply(limits, zso->zvo_queue); + return (0); } @@ -1260,7 +1291,6 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) return (1); } zso->zvo_queue = zso->zvo_disk->queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; @@ -1291,10 +1321,11 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; - zvol_queue_limits_apply(limits, zso->zvo_queue); - #endif + + zvol_queue_limits_apply(limits, zso->zvo_queue); #endif + return (0); } From e95b732e4997cf7ca9c4ba1c4f2fc2e64151dae7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Jul 2024 21:40:35 +1000 Subject: [PATCH 02/65] Linux 6.11: enable queue flush through queue limits In 6.11 struct queue_limits gains a 'features' field, where, among other things, flush and write-cache are enabled. Detect it and use it. Along the way, the blk_queue_set_write_cache() compat wrapper gets a little cleanup. Since both flags are alway set together, its now a single bool. Also the very very ancient version that sets q->flush_flags directly couldn't actually turn it off, so I've fixed that. Not that we use it, but still. 
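(For reference, a rough sketch of how the two paths end up; this is not code from this
commit and the function name is made up for illustration. The macros and helpers are the
ones used in the diff below.)

    static void
    sketch_enable_flush(struct queue_limits *qlimits, struct request_queue *queue)
    {
    #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
            /* 6.11+: declared up front, applied when blk_alloc_disk(lim, node) runs */
            qlimits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
    #else
            /* older kernels: set on the live queue via the compat wrapper */
            blk_queue_set_write_cache(queue, B_TRUE);
    #endif
    }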
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- config/kernel-make-request-fn.m4 | 21 +++++++++++++ include/os/linux/kernel/linux/blkdev_compat.h | 31 ++++++++++++------- module/os/linux/zfs/zvol_os.c | 12 +++++-- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 9813ad2fb3f3..4c54bdd6d4a2 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -58,6 +58,13 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(lim, NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [ + #include + ],[ + struct queue_limits *lim = NULL; + lim->features = 0; + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include ],[ @@ -114,6 +121,20 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ AC_MSG_RESULT(yes) AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + dnl # + dnl # Linux 6.11 API change: + dnl # struct queue_limits gains a 'features' field, + dnl # used to set flushing options + dnl # + AC_MSG_CHECKING([whether struct queue_limits has a features field]) + ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1, + [struct queue_limits has a features field]) + ], [ + AC_MSG_RESULT(no) + ]) + dnl # dnl # 5.20 API change, dnl # Removed blk_cleanup_disk(), put_disk() should be used. diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 658f546213de..b7c21f5b317a 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -57,6 +57,11 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) #endif /* + * 6.11 API + * Setting the flush flags directly is no longer possible; flush flags are set + * on the queue_limits structure and passed to blk_disk_alloc(). In this case + * we remove this function entirely. + * * 4.7 API, * The blk_queue_write_cache() interface has replaced blk_queue_flush() * interface. However, the new interface is GPL-only thus we implement @@ -68,31 +73,33 @@ blk_queue_flag_clear(unsigned int flag, struct request_queue *q) * new one is GPL-only. Thus if the GPL-only version is detected we * implement our own trivial helper. */ +#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \ + !defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES) static inline void -blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) +blk_queue_set_write_cache(struct request_queue *q, bool on) { #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY) - if (wc) + if (on) { blk_queue_flag_set(QUEUE_FLAG_WC, q); - else - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_WC, q); blk_queue_flag_clear(QUEUE_FLAG_FUA, q); + } #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE) - blk_queue_write_cache(q, wc, fua); + blk_queue_write_cache(q, on, on); #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) - if (wc) - q->flush_flags |= REQ_FLUSH; - if (fua) - q->flush_flags |= REQ_FUA; + if (on) + q->flush_flags |= REQ_FLUSH | REQ_FUA; + else + q->flush_flags &= ~(REQ_FLUSH | REQ_FUA); #elif defined(HAVE_BLK_QUEUE_FLUSH) - blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0)); + blk_queue_flush(q, on ? 
(REQ_FLUSH | REQ_FUA) : 0); #else #error "Unsupported kernel" #endif } +#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */ /* * Detect if a device has a write cache. Used to set the intial value for the diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 48e49a50a9c3..044e9a35600e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1203,6 +1203,9 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; +#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; +#endif } #endif @@ -1215,6 +1218,9 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); +#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + blk_queue_set_write_cache(queue, B_TRUE); +#endif } static int @@ -1238,6 +1244,10 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) return (1); } +#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES + blk_queue_set_write_cache(zso->zvo_queue, B_TRUE); +#endif + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; @@ -1391,8 +1401,6 @@ zvol_alloc(dev_t dev, const char *name) if (ret != 0) goto out_kmem; - blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); - /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); From 7e98d30f467233b5b07e03de6942e045cf33ee9b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 30 Jul 2024 22:25:50 +1000 Subject: [PATCH 03/65] Linux 6.11: get backing_dev_info through queue gendisk It's no longer available directly on the request queue, but its easy to get from the attached disk. 
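(Sketch only, simplified from the blk_queue_set_read_ahead() compat shim in the diff
below; the 5.9+/5.15+ *_update_readahead() cases are omitted here.)

    static inline void
    sketch_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
    {
    #if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
            q->backing_dev_info->ra_pages = ra_pages;       /* 4.12 - 6.10 */
    #elif defined(HAVE_BLK_QUEUE_DISK_BDI)
            q->disk->bdi->ra_pages = ra_pages;              /* 6.11+ */
    #else
            q->backing_dev_info.ra_pages = ra_pages;        /* <= 4.11 */
    #endif
    }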
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- config/kernel-blk-queue.m4 | 28 +++++++++++++++++++ include/os/linux/kernel/linux/blkdev_compat.h | 4 ++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index 2f0b386e6637..a064140f337a 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -25,6 +25,8 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [ dnl # dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue dnl # 4.12: dynamically allocated bdi in request_queue +dnl # 6.11: bdi no longer available through request_queue, so get it from +dnl # the gendisk attached to the queue dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [ ZFS_LINUX_TEST_SRC([blk_queue_bdi], [ @@ -47,6 +49,30 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [ + ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [ + #include + #include + ], [ + struct request_queue q; + struct gendisk disk; + struct backing_dev_info bdi __attribute__ ((unused)); + q.disk = &disk; + q.disk->bdi = &bdi; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [ + AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk]) + ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1, + [backing_dev_info is available through queue gendisk]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 5.9: added blk_queue_update_readahead(), dnl # 5.15: renamed to disk_update_readahead() @@ -407,6 +433,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI + ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE @@ -421,6 +448,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ ZFS_AC_KERNEL_BLK_QUEUE_PLUG ZFS_AC_KERNEL_BLK_QUEUE_BDI + ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD ZFS_AC_KERNEL_BLK_QUEUE_DISCARD ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index b7c21f5b317a..c2e818b4d4ee 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -133,8 +133,10 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \ !defined(HAVE_DISK_UPDATE_READAHEAD) -#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC +#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC) q->backing_dev_info->ra_pages = ra_pages; +#elif defined(HAVE_BLK_QUEUE_DISK_BDI) + q->disk->bdi->ra_pages = ra_pages; #else q->backing_dev_info.ra_pages = ra_pages; #endif From 22619523f6e5d66aad92d6c443976f9d29bd039f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 12:15:07 +1000 Subject: [PATCH 04/65] Linux 6.11: first arg to proc_handler is now const Detect it, and use a macro to make sure we always match the prototype. 
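(Illustrative sketch of the consumer side, with a hypothetical handler name. Written
against the CONST_CTL_TABLE macro added in spl-proc.c below, the same source matches
both the pre-6.11 non-const and the 6.11+ const prototypes, so it can be assigned to
.proc_handler without a cast.)

    static int
    sketch_handler(CONST_CTL_TABLE *table, int write,
        void *buffer, size_t *lenp, loff_t *ppos)
    {
            if (!write)
                    *lenp = 0;      /* nothing to report back in this sketch */
            return (0);
    }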
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- config/kernel-register_sysctl_table.m4 | 33 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 12 +++++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 index a5e934f56d29..b8a0e0b17332 100644 --- a/config/kernel-register_sysctl_table.m4 +++ b/config/kernel-register_sysctl_table.m4 @@ -25,3 +25,36 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ AC_MSG_RESULT([no]) ]) ]) + +dnl # +dnl # Linux 6.11 makes const the ctl_table arg of proc_handler +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [ + ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [ + #include + + static int test_handler( + const struct ctl_table *ctl __attribute((unused)), + int write __attribute((unused)), + void *buffer __attribute((unused)), + size_t *lenp __attribute((unused)), + loff_t *ppos __attribute((unused))) + { + return (0); + } + ], [ + proc_handler *ph __attribute((unused)) = + &test_handler; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [ + AC_MSG_CHECKING([whether proc_handler ctl_table arg is const]) + ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1, + [proc_handler ctl_table arg is const]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index f282ccd8b9d7..6194c119cca6 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE @@ -319,6 +320,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_SIZE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index f0f929d3ce90..22f587934d25 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -43,6 +43,12 @@ typedef struct ctl_table __no_const spl_ctl_table; typedef struct ctl_table spl_ctl_table; #endif +#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST +#define CONST_CTL_TABLE const struct ctl_table +#else +#define CONST_CTL_TABLE struct ctl_table +#endif + static unsigned long table_min = 0; static unsigned long table_max = ~0; @@ -60,7 +66,7 @@ struct proc_dir_entry *proc_spl_kstat = NULL; #ifdef DEBUG_KMEM static int -proc_domemused(struct ctl_table *table, int write, +proc_domemused(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; @@ -88,7 +94,7 @@ proc_domemused(struct ctl_table *table, int write, #endif /* DEBUG_KMEM */ static int -proc_doslab(struct ctl_table *table, int write, +proc_doslab(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int rc = 0; @@ -135,7 +141,7 @@ proc_doslab(struct ctl_table *table, int write, } static int -proc_dohostid(struct ctl_table *table, int write, +proc_dohostid(CONST_CTL_TABLE *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char *end, str[32]; From 
0b741a03513553550e189d7337fb2ee6828c704c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 14:48:58 +1000 Subject: [PATCH 05/65] Linux 6.11: IO stats is now a queue feature flag Apply them with with the rest of the settings. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- module/os/linux/zfs/zvol_os.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 044e9a35600e..5aad4e430c8d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1204,7 +1204,8 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES - qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; + qlimits->features = + BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; #endif } #endif @@ -1220,6 +1221,7 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_io_opt(queue, limits->zql_io_opt); #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); #endif } @@ -1409,9 +1411,6 @@ zvol_alloc(dev_t dev, const char *name) blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } - /* Enable /proc/diskstats */ - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); - zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; From f5236fe47ac0d4be84089651a19a2f5b713f9798 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 17:22:20 +1000 Subject: [PATCH 06/65] Linux 6.11: add more queue_limit fields with removed setters These fields are very old, so no detection necessary; we just move them into the limit setup functions. 
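For a sense of scale (illustrative numbers, assuming the default zvol_max_discard_blocks
of 16384): a zvol with a 16K volblocksize ends up with zql_max_discard_sectors =
(16384 * 16384) >> 9 = 524288 sectors, i.e. a 256 MiB cap per discard request, with a
16K discard granularity and physical block size.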
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- module/os/linux/zfs/zvol_os.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 5aad4e430c8d..2beec6436bff 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1123,6 +1123,9 @@ typedef struct zvol_queue_limits { unsigned short zql_max_segments; unsigned int zql_max_segment_size; unsigned int zql_io_opt; + unsigned int zql_physical_block_size; + unsigned int zql_max_discard_sectors; + unsigned int zql_discard_granularity; } zvol_queue_limits_t; static void @@ -1191,6 +1194,11 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, } limits->zql_io_opt = zv->zv_volblocksize; + + limits->zql_physical_block_size = zv->zv_volblocksize; + limits->zql_max_discard_sectors = + (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; + limits->zql_discard_granularity = zv->zv_volblocksize; } #ifdef HAVE_BLK_ALLOC_DISK_2ARG @@ -1203,6 +1211,9 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits, qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; + qlimits->physical_block_size = limits->zql_physical_block_size; + qlimits->max_discard_sectors = limits->zql_max_discard_sectors; + qlimits->discard_granularity = limits->zql_discard_granularity; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; @@ -1219,6 +1230,10 @@ zvol_queue_limits_apply(zvol_queue_limits_t *limits, blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); + blk_queue_physical_block_size(queue, limits->zql_physical_block_size); + blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); + blk_queue_discard_granularity(queue, limits->zql_discard_granularity); +#endif #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); @@ -1677,14 +1692,6 @@ zvol_os_create_minor(const char *name) set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); - - - blk_queue_physical_block_size(zv->zv_zso->zvo_queue, - zv->zv_volblocksize); - blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, - (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); - blk_queue_discard_granularity(zv->zv_zso->zvo_queue, - zv->zv_volblocksize); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif From 3abffc8781ddd8a49434ec6cadf10c68bb699533 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 18:43:39 +1000 Subject: [PATCH 07/65] Linux 6.11: add compat macro for page_mapping() Since the change to folios it has just been a wrapper anyway. Linux has removed their wrapper, so we add one. 
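(Sketch of what a caller sees; the helper name here is hypothetical. Existing callers,
such as those in zfs_vnops_os.c, keep calling page_mapping() unchanged and just pick up
the compat header.)

    #include <linux/mm_compat.h>

    static struct address_space *
    sketch_mapping_of(struct page *pp)
    {
            /* expands to folio_mapping(page_folio(pp)) when the kernel
             * no longer provides page_mapping() itself */
            return (page_mapping(pp));
    }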
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- config/kernel-mm-page-size.m4 | 17 ----------- config/kernel-mm-pagemap.m4 | 36 +++++++++++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/kernel/linux/mm_compat.h | 7 +++++ module/os/linux/zfs/zfs_vnops_os.c | 1 + 5 files changed, 46 insertions(+), 17 deletions(-) delete mode 100644 config/kernel-mm-page-size.m4 create mode 100644 config/kernel-mm-pagemap.m4 diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 deleted file mode 100644 index d5ebd926986a..000000000000 --- a/config/kernel-mm-page-size.m4 +++ /dev/null @@ -1,17 +0,0 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ - ZFS_LINUX_TEST_SRC([page_size], [ - #include - ],[ - unsigned long s; - s = page_size(NULL); - ]) -]) -AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ - AC_MSG_CHECKING([whether page_size() is available]) - ZFS_LINUX_TEST_RESULT([page_size], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) - ],[ - AC_MSG_RESULT(no) - ]) -]) diff --git a/config/kernel-mm-pagemap.m4 b/config/kernel-mm-pagemap.m4 new file mode 100644 index 000000000000..466b6fa07d9a --- /dev/null +++ b/config/kernel-mm-pagemap.m4 @@ -0,0 +1,36 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + + +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [ + ZFS_LINUX_TEST_SRC([page_mapping], [ + #include + ],[ + struct page *p = NULL; + struct address_space *m = page_mapping(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [ + AC_MSG_CHECKING([whether page_mapping() is available]) + ZFS_LINUX_TEST_RESULT([page_mapping], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 6194c119cca6..d6ea3453292a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -171,6 +171,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE + ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -324,6 +325,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_SIZE + ZFS_AC_KERNEL_MM_PAGE_MAPPING case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h index 40056c68d6dd..817f6df422de 100644 --- a/include/os/linux/kernel/linux/mm_compat.h +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -21,16 +21,23 @@ /* * Copyright (c) 2023, 2024, Klara Inc. + * Copyright (c) 2024, Rob Norris */ #ifndef _ZFS_MM_COMPAT_H #define _ZFS_MM_COMPAT_H #include +#include /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */ #ifndef HAVE_MM_PAGE_SIZE #define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) #endif +/* 6.11 removed page_mapping(). 
A simple wrapper around folio_mapping() works */ +#ifndef HAVE_MM_PAGE_MAPPING +#define page_mapping(p) folio_mapping(page_folio(p)) +#endif + #endif /* _ZFS_MM_COMPAT_H */ diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 1cecad9f7755..8061169c3293 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -69,6 +69,7 @@ #include #include #include +#include /* * Programming rules. From 2633075e0905bbe4989a469c7d5892f2cf1108be Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 31 Jul 2024 21:39:31 +1000 Subject: [PATCH 08/65] Linux 6.11: avoid passing "end" sentinel to register_sysctl() Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #16400 --- config/kernel-register_sysctl_table.m4 | 26 ++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 41 ++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 index b8a0e0b17332..12ffe9d95142 100644 --- a/config/kernel-register_sysctl_table.m4 +++ b/config/kernel-register_sysctl_table.m4 @@ -26,6 +26,32 @@ AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ ]) ]) +dnl # +dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer +dnl # supply a sentinel end-of-table element. 6.6 introduces +dnl # register_sysctl_sz() to enable callers to choose, so we use it if +dnl # available for backward compatibility. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [ + #include + ],[ + struct ctl_table test_table[] __attribute__((unused)) = {0}; + register_sysctl_sz("", test_table, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [ + AC_MSG_CHECKING([whether register_sysctl_sz exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1, + [register_sysctl_sz exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + dnl # dnl # Linux 6.11 makes const the ctl_table arg of proc_handler dnl # diff --git a/config/kernel.m4 b/config/kernel.m4 index d6ea3453292a..4d471358d242 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV @@ -321,6 +322,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 22f587934d25..2c0cdd9febf5 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -22,6 +22,9 @@ * * Solaris Porting Layer (SPL) Proc Implementation. */ +/* + * Copyright (c) 2024, Rob Norris + */ #include #include @@ -694,6 +697,37 @@ static void spl_proc_cleanup(void) } } +#ifndef HAVE_REGISTER_SYSCTL_TABLE + +/* + * Traditionally, struct ctl_table arrays have been terminated by an "empty" + * sentinel element (specifically, one with .procname == NULL). 
+ * + * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so + * that callers could provide the size directly, and redefining + * register_sysctl() to just call register_sysctl_sz() with the array size. It + * retained support for the terminating element so that existing callers would + * continue to work. + * + * Linux 6.11 removed support for the terminating element, instead interpreting + * it as a real malformed element, and rejecting it. + * + * In order to continue support older kernels, we retain the terminating + * sentinel element for our sysctl tables, but instead detect availability of + * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping + * the kernel from trying to process the terminator. For pre-6.6 kernels that + * don't have register_sysctl_sz(), we just use register_sysctl(), which can + * handle the terminating element as it always has. + */ +#ifdef HAVE_REGISTER_SYSCTL_SZ +#define spl_proc_register_sysctl(p, t) \ + register_sysctl_sz(p, t, ARRAY_SIZE(t)-1) +#else +#define spl_proc_register_sysctl(p, t) \ + register_sysctl(p, t) +#endif +#endif + int spl_proc_init(void) { @@ -704,16 +738,17 @@ spl_proc_init(void) if (spl_header == NULL) return (-EUNATCH); #else - spl_header = register_sysctl("kernel/spl", spl_table); + spl_header = spl_proc_register_sysctl("kernel/spl", spl_table); if (spl_header == NULL) return (-EUNATCH); - spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table); if (spl_kmem == NULL) { rc = -EUNATCH; goto out; } - spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat", + spl_kstat_table); if (spl_kstat == NULL) { rc = -EUNATCH; goto out; From d06de4f007947a9bc73f5796dd57a70ede3a525f Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 14 Aug 2024 12:27:07 -0700 Subject: [PATCH 09/65] ZTS: Use /dev/urandom instead of /dev/random Use /dev/urandom so we never have to wait on entropy. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16442 --- tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib | 2 +- .../functional/block_cloning/block_cloning_rlimit_fsize.ksh | 2 +- .../zfs-tests/tests/functional/fault/suspend_resume_single.ksh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib b/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib index e204f43b3bcd..795e71b26b5a 100644 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib @@ -55,7 +55,7 @@ function display_status ((ret |= $?)) typeset mntpnt=$(get_prop mountpoint $pool) - dd if=/dev/random of=$mntpnt/testfile.$$ & + dd if=/dev/urandom of=$mntpnt/testfile.$$ & typeset pid=$! 
zpool iostat -v 1 3 > /dev/null diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh index a8a64e52491a..3632fc9a4df0 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh @@ -54,7 +54,7 @@ log_must truncate -s 1G $VDEV log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV -log_must dd if=/dev/random of=/$TESTPOOL/file1 bs=1 count=1000 +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000 ulimit -f 2 log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all diff --git a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh index 041dadb1eadb..05f3ac708477 100755 --- a/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh +++ b/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh @@ -42,7 +42,7 @@ log_onexit cleanup log_assert "ensure single-disk pool resumes properly after suspend and clear" # create a file, and take a checksum, so we can compare later -log_must dd if=/dev/random of=$DATAFILE bs=128K count=1 +log_must dd if=/dev/urandom of=$DATAFILE bs=128K count=1 typeset sum1=$(cat $DATAFILE | md5sum) # make a debug device that we can "unplug" From 244ea5c4881f92a9d7c1fb341a49b127fda7539d Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 14 Aug 2024 14:18:46 -0700 Subject: [PATCH 10/65] Add missing kstats to dataset kstats Reviewed-by: Tony Nguyen Reviewed-by: Rob Norris Signed-off-by: Paul Dagnelie Closes #16431 --- module/zfs/dataset_kstats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 2ac058fd2c93..8faa6c2a2528 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -40,6 +40,9 @@ static dataset_kstat_values_t empty_dataset_kstats = { { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, + { "zil_commit_error_count", KSTAT_DATA_UINT64 }, + { "zil_commit_stall_count", KSTAT_DATA_UINT64 }, + { "zil_commit_suspend_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, From d2ccc2155217bfdd7e19db4480d981a8f3fad46f Mon Sep 17 00:00:00 2001 From: Jitendra Patidar Date: Thu, 15 Aug 2024 06:29:19 +0530 Subject: [PATCH 11/65] Fix projid accounting for xattr objects zpool upgraded with 'feature@project_quota' needs re-layout of SA's to fix the SA_ZPL_PROJID at SA_PROJID_OFFSET (128). Its necessary for the correct accounting of object usage against its projid. Old object (created before upgrade) when gets a projid assigned, its SA gets re-layout via sa_add_projid(). If object has xattr dir, SA of xattr dir also gets re-layout. But SA re-layout of xattr objects inside a xattr dir is not done. Fix zfs_setattr_dir() to re-layout SA's on xattr objects, when setting projid on old xattr object (created before upgrade). 
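To make the expected accounting concrete (numbers are illustrative): a pre-upgrade
directory whose xattr dir holds three xattr objects should, once 'chattr -p 100' is
applied after the upgrade, charge 1 (the dir itself) + 1 (its xattr dir) + 3 (xattr
objects) = 5 objects to projectobjused@100. That is the count the updated test below
now derives from zdb, instead of hard-coding 1 or 2.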
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Jitendra Patidar Closes #16355 Closes #16356 --- module/os/linux/zfs/zfs_vnops_os.c | 28 +++++++++++++------ .../upgrade/upgrade_projectquota_001_pos.ksh | 19 +++++++++++-- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 8061169c3293..9803c7fecb5c 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1789,24 +1789,36 @@ zfs_setattr_dir(znode_t *dzp) &gid, sizeof (gid)); } - if (zp->z_projid != dzp->z_projid) { + + uint64_t projid = dzp->z_projid; + if (zp->z_projid != projid) { if (!(zp->z_pflags & ZFS_PROJID)) { - zp->z_pflags |= ZFS_PROJID; - SA_ADD_BULK_ATTR(bulk, count, - SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, - sizeof (zp->z_pflags)); + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + if (unlikely(err == EEXIST)) { + err = 0; + } else if (err != 0) { + goto sa_add_projid_err; + } else { + projid = ZFS_INVALID_PROJID; + } } - zp->z_projid = dzp->z_projid; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), - NULL, &zp->z_projid, sizeof (zp->z_projid)); + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } } +sa_add_projid_err: mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); + } else if (projid == ZFS_INVALID_PROJID) { + dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } diff --git a/tests/zfs-tests/tests/functional/upgrade/upgrade_projectquota_001_pos.ksh b/tests/zfs-tests/tests/functional/upgrade/upgrade_projectquota_001_pos.ksh index 2ad37e06a5f1..2c365e37af23 100755 --- a/tests/zfs-tests/tests/functional/upgrade/upgrade_projectquota_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/upgrade/upgrade_projectquota_001_pos.ksh @@ -63,6 +63,7 @@ log_must mkfiles $TESTDIR/fs2/tf $((RANDOM % 100 + 1)) log_must zfs create $TESTPOOL/fs3 log_must mkdir $TESTDIR/fs3/dir log_must mkfiles $TESTDIR/fs3/tf $((RANDOM % 100 + 1)) +log_must set_xattr_stdin passwd $TESTDIR/fs3/dir < /etc/passwd # Make sure project quota is disabled zfs projectspace -o used $TESTPOOL | grep -q "USED" && @@ -109,9 +110,23 @@ log_must chattr -p 100 $TESTDIR/fs3/dir log_must sleep 5 # upgrade done in the background so let's wait for a while zfs projectspace -o used $TESTPOOL/fs3 | grep -q "USED" || log_fail "project quota should be enabled for $TESTPOOL/fs3" +dirino=$(stat -c '%i' $TESTDIR/fs3/dir) +log_must zdb -ddddd $TESTPOOL/fs3 $dirino +xattrdirino=$(zdb -ddddd $TESTPOOL/fs3 $dirino |grep -w "xattr" |awk '{print $2}') +echo "xattrdirino: $xattrdirino" +expectedcnt=1 +echo "expectedcnt: $expectedcnt" +if [ "$xattrdirino" != "" ]; then + expectedcnt=$(($expectedcnt + 1)) + echo "expectedcnt: $expectedcnt" + log_must zdb -ddddd $TESTPOOL/fs3 $xattrdirino + xattrinocnt=$(zdb -ddddd $TESTPOOL/fs3 $xattrdirino |grep -w "(type:" |wc -l) + echo "xattrinocnt: $xattrinocnt" + expectedcnt=$(($expectedcnt + $xattrinocnt)) + echo "expectedcnt: $expectedcnt" +fi cnt=$(get_prop projectobjused@100 $TESTPOOL/fs3) -# if 'xattr=on', then 'cnt = 2' -[[ $cnt -ne 1 ]] && [[ $cnt -ne 2 ]] && +[[ $cnt -ne $expectedcnt ]] && log_fail "projectquota accounting failed $cnt" # All in all, after having been through this, the dataset for testpool From 83f359245adc9c03ee0ededa2ff00b7dd9f82d2a Mon Sep 17 00:00:00 2001 From: Gleb Smirnoff Date: Thu, 15 Aug 
2024 09:08:43 -0700 Subject: [PATCH 12/65] FreeBSD: fix build without kernel option MAC Reviewed-by: Alexander Motin Reviewed-by: Mark Johnston Reviewed-by: Tony Hutter Signed-off-by: Gleb Smirnoff Closes #16446 --- module/os/freebsd/zfs/zfs_vnops_os.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 283f56963170..01b964f98f3a 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6125,7 +6125,9 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; +#ifdef MAC out_locked: +#endif if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); From 963e6c9f3ffc0bc767ca8b89549be595f29f9470 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 16 Aug 2024 00:39:44 +0500 Subject: [PATCH 13/65] Fix incorrect error report on vdev attach/replace Report the correct error message in libzfs when attaching/replacing a vdev with a higher ashift. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #16449 --- lib/libzfs/libzfs_pool.c | 7 +++++++ module/zfs/spa.c | 6 ++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 8a043aa0f872..e493e8562a7d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3733,6 +3733,13 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, (void) zpool_standard_error(hdl, errno, errbuf); } break; + + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "The new device cannot have a higher alignment requirement " + "than the top-level vdev.")); + (void) zfs_error(hdl, EZFS_BADTARGET, errbuf); + break; default: (void) zpool_standard_error(hdl, errno, errbuf); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cafc7196c354..99a8d107ecab 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7602,8 +7602,10 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * The new device cannot have a higher alignment requirement * than the top-level vdev. */ - if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) - return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { + return (spa_vdev_exit(spa, newrootvd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } /* * RAIDZ-expansion-specific checks. From f2f4ada240e3560faa721082c4a62dcb0cce0115 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 15 Aug 2024 14:00:18 -0700 Subject: [PATCH 14/65] Linux 6.10 compat: fix rpm-kmod and builtin The 6.10 kernel broke our rpm-kmod builds. The 6.10 kernel really wants the source files in the same directory as the object files. This workaround makes rpm-kmod work again. It also updates the builtin kernel codepath to work correctly with 6.10. 
See kernel commits: b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source directory 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern rules Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16439 Closes #16450 --- module/Kbuild.in | 4 ++-- rpm/generic/zfs-kmod.spec.in | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/module/Kbuild.in b/module/Kbuild.in index 4f48cb9da0c1..57682214dfd6 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -16,8 +16,8 @@ src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs -icp_include = $(srctree)/$(src)/icp/include -zstd_include = $(srctree)/$(src)/zstd/include +icp_include = $(src)/icp/include +zstd_include = $(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 4cc075585d4b..30524474d1ac 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -145,6 +145,24 @@ for kernel_version in %{?kernel_versions}; do %{?kernel_cc} \ %{?kernel_ld} \ %{?kernel_llvm} + + # Pre-6.10 kernel builds didn't need to copy over the source files to the + # build directory. However we do need to do it though post-6.10 due to + # these commits: + # + # b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source + # directory + # + # 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern + # rules + # + # Note that kmodtool actually copies over the source into the build + # directory, so what we're doing here is normal. For efficiency reasons + # though we just use hardlinks instead of copying. + # + # See https://github.com/openzfs/zfs/issues/16439 for more info. + cp -lR ../%{module}-%{version}/module/* module/ + make %{?_smp_mflags} cd .. done From fb432660c3691b2ac4a4cc462b9789e593c5ac29 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 15 Aug 2024 14:05:58 -0700 Subject: [PATCH 15/65] Linux 6.10 compat: Fix zvol NULL pointer deference zvol_alloc_non_blk_mq()->blk_queue_set_write_cache() needs the disk queue setup to prevent a NULL pointer deference. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16453 --- module/os/linux/zfs/zvol_os.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 2beec6436bff..5daf00c647cb 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1261,13 +1261,14 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) return (1); } + zso->zvo_disk = disk; + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; + #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(zso->zvo_queue, B_TRUE); #endif - zso->zvo_disk = disk; - zso->zvo_disk->minors = ZVOL_MINORS; - zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) From 5807de90a14127ee78de45c85c2e010841759536 Mon Sep 17 00:00:00 2001 From: Justin Gottula Date: Thu, 15 Aug 2024 14:13:18 -0700 Subject: [PATCH 16/65] Fix null ptr deref when renaming a zvol with snaps and snapdev=visible (#16316) If a zvol is renamed, and it has one or more snapshots, and snapdev=visible is true for the zvol, then the rename causes a kernel null pointer dereference error. 
This has the effect (on Linux, anyway) of killing the z_zvol taskq kthread, with locks still held; which in turn causes a variety of zvol-related operations afterward to hang indefinitely (such as udev workers, among other things). The problem occurs because of an oversight in #15486 (e36ff84c338d2f7b15aef2538f6a9507115bbf4a). As documented in dataset_kstats_create, some datasets may not actually have kstats allocated for them; and at least at the present time, this is true for snapshots. In practical terms, this means that for snapshots, dk->dk_kstats will be NULL. The dataset_kstats_rename function introduced in the patch above does not first check whether dk->dk_kstats is NULL before proceeding, unlike e.g. the nearby dataset_kstats_update_* functions. In the very particular circumstance in which a zvol is renamed, AND that zvol has one or more snapshots, AND that zvol also has snapdev=visible, zvol_rename_minors_impl will loop over not just the zvol dataset itself, but each of the zvol's snapshots as well, so that their device nodes will be renamed as well. This results in dataset_kstats_create being called for snapshots, where, as we've established, dk->dk_kstats is NULL. Fix this by simply adding a NULL check before doing anything in dataset_kstats_rename. This still allows the dataset_name kstat value for the zvol to be updated (as was the intent of the original patch), and merely blocks attempts by the code to act upon the zvol's non-kstat-having snapshots. If at some future time, kstats are added for snapshots, then things should work as intended in that case as well. Signed-off-by: Justin Gottula Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reviewed-by: Alan Somers Reviewed-by: Allan Jude Reviewed-by: Tony Hutter --- module/zfs/dataset_kstats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 8faa6c2a2528..914260e742f9 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -204,6 +204,9 @@ dataset_kstats_destroy(dataset_kstats_t *dk) void dataset_kstats_rename(dataset_kstats_t *dk, const char *name) { + if (dk->dk_kstats == NULL) + return; + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; char *ds_name; From bdf4d6be1de870b16d4f7997b235d9f19dd7e30e Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 16 Aug 2024 02:29:50 +0500 Subject: [PATCH 17/65] linux/zvol_os: fix zvol queue limits initialization zvol queue limits initialization depends on `zv_volblocksize`, but it is initialized later, leading to several limits being initialized with incorrect values, including `max_discard_*` limits. This also causes `blkdiscard` command to consistently fail, as `blk_ioctl_discard` reads `bdev_max_discard_sectors()` limits as 0, leading to failure. The fix is straightforward: initialize `zv->zv_volblocksize` early, before setting the queue limits. This PR should fix `zvol/zvol_misc/zvol_misc_trim` failure on recent PRs, as the test case issues `blkdiscard` for a zvol. Additionally, `zvol_misc_trim` was recently enabled in `6c7d41a`, which is why the issue wasn't identified earlier. 
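Spelling out the failure chain (paraphrasing the above, not new behaviour): with
zv_volblocksize still 0 when zvol_queue_limits_init() runs, zql_max_discard_sectors
computes to (zvol_max_discard_blocks * 0) >> 9 == 0, bdev_max_discard_sectors() then
reports 0 on the resulting device, and the discard ioctl is rejected. Hence the
consistent blkdiscard failures. Setting zv_volblocksize in zvol_alloc(), before the
limits are derived from it, closes that window.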
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Ameer Hamza Closes #16454 --- module/os/linux/zfs/zvol_os.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 5daf00c647cb..e04f64e232a6 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1362,7 +1362,7 @@ zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) * request queue and generic disk structures for the block device. */ static zvol_state_t * -zvol_alloc(dev_t dev, const char *name) +zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) { zvol_state_t *zv; struct zvol_state_os *zso; @@ -1382,6 +1382,7 @@ zvol_alloc(dev_t dev, const char *name) zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; + zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1671,7 +1672,8 @@ zvol_os_create_minor(const char *name) if (error) goto out_dmu_objset_disown; - zv = zvol_alloc(MKDEV(zvol_major, minor), name); + zv = zvol_alloc(MKDEV(zvol_major, minor), name, + doi->doi_data_block_size); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; @@ -1681,7 +1683,6 @@ zvol_os_create_minor(const char *name) if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; - zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; From db2b1fdb796619823b22b4882ebe0c09db5fa05f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 20 Jun 2023 12:06:13 +1000 Subject: [PATCH 18/65] ddt: add FDT feature and support for legacy and new on-disk formats This is the supporting infrastructure for the upcoming dedup features. Traditionally, dedup objects live directly in the MOS root. While their details vary (checksum, type and class), they are all the same "kind" of thing - a store of dedup entries. The new features are more varied than that, and are better thought of as a set of related stores for the overall state of a dedup table. This adds a new feature flag, SPA_FEATURE_FAST_DEDUP. Enabling this will cause new DDTs to be created as a ZAP in the MOS root, named DDT-. The is used as the root object for the normal type/class store objects, but will also be a place for any storage required by new features. This commit adds two new fields to ddt_t, for version and flags. These are intended to describe the structure and features of the overall dedup table, and are stored as-is in the DDT root. In this commit, flags are always zero, but the intent is that they can be used to hang optional logic or state onto for new dedup features. Version is always 1. For a "legacy" dedup table, where no DDT root directory exists, the version will be 0. ddt_configure() is expected to determine the version and flags features currently in operation based on whether or not the fast_dedup feature is enabled, and from what's available on disk. In this way, its possible to support both old and new tables. This also provides a migration path. A legacy setup can be upgraded to FDT by creating the DDT root ZAP, moving the existing objects into it, and setting version and flags appropriately. There's no support for that here, but it would be straightforward to add later and allows the possibility that newer features could be applied to existing dedup tables. 
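(A sketch for illustration, not code from this commit: with the directory in place,
reading the config back out is just two ZAP lookups against the names defined in
ddt_impl.h, mirroring the zap_add() calls in ddt_create_dir() below.)

    static int
    sketch_ddt_read_config(ddt_t *ddt)
    {
            int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
                DDT_DIR_VERSION, sizeof (uint64_t), 1, &ddt->ddt_version);
            if (err != 0)
                    return (err);
            return (zap_lookup(ddt->ddt_os, ddt->ddt_dir_object,
                DDT_DIR_FLAGS, sizeof (uint64_t), 1, &ddt->ddt_flags));
    }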
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15892 --- include/sys/ddt.h | 18 +- include/sys/ddt_impl.h | 8 + include/sys/dmu.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 11 +- man/man7/zpool-features.7 | 17 +- module/zcommon/zfeature_common.c | 6 + module/zfs/ddt.c | 260 +++++++++++++++++- module/zfs/zio_compress.c | 4 + .../cli_root/zpool_get/zpool_get.cfg | 1 + 10 files changed, 307 insertions(+), 20 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 66d59cebacde..02d0cf5daab0 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -39,6 +39,12 @@ extern "C" { struct abd; +/* + * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). + */ +/* No flags yet. */ +#define DDT_FLAG_MASK (0) + /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. @@ -185,11 +191,15 @@ typedef struct { avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ - enum zio_checksum ddt_checksum; /* checksum algorithm in use */ - spa_t *ddt_spa; /* pool this ddt is on */ - objset_t *ddt_os; /* ddt objset (always MOS) */ + uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ + uint64_t ddt_version; /* DDT version */ + uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 4aaab10c8737..9c0fea64f389 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -33,6 +33,14 @@ extern "C" { #endif +/* DDT version numbers */ +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Names of interesting objects in the DDT root dir */ +#define DDT_DIR_VERSION "version" +#define DDT_DIR_FLAGS "flags" + /* * Ops vector to access a specific DDT object type. 
*/ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1376cbef763c..5b80dc315945 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -376,6 +376,7 @@ typedef struct dmu_buf { #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321759..5733a8187a95 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_FAST_DEDUP, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 51c8dc9647ee..88baa4168c31 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -616,7 +616,7 @@ - + @@ -6006,7 +6006,8 @@ - + + @@ -9131,8 +9132,8 @@ - - + + @@ -9209,7 +9210,7 @@ - + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6083..ff6e485a4819 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -17,8 +17,9 @@ .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd June 23, 2022 +.Dd February 14, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . +.feature com.klarasystems fast_dedup yes +This feature allows more advanced deduplication features to be enabled on new +dedup tables. +.Pp +This feature will be +.Sy active +when the first deduplicated block is written after a new dedup table is created +(ie after a new pool creation, or new checksum used on a dataset with +.Sy dedup +enabled). +It will be returned to the +.Sy enabled +state when all deduplicated blocks using it are freed. +. .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14cd4..8dec5f27b0af 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -754,6 +754,12 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_FAST_DEDUP, + "com.klarasystems:fast_dedup", "fast_dedup", + "Support for advanced deduplication", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index d70ae1a031d5..7e2010c423c0 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * # DDT: Deduplication tables @@ -185,6 +186,18 @@ static const char *const ddt_class_name[DDT_CLASSES] = { "unique", }; +/* + * DDT feature flags automatically enabled for each on-disk version. Note that + * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. 
+ */ +static const uint64_t ddt_version_flags[] = { + [DDT_VERSION_LEGACY] = 0, + [DDT_VERSION_FDT] = 0, +}; + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -196,14 +209,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); - VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + + VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, + objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), @@ -220,13 +237,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t count; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); - VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); @@ -243,9 +262,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) char name[DDT_NAMELEN]; int error; + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then this object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + return (SET_ERROR(ENOENT)); + } + ddt_object_name(ddt, type, class, name); - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); @@ -684,6 +712,8 @@ ddt_prefetch_all(spa_t *spa) } } +static int ddt_configure(ddt_t *ddt, boolean_t new); + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { @@ -697,6 +727,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { + /* + * This is the first use of this DDT since the pool was + * created; finish getting it ready for use. 
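+		 * ddt_configure() selects the best on-disk format the
+		 * pool's feature flags allow.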
+ */ + VERIFY0(ddt_configure(ddt, B_TRUE)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + } + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -837,6 +876,181 @@ ddt_key_compare(const void *x1, const void *x2) return (TREE_ISIGN(cmp)); } +/* Create the containing dir for this DDT and bump the feature count */ +static void +ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); + + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, + sizeof (uint64_t), 1, &ddt->ddt_version, tx)); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, + sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); + + spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* Destroy the containing dir and deactivate the feature */ +static void +ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, !=, 0); + ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ASSERT(!ddt_object_exists(ddt, type, class)); + } + } + + uint64_t count; + ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); + ASSERT3U(count, ==, 2); + + VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); + + ddt->ddt_dir_object = 0; + + spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* + * Determine, flags and on-disk layout from what's already stored. If there's + * nothing stored, then if new is false, returns ENOENT, and if true, selects + * based on pool config. + */ +static int +ddt_configure(ddt_t *ddt, boolean_t new) +{ + spa_t *spa = ddt->ddt_spa; + char name[DDT_NAMELEN]; + int error; + + ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); + + boolean_t fdt_enabled = + spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); + boolean_t fdt_active = + spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); + + /* + * First, look for the global DDT stats object. If its not there, then + * there's never been a DDT written before ever, and we know we're + * starting from scratch. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + if (error != 0) { + if (error != ENOENT) + return (error); + goto not_found; + } + + if (fdt_active) { + /* + * Now look for a DDT directory. If it exists, then it has + * everything we need. 
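+		 * (Its ZAP holds the version and flags written by
+		 * ddt_create_dir().)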
+ */ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &ddt->ddt_dir_object); + if (error == 0) { + ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION, sizeof (uint64_t), 1, + &ddt->ddt_version); + if (error != 0) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_FLAGS, sizeof (uint64_t), 1, + &ddt->ddt_flags); + if (error != 0) + return (error); + + if (ddt->ddt_version != DDT_VERSION_FDT) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "unknown version %llu", spa_name(spa), + name, (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "version=%llu unknown flags %llx", + spa_name(spa), name, + (u_longlong_t)ddt->ddt_flags, + (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + if (error != ENOENT) + return (error); + } + + /* Any object in the root indicates a traditional setup. */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_name(ddt, type, class, name); + uint64_t obj; + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), + 1, &obj); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + + return (0); + } + } + +not_found: + if (!new) + return (SET_ERROR(ENOENT)); + + /* Nothing on disk, so set up for the best version we can */ + if (fdt_enabled) { + ddt->ddt_version = DDT_VERSION_FDT; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = 0; /* create on first use */ + } else { + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + } + + return (0); +} + static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { @@ -853,6 +1067,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; return (ddt); } @@ -889,7 +1104,6 @@ ddt_load(spa_t *spa) error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); - if (error) return (error == ENOENT ? 
0 : error); @@ -898,6 +1112,12 @@ ddt_load(spa_t *spa) continue; ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_configure(ddt, B_FALSE); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -912,10 +1132,11 @@ ddt_load(spa_t *spa) */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; } + spa->spa_dedup_dspace = ~0ULL; + spa->spa_dedup_dsize = ~0ULL; + return (0); } @@ -1147,25 +1368,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) DMU_POOL_DDT_STATS, tx); } + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); ddt_free(dde); } + uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; + uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); - count += add; + tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) + if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } + count += tcount; + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. + */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index e12d5498ccda..c3bceababa38 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -48,6 +48,10 @@ static unsigned long zio_decompress_fail_fraction = 0; /* * Compression vectors. + * + * NOTE: DO NOT CHANGE THE NAMES OF THESE COMPRESSION FUNCTIONS. + * THEY ARE USED AS ZAP KEY NAMES BY FAST DEDUP AND THEREFORE + * PART OF THE ON-DISK FORMAT. */ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {"inherit", 0, NULL, NULL, NULL}, diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e8a94ce209bc..50c1b7a9d09e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -109,5 +109,6 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@fast_dedup" ) fi From 2b131d734577bf489c86fdb9dbb63460a5675613 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 13 Jun 2024 14:50:33 +1000 Subject: [PATCH 19/65] ZTS: tests for dedup legacy/FDT tables Very basic coverage to make sure things appear to work, have the right format on disk, and pool upgrades and mixed table types work as expected. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
Closes #15892 --- tests/runfiles/common.run | 4 +- tests/zfs-tests/tests/Makefile.am | 6 + .../functional/dedup/dedup_fdt_create.ksh | 99 ++++++++++++++ .../functional/dedup/dedup_fdt_import.ksh | 112 ++++++++++++++++ .../functional/dedup/dedup_legacy_create.ksh | 95 ++++++++++++++ .../dedup/dedup_legacy_fdt_mixed.ksh | 97 ++++++++++++++ .../dedup/dedup_legacy_fdt_upgrade.ksh | 122 ++++++++++++++++++ .../functional/dedup/dedup_legacy_import.ksh | 104 +++++++++++++++ .../tests/functional/dedup/setup.ksh | 4 - 9 files changed, 638 insertions(+), 5 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 326eb2a44d37..ad131664698b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -672,7 +672,9 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_quota'] +tests = ['dedup_legacy_create', 'dedup_fdt_create', 'dedup_fdt_import', + 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', + 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = tags = ['functional', 'dedup'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9dcb097e2b38..bbeabc6dfb42 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1424,6 +1424,12 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ + functional/dedup/dedup_fdt_create.ksh \ + functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_legacy_create.ksh \ + functional/dedup/dedup_legacy_import.ksh \ + functional/dedup/dedup_legacy_fdt_upgrade.ksh \ + functional/dedup/dedup_legacy_fdt_mixed.ksh \ functional/dedup/dedup_quota.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh new file mode 100755 index 000000000000..83c4d7c8e2aa --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -0,0 +1,99 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (FDT) + +. 
$STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (FDT) operations work" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the container object; DDT ZAPs aren't cleaned up until +# the entire logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should move back to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +log_pass "basic dedup (FDT) operations work" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh new file mode 100755 index 000000000000..f0f20671b95d --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. 
+# + +# Ensure dedup retains version after import (FDT) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) retains version after import" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with fast dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# feature still active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should revert to enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; containing object destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should be active again +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with only one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (FDT) retains version after import" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh new file mode 100755 index 000000000000..e3efcf5c8b36 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_create.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Simple test of dedup table operations (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "basic dedup (legacy) operations work" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +log_pass "basic dedup (legacy) operations work" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh new file mode 100755 index 000000000000..049ccaae3dca --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Check legacy dedup table continues to work after pool upgrade to fast_dedup, +# but if deleted and recreated, the new table is FDT + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy and FDT dedup tables on the same pool can happily coexist" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# create two datasets, enabling a different dedup algorithm on each +log_must zfs create -o dedup=skein $TESTPOOL/ds1 +log_must zfs create -o dedup=blake3 $TESTPOOL/ds2 + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-skein" +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-blake3" + +# create a file in the first dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds1/file1 bs=128k count=4 +log_must zpool sync + +# should be four entries in the skein unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-skein-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# create a file in the first dataset +log_must dd if=/dev/urandom of=/$TESTPOOL/ds2/file1 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# now also four entries in the blake3 unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-blake3-zap-unique: 4 entries'" + +# two entries in the MOS: the legacy skein DDT ZAP, and the containing dir for +# the blake3 FDT table +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-.*-zap- | wc -l) -eq 1 +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | wc -l) -eq 1 + +# containing object has one ZAP inside +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-blake3 | awk '{ print $NF }') +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-.*-zap- | wc -l) -eq 1 + +log_pass "legacy and FDT dedup tables on the same pool can happily coexist" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh new file mode 100755 index 000000000000..d563fade88af --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -0,0 +1,122 @@ +#!/bin/ksh -p +# CDDL HEADER START +# 
+# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. +# + +# Check legacy dedup table continues to work after pool upgrade to fast_dedup, +# but if deleted and recreated, the new table is FDT + +. $STF_SUITE/include/libtest.shlib + +log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. 
this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# enable the fast_dedup feature +log_must zpool set feature@fast_dedup=enabled $TESTPOOL + +# confirm the feature is now enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# copy the file +log_must cp /$TESTPOOL/file1 /$TESTPOOL/file2 +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# now four entries in the duplicate table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-duplicate: 4 entries'" + +# now two DDT ZAPs in the MOS; DDT ZAPs aren't cleaned up until the entire +# logical table is destroyed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 2 + +# remove the files +log_must rm -f /$TESTPOOL/file* +log_must zpool sync + +# feature should still be enabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "enabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file3 bs=128k count=4 +log_must zpool sync + +# feature should now be active +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "active" + +# four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# single containing object in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | wc -l) -eq 1 +obj=$(zdb -dddd $TESTPOOL 1 | grep DDT-sha256 | awk '{ print $NF }') + +# with one ZAP inside +log_must test $(zdb -dddd $TESTPOOL $obj | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "legacy dedup tables work after upgrade; new dedup tables created as FDT" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh new file mode 100755 index 000000000000..a7b667eaf882 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_import.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 Klara, Inc. 
+# + +# Ensure dedup retains version after import (legacy) + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (legacy) retains version after import" + +function cleanup +{ + destroy_pool $TESTPOOL +} + +log_onexit cleanup + +# create a pool with legacy dedup enabled. we disable block cloning to ensure +# it doesn't get in the way of dedup, and we disable compression so our writes +# create predictable results on disk +# Use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zpool create -f \ + -o feature@fast_dedup=disabled \ + -O dedup=on \ + -o feature@block_cloning=disabled \ + -O compression=off \ + -O xattr=sa \ + $TESTPOOL $DISKS + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# confirm there's no DDT keys in the MOS root +log_mustnot eval "zdb -dddd $TESTPOOL 1 | grep -q DDT-sha256" + +# create a file. this is four full blocks, so will produce four entries in the +# dedup table +log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +# export and import the pool +zpool export $TESTPOOL +zpool import $TESTPOOL + +# confirm the feature is disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# remove the file +log_must rm -f /$TESTPOOL/file1 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# all DDTs empty +log_must eval "zdb -D $TESTPOOL | grep -q 'All DDTs are empty'" + +# logical table now destroyed; all DDT ZAPs removed +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 0 + +# create a new file +log_must dd if=/dev/urandom of=/$TESTPOOL/file2 bs=128k count=4 +log_must zpool sync + +# feature should still be disabled +log_must test $(get_pool_prop feature@fast_dedup $TESTPOOL) = "disabled" + +# should be four entries in the unique table +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique: 4 entries'" + +# should be just one DDT ZAP in the MOS +log_must test $(zdb -dddd $TESTPOOL 1 | grep DDT-sha256-zap- | wc -l) -eq 1 + +log_pass "dedup (legacy) retains version after import" diff --git a/tests/zfs-tests/tests/functional/dedup/setup.ksh b/tests/zfs-tests/tests/functional/dedup/setup.ksh index 3c0830401f81..a21238879faf 100755 --- a/tests/zfs-tests/tests/functional/dedup/setup.ksh +++ b/tests/zfs-tests/tests/functional/dedup/setup.ksh @@ -25,7 +25,3 @@ # . $STF_SUITE/include/libtest.shlib - -DISK=${DISKS%% *} - -default_setup $DISK From d63f5d7e50b65c76d9a8b79db0b66ebb6a49742c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jun 2024 14:11:11 +1000 Subject: [PATCH 20/65] zdb: rework DDT block count and leak check to just count the blocks The upcoming dedup features break the long held assumption that all blocks on disk with a 'D' dedup bit will always be present in the DDT, or will have the same set of DVA allocations on disk as in the DDT. 
If the DDT is no longer a complete picture of all the dedup blocks that will be and should be on disk, then it does us no good to walk and prime it up front, since it won't necessarily match up with every block we'll see anyway. Instead, we rework things here to be more like the BRT checks. When we see a dedup'd block, we look it up in the DDT, consume a refcount, and for the second-or-later instances, count them as duplicates. The DDT and BRT are moved ahead of the space accounting. This will become important for the "flat" feature, which may need to count a modified version of the block. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Co-authored-by: Don Brady Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15892 --- cmd/zdb/zdb.c | 315 ++++++++++++++++++++++++++++------------------ include/sys/ddt.h | 2 +- module/zfs/ddt.c | 8 +- module/zfs/zio.c | 4 +- 4 files changed, 200 insertions(+), 129 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index dec70c60cec1..fcf0e4779788 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,7 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. * Copyright (c) 2023, Rob Norris */ @@ -3287,9 +3287,46 @@ fuid_table_destroy(void) } } +/* + * Clean up DDT internal state. ddt_lookup() adds entries to ddt_tree, which on + * a live pool are normally cleaned up during ddt_sync(). We can't do that (and + * wouldn't want to anyway), but if we don't clean up the presence of stuff on + * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. + * + * Note that this is not a particularly efficient way to do this, but + * ddt_remove() is the only public method that can do the work we need, and it + * requires the right locks and etc to do the job. This is only ever called + * during zdb shutdown so efficiency is not especially important. + */ +static void +zdb_ddt_cleanup(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + ddt_enter(ddt); + ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; + while (dde) { + next = AVL_NEXT(&ddt->ddt_tree, dde); + memset(&dde->dde_lead_zio, 0, + sizeof (dde->dde_lead_zio)); + ddt_remove(ddt, dde); + dde = next; + } + ddt_exit(ddt); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } +} + static void zdb_exit(int reason) { + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { @@ -5633,7 +5670,6 @@ static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { - uint64_t refcnt = 0; int i; ASSERT(type < ZDB_OT_TOTAL); @@ -5641,8 +5677,144 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; + /* + * This flag controls if we will issue a claim for the block while + * counting it, to ensure that all blocks are referenced in space maps. + * We don't issue claims if we're not doing leak tracking, because it's + * expensive if the user isn't interested. We also don't claim the + * second or later occurences of cloned or dedup'd blocks, because we + * already claimed them the first time. 
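+	 * (The dedup and clone branches below clear do_claim for second and
+	 * later occurrences.)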
+ */ + boolean_t do_claim = !dump_opt['L']; + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + if (BP_GET_DEDUP(bp)) { + /* + * Dedup'd blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * We use the existing dedup system to track what we've seen. + * The first time we see a block, we do a ddt_lookup() to see + * if it exists in the DDT. If we're doing leak tracking, we + * claim the block at this time. + * + * Each time we see a block, we reduce the refcount in the + * entry by one, and add to the size and count of dedup'd + * blocks to report at the end. + */ + + ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); + + ddt_enter(ddt); + + /* + * Find the block. This will create the entry in memory, but + * we'll know if that happened by its refcount. + */ + ddt_entry_t *dde = ddt_lookup(ddt, bp); + + /* + * ddt_lookup() can only return NULL if this block didn't exist + * in the DDT and creating it would take the DDT over its + * quota. Since we got the block from disk, it must exist in + * the DDT, so this can't happen. + */ + VERIFY3P(dde, !=, NULL); + + /* Get the phys for this variant */ + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + VERIFY3P(ddp, !=, NULL); + + /* + * This entry may have multiple sets of DVAs. We must claim + * each set the first time we see them in a real block on disk, + * or count them on subsequent occurences. We don't have a + * convenient way to track the first time we see each variant, + * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We + * can do this safely in zdb because it never writes, so it + * will never have a writing zio for this block in that + * pointer. + */ + + /* + * Work out which dde_phys index was used, get the seen flag, + * and update it if necessary. + */ + uint_t idx = + ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) / + sizeof (ddt_phys_t); + VERIFY3P(ddp, ==, &dde->dde_phys[idx]); + boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx]; + if (!seen) + dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; + + /* Consume a reference for this block. */ + VERIFY3U(ddt_phys_total_refcnt(dde), >, 0); + ddt_phys_decref(ddp); + + if (seen) { + /* + * The second or later time we see this block, + * it's a duplicate and we count it. + */ + zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); + zcb->zcb_dedup_blocks++; + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + + ddt_exit(ddt); + } else if (zcb->zcb_brt_is_active && + brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if its there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. 
+ */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre == NULL) { + /* Not seen before; track it */ + uint64_t refcnt = + brt_entry_get_refcount(zcb->zcb_spa, bp); + if (refcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = refcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } else { + /* + * Second or later occurrence, count it and take a + * refcount. + */ + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + + /* Already claimed, don't do it again. */ + do_claim = B_FALSE; + } + } + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; @@ -5745,71 +5917,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); - if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { - /* - * Cloned blocks are special. We need to count them, so we can - * later uncount them when reporting leaked space, and we must - * only claim them them once. - * - * To do this, we keep our own in-memory BRT. For each block - * we haven't seen before, we look it up in the real BRT and - * if its there, we note it and its refcount then proceed as - * normal. If we see the block again, we count it as a clone - * and then give it no further consideration. - */ - zdb_brt_entry_t zbre_search, *zbre; - avl_index_t where; - - zbre_search.zbre_dva = bp->blk_dva[0]; - zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); - if (zbre != NULL) { - zcb->zcb_clone_asize += BP_GET_ASIZE(bp); - zcb->zcb_clone_blocks++; - - zbre->zbre_refcount--; - if (zbre->zbre_refcount == 0) { - avl_remove(&zcb->zcb_brt, zbre); - umem_free(zbre, sizeof (zdb_brt_entry_t)); - } - return; - } - - uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); - if (crefcnt > 0) { - zbre = umem_zalloc(sizeof (zdb_brt_entry_t), - UMEM_NOFAIL); - zbre->zbre_dva = bp->blk_dva[0]; - zbre->zbre_refcount = crefcnt; - avl_insert(&zcb->zcb_brt, zbre, where); - } - } - - if (dump_opt['L']) + if (!do_claim) return; - if (BP_GET_DEDUP(bp)) { - ddt_t *ddt; - ddt_entry_t *dde; - - ddt = ddt_select(zcb->zcb_spa, bp); - ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_FALSE); - - if (dde == NULL) { - refcnt = 0; - } else { - ddt_phys_t *ddp = ddt_phys_select(dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; - if (ddt_phys_total_refcnt(dde) == 0) - ddt_remove(ddt, dde); - } - ddt_exit(ddt); - } - - VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, - refcnt ? 
0 : spa_min_claim_txg(zcb->zcb_spa), - bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); + VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, + spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, + ZIO_FLAG_CANFAIL))); } static void @@ -6120,49 +6233,6 @@ zdb_load_obsolete_counts(vdev_t *vd) return (counts); } -static void -zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) -{ - ddt_bookmark_t ddb = {0}; - ddt_entry_t dde; - int error; - int p; - - ASSERT(!dump_opt['L']); - - while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { - blkptr_t blk; - ddt_phys_t *ddp = dde.dde_phys; - - if (ddb.ddb_class == DDT_CLASS_UNIQUE) - return; - - ASSERT(ddt_phys_total_refcnt(&dde) > 1); - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - VERIFY(ddt); - - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0) - continue; - ddt_bp_create(ddb.ddb_checksum, - &dde.dde_key, ddp, &blk); - if (p == DDT_PHYS_DITTO) { - zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); - } else { - zcb->zcb_dedup_asize += - BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); - zcb->zcb_dedup_blocks++; - } - } - - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } - - ASSERT(error == ENOENT); -} - typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; @@ -6546,10 +6616,6 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } - - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - zdb_ddt_leak_init(spa, zcb); - spa_config_exit(spa, SCL_CONFIG, FTAG); } static boolean_t @@ -6814,6 +6880,8 @@ dump_block_stats(spa_t *spa) int e, c, err; bp_embedded_type_t i; + ddt_prefetch_all(spa); + zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { @@ -6938,7 +7006,6 @@ dump_block_stats(spa_t *spa) (u_longlong_t)total_alloc, (dump_opt['L']) ? 
"unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); - leaks = B_TRUE; } if (tzb->zb_count == 0) { @@ -8022,16 +8089,21 @@ dump_mos_leaks(spa_t *spa) mos_leak_vdev(spa->spa_root_vdev); - for (uint64_t class = 0; class < DDT_CLASSES; class++) { - for (uint64_t type = 0; type < DDT_TYPES; type++) { - for (uint64_t cksum = 0; - cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { - ddt_t *ddt = spa->spa_ddt[cksum]; - if (!ddt) - continue; + for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + /* DDT store objects */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } + + /* FDT container */ + mos_obj_refd(ddt->ddt_dir_object); } if (spa->spa_brt != NULL) { @@ -9624,6 +9696,9 @@ main(int argc, char **argv) } fini: + if (spa != NULL) + zdb_ddt_cleanup(spa); + if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 02d0cf5daab0..20bae8ce0fcb 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -253,7 +253,7 @@ extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); -extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_prefetch_all(spa_t *spa); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 7e2010c423c0..84d7800cbc73 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -715,7 +715,7 @@ ddt_prefetch_all(spa_t *spa) static int ddt_configure(ddt_t *ddt, boolean_t new); ddt_entry_t * -ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; @@ -767,10 +767,6 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (dde); } - /* Not found. */ - if (!add) - return (NULL); - /* Time to make a new entry. */ dde = ddt_alloc(&search); avl_insert(&ddt->ddt_tree, dde, where); @@ -1502,7 +1498,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); /* Can be NULL if the entry for this block was pruned. 
*/ if (dde == NULL) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 6d08d4bd1633..5810e811a39d 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3518,7 +3518,7 @@ zio_ddt_write(zio_t *zio) ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); ddt_enter(ddt); - dde = ddt_lookup(ddt, bp, B_TRUE); + dde = ddt_lookup(ddt, bp); if (dde == NULL) { /* DDT size is over its quota so no new entries */ zp->zp_dedup = B_FALSE; @@ -3598,7 +3598,7 @@ zio_ddt_free(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ddt_enter(ddt); - freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + freedde = dde = ddt_lookup(ddt, bp); if (dde) { ddp = ddt_phys_select(dde, bp); if (ddp) From d17ab631a9142b81b100d87f0619f5e59bc211ac Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 15:16:02 +1000 Subject: [PATCH 21/65] ddt: rework access to phys array slots The "flat phys" feature will use only a single phys slot for all entries, which means the old "single", "double" etc naming now makes no sense, and more importantly, means that choosing the right slot for a given block pointer will depend on how many slots are in use for a given DDT. This removes the old names, and adds accessor macros to decouple specific phys array indexes from any particular meaning. (These macros look strange in isolation, mainly in the way they take the ddt_t* as an arg but don't use it. This is mostly a separate commit to introduce the concept to the reader before the "flat phys" commit extends it). Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15893 --- cmd/zdb/zdb.c | 13 +++++----- include/sys/ddt.h | 27 ++++++++------------- include/sys/ddt_impl.h | 2 +- module/zfs/ddt.c | 54 ++++++++++++++++++++++++------------------ module/zfs/ddt_stats.c | 5 ++-- module/zfs/ddt_zap.c | 1 + module/zfs/dsl_scan.c | 6 +++-- module/zfs/zio.c | 36 ++++++++++++++++++---------- 8 files changed, 79 insertions(+), 65 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index fcf0e4779788..7a6459b756b2 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1916,21 +1916,20 @@ dump_log_spacemaps(spa_t *spa) static void dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) { - const ddt_phys_t *ddp = dde->dde_phys; const ddt_key_t *ddk = &dde->dde_key; - const char *types[4] = { "ditto", "single", "double", "triple" }; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; - for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); - (void) printf("index %llx refcnt %llu %s %s\n", + (void) printf("index %llx refcnt %llu phys %d %s\n", (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, - types[p], blkbuf); + p, blkbuf); } } @@ -5724,7 +5723,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, VERIFY3P(dde, !=, NULL); /* Get the phys for this variant */ - ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp); VERIFY3P(ddp, !=, NULL); /* @@ -5751,7 +5750,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; /* Consume a reference for this block. 
*/ - VERIFY3U(ddt_phys_total_refcnt(dde), >, 0); + VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); ddt_phys_decref(ddp); if (seen) { diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 20bae8ce0fcb..a2e069f13922 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -137,19 +137,10 @@ typedef struct { uint64_t ddp_phys_birth; } ddt_phys_t; -/* - * Named indexes into the ddt_phys_t array in each entry. - * - * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, - * we maintain the ability to free existing dedup-ditto blocks. - */ -enum ddt_phys_type { - DDT_PHYS_DITTO = 0, - DDT_PHYS_SINGLE = 1, - DDT_PHYS_DOUBLE = 2, - DDT_PHYS_TRIPLE = 3, - DDT_PHYS_TYPES -}; +#define DDT_PHYS_MAX (4) +#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX) +#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0) +#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -162,11 +153,11 @@ enum ddt_phys_type { typedef struct { /* key must be first for ddt_key_compare */ - ddt_key_t dde_key; /* ddt_tree key */ - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ + ddt_key_t dde_key; /* ddt_tree key */ + ddt_phys_t dde_phys[DDT_PHYS_MAX]; /* on-disk data */ /* in-flight update IOs */ - zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_MAX]; /* copy of data after a repair read, to be rewritten */ struct abd *dde_repair_abd; @@ -234,7 +225,8 @@ extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); extern void ddt_phys_clear(ddt_phys_t *ddp); extern void ddt_phys_addref(ddt_phys_t *ddp); extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, + const blkptr_t *bp); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); @@ -249,6 +241,7 @@ extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern ddt_t *ddt_select_checksum(spa_t *spa, enum zio_checksum checksum); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 9c0fea64f389..e97b71621c37 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -82,7 +82,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 84d7800cbc73..9bb0b8f15fca 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -540,11 +540,10 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) } ddt_phys_t * -ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { - ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p]; if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && BP_GET_BIRTH(bp) == 
ddp->ddp_phys_birth) return (ddp); @@ -553,12 +552,15 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) } uint64_t -ddt_phys_total_refcnt(const ddt_entry_t *dde) +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) { uint64_t refcnt = 0; - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + if (DDT_PHYS_IS_DITTO(ddt, p)) + continue; refcnt += dde->dde_phys[p].ddp_refcnt; + } return (refcnt); } @@ -570,6 +572,12 @@ ddt_select(spa_t *spa, const blkptr_t *bp) return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } +ddt_t * +ddt_select_checksum(spa_t *spa, enum zio_checksum checksum) +{ + return (spa->spa_ddt[checksum]); +} + void ddt_enter(ddt_t *ddt) { @@ -613,9 +621,9 @@ ddt_alloc(const ddt_key_t *ddk) } static void -ddt_free(ddt_entry_t *dde) +ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { - for (int p = 0; p < DDT_PHYS_TYPES; p++) + for (int p = 0; p < DDT_NPHYS(ddt); p++) ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (dde->dde_repair_abd != NULL) @@ -631,7 +639,7 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } static boolean_t @@ -759,7 +767,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); } return (NULL); } @@ -805,7 +813,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Over quota. If no one is waiting, clean up right now. */ if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); - ddt_free(dde); + ddt_free(ddt, dde); return (NULL); } @@ -1212,7 +1220,7 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else - ddt_free(dde); + ddt_free(ddt, dde); ddt_exit(ddt); } @@ -1220,16 +1228,15 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) static void ddt_repair_entry_done(zio_t *zio) { + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *rdde = zio->io_private; - ddt_free(rdde); + ddt_free(ddt, rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *rddp = rdde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; @@ -1238,7 +1245,9 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_t *rddp = &rdde->dde_phys[p]; if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth != rddp->ddp_phys_birth || memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) @@ -1281,7 +1290,6 @@ static void ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; ddt_type_t otype = dde->dde_type; ddt_type_t ntype = DDT_TYPE_DEFAULT; @@ -1291,13 +1299,14 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ASSERT(dde->dde_flags & DDE_FLAG_LOADED); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { ASSERT3P(dde->dde_lead_zio[p], ==, NULL); + ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth == 0) { 
ASSERT0(ddp->ddp_refcnt); continue; } - if (p == DDT_PHYS_DITTO) { + if (DDT_PHYS_IS_DITTO(ddt, p)) { /* * Note, we no longer create DDT-DITTO blocks, but we * don't want to leak any written by older software. @@ -1310,8 +1319,6 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) total_refcnt += ddp->ddp_refcnt; } - /* We do not create new DDT-DITTO blocks. */ - ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); if (total_refcnt > 1) nclass = DDT_CLASS_DUPLICATE; else @@ -1369,7 +1376,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(dde); + ddt_free(ddt, dde); } uint64_t count = 0; @@ -1512,7 +1519,8 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ASSERT3S(dde->dde_class, <, DDT_CLASSES); - ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; + int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); + ddp = &dde->dde_phys[p]; /* * This entry already existed (dde_type is real), so it must diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 82b682019ae9..5449eca3afb1 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -36,14 +36,15 @@ static void ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; uint64_t lsize = DDK_GET_LSIZE(ddk); uint64_t psize = DDK_GET_PSIZE(ddk); memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + uint64_t dsize = 0; uint64_t refcnt = ddp->ddp_refcnt; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 7ce7461a2b25..8f1bbeeecd8d 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. 
*/ #include diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 085cfd3c5691..737ee4f6600c 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2933,7 +2933,6 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, { (void) tx; const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; @@ -2954,7 +2953,10 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ddt_t *ddt = ddt_select_checksum(tx->tx_pool->dp_spa, checksum); + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5810e811a39d..914f83fb9f9b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3254,12 +3254,14 @@ static void zio_ddt_child_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; + ddt_t *ddt; ddt_entry_t *dde = zio->io_private; ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); - ddp = ddt_phys_select(dde, bp); + ddt = ddt_select(zio->io_spa, bp); + ddp = ddt_phys_select(ddt, dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ @@ -3282,8 +3284,7 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp = dde->dde_phys; - ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp); blkptr_t blk; ASSERT(zio->io_vsd == NULL); @@ -3292,7 +3293,8 @@ zio_ddt_read_start(zio_t *zio) if (ddp_self == NULL) return (zio); - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) continue; ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, @@ -3372,7 +3374,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) * loaded). 
*/ - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + if (DDT_PHYS_IS_DITTO(ddt, p)) + continue; + zio_t *lio = dde->dde_lead_zio[p]; if (lio != NULL && do_raw) { @@ -3384,7 +3389,10 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } } - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + if (DDT_PHYS_IS_DITTO(ddt, p)) + continue; + ddt_phys_t *ddp = &dde->dde_phys[p]; if (ddp->ddp_phys_birth != 0 && do_raw) { @@ -3452,15 +3460,16 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) static void zio_ddt_child_write_ready(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp = &dde->dde_phys[p]; zio_t *pio; if (zio->io_error) return; + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_enter(ddt); ASSERT(dde->dde_lead_zio[p] == zio); @@ -3477,9 +3486,10 @@ zio_ddt_child_write_ready(zio_t *zio) static void zio_ddt_child_write_done(zio_t *zio) { - int p = zio->io_prop.zp_copies; ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; + + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); ddt_phys_t *ddp = &dde->dde_phys[p]; ddt_enter(ddt); @@ -3506,11 +3516,9 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - int p = zp->zp_copies; zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; - ddt_phys_t *ddp; ASSERT(BP_GET_DEDUP(bp)); ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); @@ -3528,7 +3536,9 @@ zio_ddt_write(zio_t *zio) ddt_exit(ddt); return (zio); } - ddp = &dde->dde_phys[p]; + + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_t *ddp = &dde->dde_phys[p]; if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* @@ -3600,7 +3610,7 @@ zio_ddt_free(zio_t *zio) ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp); if (dde) { - ddp = ddt_phys_select(dde, bp); + ddp = ddt_phys_select(ddt, dde, bp); if (ddp) ddt_phys_decref(ddp); } From 4d686c3da53db5e5f3f3cc52060d9fbca2baf092 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 22:16:04 +1000 Subject: [PATCH 22/65] ddt: introduce lightweight entry The idea here is that sometimes you need the contents of an entry with no intent to modify it, and/or from a place where its difficult to get hold of its originating ddt_t to know how to interpret it. A lightweight entry contains everything you might need to "read" an entry - its key, type and phys contents - but none of the extras for modifying it or using it in a larger context. It also has the full complement of phys slots, so it can represent any kind of dedup entry without having to know the specific configuration of the table it came from. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
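For illustration, the new struct (defined in include/sys/ddt.h in the diff below) can be consumed without any reference back to the originating table, because the copy carries its own phys slot count. A minimal sketch, assuming the in-tree definitions introduced by this patch; the helper name is hypothetical and not part of the series:

static int
ddlwe_count_allocated_phys(const ddt_lightweight_entry_t *ddlwe)
{
	int n = 0;

	/* ddlwe_nphys was recorded when the lightweight entry was filled */
	for (int p = 0; p < ddlwe->ddlwe_nphys; p++) {
		/* a phys slot is in use once it has a birth txg */
		if (ddlwe->ddlwe_phys[p].ddp_phys_birth != 0)
			n++;
	}
	return (n);
}

This is the same pattern dsl_scan_ddt_entry() switches to in this patch: iterate up to ddlwe_nphys and skip slots whose birth txg is zero.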
Closes #15893 --- cmd/zdb/zdb.c | 15 ++++++++------- include/sys/ddt.h | 16 ++++++++++++++-- include/sys/ddt_impl.h | 13 ++++++++++++- include/sys/dsl_scan.h | 2 +- module/zfs/ddt.c | 31 ++++++++++++++++--------------- module/zfs/dsl_scan.c | 15 +++++++-------- 6 files changed, 58 insertions(+), 34 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 7a6459b756b2..3bde5736c0fa 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1914,15 +1914,16 @@ dump_log_spacemaps(spa_t *spa) } static void -dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + uint64_t index) { - const ddt_key_t *ddk = &dde->dde_key; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; - for (p = 0; p < DDT_NPHYS(ddt); p++) { - const ddt_phys_t *ddp = &dde->dde_phys[p]; + for (p = 0; p < ddlwe->ddlwe_nphys; p++) { + const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; if (ddp->ddp_phys_birth == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); @@ -1959,7 +1960,7 @@ static void dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; - ddt_entry_t dde; + ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; @@ -2000,8 +2001,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("%s contents:\n\n", name); - while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) - dump_dde(ddt, &dde, walk); + while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) + dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index a2e069f13922..7a0916690909 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -173,6 +173,18 @@ typedef struct { avl_node_t dde_node; /* ddt_tree node */ } ddt_entry_t; +/* + * A lightweight entry is for short-lived or transient uses, like iterating or + * inspecting, when you don't care where it came from. + */ +typedef struct { + ddt_key_t ddlwe_key; + ddt_type_t ddlwe_type; + ddt_class_t ddlwe_class; + uint8_t ddlwe_nphys; + ddt_phys_t ddlwe_phys[DDT_PHYS_MAX]; +} ddt_lightweight_entry_t; + /* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. @@ -241,7 +253,6 @@ extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); -extern ddt_t *ddt_select_checksum(spa_t *spa, enum zio_checksum checksum); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); @@ -263,7 +274,8 @@ extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); -extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, + ddt_lightweight_entry_t *ddlwe); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index e97b71621c37..e88a046ab8ae 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -41,6 +41,17 @@ extern "C" { #define DDT_DIR_VERSION "version" #define DDT_DIR_FLAGS "flags" +/* Fill a lightweight entry from a live entry. 
*/ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + (ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \ + for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \ + (ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \ +} while (0) + /* * Ops vector to access a specific DDT object type. */ @@ -91,7 +102,7 @@ extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, - uint64_t *walk, ddt_entry_t *dde); + uint64_t *walk, ddt_lightweight_entry_t *ddlwe); extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, uint64_t *count); extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index f32f59a2bedf..b91d7f4be88f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx); + ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 9bb0b8f15fca..aac2250bf30c 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -401,13 +401,20 @@ ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - uint64_t *walk, ddt_entry_t *dde) + uint64_t *walk, ddt_lightweight_entry_t *ddlwe) { ASSERT(ddt_object_exists(ddt, type, class)); - return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], walk, &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, + ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys)); + if (error == 0) { + ddlwe->ddlwe_type = type; + ddlwe->ddlwe_class = class; + ddlwe->ddlwe_nphys = DDT_NPHYS(ddt); + return (0); + } + return (error); } int @@ -572,12 +579,6 @@ ddt_select(spa_t *spa, const blkptr_t *bp) return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } -ddt_t * -ddt_select_checksum(spa_t *spa, enum zio_checksum checksum) -{ - return (spa->spa_ddt[checksum]); -} - void ddt_enter(ddt_t *ddt) { @@ -1347,8 +1348,10 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * traversing.) 
*/ if (nclass < oclass) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, dde, tx); + ddt->ddt_checksum, &ddlwe, tx); } } } @@ -1455,7 +1458,7 @@ ddt_sync(spa_t *spa, uint64_t txg) } int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) { do { do { @@ -1468,10 +1471,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, - &ddb->ddb_cursor, dde); + &ddb->ddb_cursor, ddlwe); } - dde->dde_type = ddb->ddb_type; - dde->dde_class = ddb->ddb_class; if (error == 0) return (0); if (error != ENOENT) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 737ee4f6600c..dec0eb28dc5f 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2929,10 +2929,10 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) + ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; - const ddt_key_t *ddk = &dde->dde_key; + const ddt_key_t *ddk = &ddlwe->ddlwe_key; blkptr_t bp; zbookmark_phys_t zb = { 0 }; @@ -2953,9 +2953,8 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - ddt_t *ddt = ddt_select_checksum(tx->tx_pool->dp_spa, checksum); - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) @@ -3004,11 +3003,11 @@ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; - ddt_entry_t dde = {{{{0}}}}; + ddt_lightweight_entry_t ddlwe = {0}; int error; uint64_t n = 0; - while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { + while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &ddlwe)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) @@ -3023,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) From 0ba5f503c5d644d28429c366fd1cdbd1c6c9b2b9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 19:54:40 +1000 Subject: [PATCH 23/65] ddt: slim down ddt_entry_t This slims down the in-memory entry to as small as it can be. The IO-related parts are made into a separate entry, since they're relatively rarely needed. The variable allocation for dde_phys is to support the upcoming flat format. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
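For illustration, with the IO-related fields split out into ddt_entry_io_t, a caller that needs to track an in-flight write first attaches the side structure and then reaches it through dde_io. A minimal sketch, assuming the in-tree definitions introduced by this patch; the wrapper name is hypothetical, though the body mirrors what zio_ddt_write() does after this change:

static void
ddt_set_lead_zio(ddt_entry_t *dde, int p, zio_t *cio)
{
	/* no-op if dde->dde_io has already been allocated */
	ddt_alloc_entry_io(dde);
	dde->dde_io->dde_lead_zio[p] = cio;
}

Lookup-only paths never call ddt_alloc_entry_io(), so the common case pays only for the slimmed-down entry.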
Closes #15893 --- include/sys/ddt.h | 22 ++++++++++++++++------ module/zfs/ddt.c | 46 +++++++++++++++++++++++++++++++++------------- module/zfs/zio.c | 26 ++++++++++++++------------ 3 files changed, 63 insertions(+), 31 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 7a0916690909..222373c98a08 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -151,16 +151,22 @@ typedef struct { #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +/* + * Additional data to support entry update or repair. This is fixed size + * because its relatively rarely used. + */ typedef struct { - /* key must be first for ddt_key_compare */ - ddt_key_t dde_key; /* ddt_tree key */ - ddt_phys_t dde_phys[DDT_PHYS_MAX]; /* on-disk data */ + /* copy of data after a repair read, to be rewritten */ + abd_t *dde_repair_abd; /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_MAX]; +} ddt_entry_io_t; - /* copy of data after a repair read, to be rewritten */ - struct abd *dde_repair_abd; +typedef struct { + /* key must be first for ddt_key_compare */ + ddt_key_t dde_key; /* ddt_tree key */ + avl_node_t dde_node; /* ddt_tree_node */ /* storage type and class the entry was loaded from */ ddt_type_t dde_type; @@ -170,7 +176,9 @@ typedef struct { kcondvar_t dde_cv; /* signaled when load completes */ uint64_t dde_waiters; /* count of waiters on dde_cv */ - avl_node_t dde_node; /* ddt_tree node */ + ddt_entry_io_t *dde_io; /* IO support, when required */ + + ddt_phys_t dde_phys[]; /* physical data */ } ddt_entry_t; /* @@ -265,6 +273,8 @@ extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); +extern void ddt_alloc_entry_io(ddt_entry_t *dde); + extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index aac2250bf30c..213e042394f8 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -164,6 +164,9 @@ static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; +#define DDT_ENTRY_SIZE \ + (sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX) + /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 
*/ @@ -343,7 +346,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (dde->dde_phys))); + dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt))); } static int @@ -386,7 +389,7 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (dde->dde_phys), tx)); + sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx)); } static int @@ -597,7 +600,7 @@ ddt_init(void) ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void @@ -613,7 +616,7 @@ ddt_alloc(const ddt_key_t *ddk) ddt_entry_t *dde; dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, sizeof (ddt_entry_t)); + memset(dde, 0, DDT_ENTRY_SIZE); cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -621,14 +624,27 @@ ddt_alloc(const ddt_key_t *ddk) return (dde); } +void +ddt_alloc_entry_io(ddt_entry_t *dde) +{ + if (dde->dde_io != NULL) + return; + + dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); +} + static void ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { - for (int p = 0; p < DDT_NPHYS(ddt); p++) - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); + if (dde->dde_io != NULL) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) + ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL); - if (dde->dde_repair_abd != NULL) - abd_free(dde->dde_repair_abd); + if (dde->dde_io->dde_repair_abd != NULL) + abd_free(dde->dde_io->dde_repair_abd); + + kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); + } cv_destroy(&dde->dde_cv); kmem_cache_free(ddt_entry_cache, dde); @@ -1191,6 +1207,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); dde = ddt_alloc(&ddk); + ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -1205,7 +1222,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (dde->dde_phys)); + memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)); return (dde); } @@ -1217,7 +1234,8 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) ddt_enter(ddt); - if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && + if (dde->dde_io->dde_repair_abd != NULL && + spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else @@ -1255,8 +1273,9 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, - rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), + NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); @@ -1301,7 +1320,8 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ASSERT3P(dde->dde_lead_zio[p], ==, NULL); + ASSERT(dde->dde_io == NULL || + dde->dde_io->dde_lead_zio[p] == NULL); ddt_phys_t *ddp = &dde->dde_phys[p]; if 
(ddp->ddp_phys_birth == 0) { ASSERT0(ddp->ddp_refcnt); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 914f83fb9f9b..1ca71c738c8f 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3265,8 +3265,8 @@ zio_ddt_child_read_done(zio_t *zio) if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ - if (zio->io_error == 0 && dde->dde_repair_abd == NULL) - dde->dde_repair_abd = zio->io_abd; + if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) + dde->dde_io->dde_repair_abd = zio->io_abd; else abd_free(zio->io_abd); mutex_exit(&pio->io_lock); @@ -3340,8 +3340,8 @@ zio_ddt_read_done(zio_t *zio) zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (NULL); } - if (dde->dde_repair_abd != NULL) { - abd_copy(zio->io_abd, dde->dde_repair_abd, + if (dde->dde_io->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd, zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } @@ -3378,7 +3378,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (DDT_PHYS_IS_DITTO(ddt, p)) continue; - zio_t *lio = dde->dde_lead_zio[p]; + zio_t *lio = dde->dde_io->dde_lead_zio[p]; if (lio != NULL && do_raw) { return (lio->io_size != zio->io_size || @@ -3472,7 +3472,7 @@ zio_ddt_child_write_ready(zio_t *zio) ddt_enter(ddt); - ASSERT(dde->dde_lead_zio[p] == zio); + ASSERT(dde->dde_io->dde_lead_zio[p] == zio); ddt_phys_fill(ddp, zio->io_bp); @@ -3495,8 +3495,8 @@ zio_ddt_child_write_done(zio_t *zio) ddt_enter(ddt); ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_lead_zio[p] == zio); - dde->dde_lead_zio[p] = NULL; + ASSERT(dde->dde_io->dde_lead_zio[p] == zio); + dde->dde_io->dde_lead_zio[p] = NULL; if (zio->io_error == 0) { zio_link_t *zl = NULL; @@ -3563,11 +3563,13 @@ zio_ddt_write(zio_t *zio) return (zio); } - if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + ddt_alloc_entry_io(dde); + + if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) { if (ddp->ddp_phys_birth != 0) ddt_bp_fill(ddp, bp, txg); - if (dde->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_lead_zio[p]); + if (dde->dde_io->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { @@ -3583,7 +3585,7 @@ zio_ddt_write(zio_t *zio) ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_lead_zio[p] = cio; + dde->dde_io->dde_lead_zio[p] = cio; } ddt_exit(ddt); From f4aeb23f521cb4c5d94b103c926a3cc7b7be8abc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 20 Jun 2023 11:09:48 +1000 Subject: [PATCH 24/65] ddt: add "flat phys" feature Traditional dedup keeps a separate ddt_phys_t "type" for each possible count of DVAs (that is, copies=) parameter. Each of these are tracked independently of each other, and have their own set of DVAs. This leads to an (admittedly rare) situation where you can create as many as six copies of the data, by changing the copies= parameter between copying. This is both a waste of storage on disk, but also a waste of space in the stored DDT entries, since there never needs to be more than three DVAs to handle all possible values of copies=. This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the first ddt_phys_t is used. Each time a block is written with the dedup bit set, this single phys is checked to see if it has enough DVAs to fulfill the request. If it does, the block is filled with the saved DVAs as normal. 
If not, an adjusted write is issued to create as many extra copies as are needed to fulfill the request, which are then saved into the entry too. Because a single phys is no longer an all-or-nothing, but can be transitioning from fewer to more DVAs, the write path now has to keep a copy of the previous "known good" DVA set so we can revert to it in case an error occurs. zio_ddt_write() has been restructured and heavily commented to make it much easier to see what's happening. Backwards compatibility is maintained simply by allocating four ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys selection macros to check the flag. In the old arrangement, each number of copies gets a whole phys, so it will always have either zero or all necessary DVAs filled, with no in-between, so the old behaviour naturally falls out of the new code. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Co-authored-by: Don Brady Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15893 --- cmd/zdb/zdb.c | 68 +++++--- include/sys/ddt.h | 122 ++++++++++--- include/sys/ddt_impl.h | 20 +-- include/sys/dsl_scan.h | 2 +- include/sys/spa.h | 7 +- module/zfs/ddt.c | 328 ++++++++++++++++++++++++++--------- module/zfs/ddt_stats.c | 20 ++- module/zfs/ddt_zap.c | 6 +- module/zfs/dsl_scan.c | 14 +- module/zfs/zio.c | 380 +++++++++++++++++++++++++++++++++-------- 10 files changed, 733 insertions(+), 234 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 3bde5736c0fa..142f55b299e5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, blkptr_t blk; int p; - for (p = 0; p < ddlwe->ddlwe_nphys; p++) { - const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; - if (ddp->ddp_phys_birth == 0) + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu phys %d %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } @@ -3311,8 +3313,7 @@ zdb_ddt_cleanup(spa_t *spa) ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; while (dde) { next = AVL_NEXT(&ddt->ddt_tree, dde); - memset(&dde->dde_lead_zio, 0, - sizeof (dde->dde_lead_zio)); + dde->dde_io = NULL; ddt_remove(ddt, dde); dde = next; } @@ -5689,6 +5690,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + blkptr_t tempbp; if (BP_GET_DEDUP(bp)) { /* * Dedup'd blocks are special. We need to count them, so we can @@ -5724,35 +5726,51 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, VERIFY3P(dde, !=, NULL); /* Get the phys for this variant */ - ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp); - VERIFY3P(ddp, !=, NULL); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* * This entry may have multiple sets of DVAs. We must claim * each set the first time we see them in a real block on disk, * or count them on subsequent occurences. We don't have a * convenient way to track the first time we see each variant, - * so we repurpose dde_lead_zio[] as a per-phys "seen" flag. 
We - * can do this safely in zdb because it never writes, so it - * will never have a writing zio for this block in that - * pointer. - */ - - /* - * Work out which dde_phys index was used, get the seen flag, - * and update it if necessary. + * so we repurpose dde_io as a set of "seen" flag bits. We can + * do this safely in zdb because it never writes, so it will + * never have a writing zio for this block in that pointer. */ - uint_t idx = - ((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) / - sizeof (ddt_phys_t); - VERIFY3P(ddp, ==, &dde->dde_phys[idx]); - boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx]; + boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v)); if (!seen) - dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE; + dde->dde_io = + (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0); - ddt_phys_decref(ddp); + ddt_phys_decref(dde->dde_phys, v); + + /* + * If this entry has a single flat phys, it may have been + * extended with additional DVAs at some time in its life. + * This block might be from before it was fully extended, and + * so have fewer DVAs. + * + * If this is the first time we've seen this block, and we + * claimed it as-is, then we would miss the claim on some + * number of DVAs, which would then be seen as leaked. + * + * In all cases, if we've had fewer DVAs, then the asize would + * be too small, and would lead to the pool apparently using + * more space than allocated. + * + * To handle this, we copy the canonical set of DVAs from the + * entry back to the block pointer before we claim it. + */ + if (v == DDT_PHYS_FLAT) { + ASSERT3U(BP_GET_BIRTH(bp), ==, + ddt_phys_birth(dde->dde_phys, v)); + tempbp = *bp; + ddt_bp_fill(dde->dde_phys, v, &tempbp, + BP_GET_BIRTH(bp)); + bp = &tempbp; + } if (seen) { /* diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 222373c98a08..11e09eef3bcc 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -42,8 +42,8 @@ struct abd; /* * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ -/* No flags yet. */ -#define DDT_FLAG_MASK (0) +#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -126,21 +126,80 @@ typedef struct { * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. * - * Note that an entry has an array of four ddt_phys_t, one for each number of - * DVAs (copies= property) and another for additional "ditto" copies. Most - * users of ddt_phys_t will handle indexing into or counting the phys they - * want. + * The "traditional" entry has an array of four, one for each number of DVAs + * (copies= property) and another for additional "ditto" copies. Users of the + * traditional struct will specify the variant (index) of the one they want. + * + * The newer "flat" entry has only a single form that is specified using the + * DDT_PHYS_FLAT variant. + * + * Since the value size varies, use one of the size macros when interfacing + * with the ddt zap. */ -typedef struct { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; -#define DDT_PHYS_MAX (4) -#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX) -#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0) -#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? 
(p) : (p)) +#define DDT_PHYS_MAX (4) + +/* + * Note - this can be used in a flexible array and allocated for + * a specific size (ddp_trad or ddp_flat). So be careful not to + * copy using "=" assignment but instead use ddt_phys_copy(). + */ +typedef union { + /* + * Traditional physical payload value for DDT zap (256 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; + } ddp_trad[DDT_PHYS_MAX]; + + /* + * Flat physical payload value for DDT zap (72 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; /* txg based from BP */ + uint64_t ddp_class_start; /* in realtime seconds */ + } ddp_flat; +} ddt_univ_phys_t; + +/* + * This enum denotes which variant of a ddt_univ_phys_t to target. For + * a traditional DDT entry, it represents the indexes into the ddp_trad + * array. Any consumer of a ddt_univ_phys_t needs to know which variant + * is being targeted. + * + * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, + * we maintain the ability to free existing dedup-ditto blocks. + */ + +typedef enum { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_FLAT = 4, + DDT_PHYS_NONE = 5 +} ddt_phys_variant_t; + +#define DDT_PHYS_VARIANT(ddt, p) \ + (ASSERT((p) < DDT_PHYS_NONE), \ + ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) + +#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) +#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) + +#define _DDT_PHYS_SWITCH(ddt, flat, trad) \ + (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad)) + +#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ + DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) + +#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) +#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) +#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -159,6 +218,9 @@ typedef struct { /* copy of data after a repair read, to be rewritten */ abd_t *dde_repair_abd; + /* original phys contents before update, for error handling */ + ddt_univ_phys_t dde_orig_phys; + /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_MAX]; } ddt_entry_io_t; @@ -178,7 +240,7 @@ typedef struct { ddt_entry_io_t *dde_io; /* IO support, when required */ - ddt_phys_t dde_phys[]; /* physical data */ + ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; /* @@ -189,8 +251,7 @@ typedef struct { ddt_key_t ddlwe_key; ddt_type_t ddlwe_type; ddt_class_t ddlwe_class; - uint8_t ddlwe_nphys; - ddt_phys_t ddlwe_phys[DDT_PHYS_MAX]; + ddt_univ_phys_t ddlwe_phys; } ddt_lightweight_entry_t; /* @@ -236,17 +297,26 @@ typedef struct { uint64_t ddb_cursor; } ddt_bookmark_t; -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); +extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, +extern void 
ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp); +extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v); +extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, + const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index e88a046ab8ae..c4e681fb117b 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -42,14 +42,12 @@ extern "C" { #define DDT_DIR_FLAGS "flags" /* Fill a lightweight entry from a live entry. */ -#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ - memset((ddlwe), 0, sizeof (*ddlwe)); \ - (ddlwe)->ddlwe_key = (dde)->dde_key; \ - (ddlwe)->ddlwe_type = (dde)->dde_type; \ - (ddlwe)->ddlwe_class = (dde)->dde_class; \ - (ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \ - for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \ - (ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) /* @@ -61,19 +59,19 @@ typedef struct { boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, - ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index b91d7f4be88f..63734dbc176f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void 
dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/include/sys/spa.h b/include/sys/spa.h index 3998f5a6de73..a70912335b16 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -572,7 +572,7 @@ typedef struct blkptr { #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -580,6 +580,11 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 213e042394f8..59526394bd07 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -75,12 +75,19 @@ * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * - * Each ddt_phys_t slot in the entry represents a separate dedup block for the - * same content/checksum. The slot is selected based on the zp_copies parameter - * the block is written with, that is, the number of DVAs in the block. The - * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" - * feature. These are no longer written, and will be freed if encountered on - * old pools. + * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup + * block for the same content/checksum. The slot is selected based on the + * zp_copies parameter the block is written with, that is, the number of DVAs + * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for + * now-removed "dedupditto" feature. These are no longer written, and will be + * freed if encountered on old pools. + * + * If the "fast_dedup" feature is enabled, new dedup tables will be created + * with the "flat phys" option. In this mode, there is only one ddt_phys_t + * slot. If a write is issued for an entry that exists, but has fewer DVAs, + * then only as many new DVAs are allocated and written to make up the + * shortfall. The existing entry is then extended (ddt_phys_extend()) with the + * new DVAs. * * ## Lifetime of an entry * @@ -130,6 +137,16 @@ * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * + * If the "fast_dedup" feature is enabled, the "flat phys" option will be in + * use, so there is only ever one ddt_phys_t slot. The repair process will + * still happen in this case, though it is unlikely to succeed as there will + * usually be no other equivalent blocks to fall back on (though there might + * be, if this was an early version of a dedup'd block that has since been + * extended). + * + * Note that this repair mechanism is in addition to and separate from the + * regular OpenZFS scrub and self-healing mechanisms. 
+ * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and @@ -162,10 +179,15 @@ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; -#define DDT_ENTRY_SIZE \ - (sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX) +static kmem_cache_t *ddt_entry_flat_cache; +static kmem_cache_t *ddt_entry_trad_cache; + +#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = 0, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT, }; /* Dummy version to signal that configure is still necessary */ @@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt))); + dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int @@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, - ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys)); + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); if (error == 0) { ddlwe->ddlwe_type = type; ddlwe->ddlwe_class = class; - ddlwe->ddlwe_nphys = DDT_NPHYS(ddt); return (0); } return (error); @@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, } void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); + ASSERT3U(v, <, DDT_PHYS_NONE); + uint64_t phys_birth; + const dva_t *dvap; + + if (v == DDT_PHYS_FLAT) { + phys_birth = ddp->ddp_flat.ddp_phys_birth; + dvap = ddp->ddp_flat.ddp_dva; + } else { + phys_birth = ddp->ddp_trad[v].ddp_phys_birth; + dvap = ddp->ddp_trad[v].ddp_dva; + } for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + bp->blk_dva[d] = dvap[d]; + BP_SET_BIRTH(bp, txg, phys_birth); } /* @@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) * will be missing the salt / IV required to do a full decrypting read. 
*/ void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; @@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) } void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { - ASSERT0(ddp->ddp_phys_birth); + ASSERT3U(v, <, DDT_PHYS_NONE); + int bp_ndvas = BP_GET_NDVAS(bp); + int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + dva_t *dvas = (v == DDT_PHYS_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + int s = 0, d = 0; + while (s < bp_ndvas && d < ddp_max_dvas) { + if (DVA_IS_VALID(&dvas[d])) { + d++; + continue; + } + dvas[d] = bp->blk_dva[s]; + s++; d++; + } - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_GET_BIRTH(bp); + /* + * If the caller offered us more DVAs than we can fit, something has + * gone wrong in their accounting. zio_ddt_write() should never ask for + * more than we need. + */ + ASSERT3U(s, ==, bp_ndvas); + + if (BP_IS_ENCRYPTED(bp)) + dvas[2] = bp->blk_dva[2]; + + if (ddt_phys_birth(ddp, v) == 0) { + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); + else + ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); + } } void -ddt_phys_clear(ddt_phys_t *ddp) +ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v) { - memset(ddp, 0, sizeof (*ddp)); + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + dst->ddp_flat = src->ddp_flat; + else + dst->ddp_trad[v] = src->ddp_trad[v]; } void -ddt_phys_addref(ddt_phys_t *ddp) +ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - ddp->ddp_refcnt++; + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); + else + memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } void -ddt_phys_decref(ddt_phys_t *ddp) +ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - if (ddp) { - ASSERT3U(ddp->ddp_refcnt, >, 0); - ddp->ddp_refcnt--; - } + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_refcnt++; + else + ddp->ddp_trad[v].ddp_refcnt++; +} + +uint64_t +ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + uint64_t *refcntp; + + if (v == DDT_PHYS_FLAT) + refcntp = &ddp->ddp_flat.ddp_refcnt; + else + refcntp = &ddp->ddp_trad[v].ddp_refcnt; + + ASSERT3U(*refcntp, >, 0); + (*refcntp)--; + return (*refcntp); } static void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, + ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the @@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) */ BP_SET_DEDUP(&blk, 0); - ddt_phys_clear(ddp); + ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } -ddt_phys_t * +uint64_t +ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + 
ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_phys_birth); + else + return (ddp->ddp_trad[v].ddp_phys_birth); +} + +int +ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_IS_VALID(&dvas[0]) + + DVA_IS_VALID(&dvas[1]) + + DVA_IS_VALID(&dvas[2]) * !encrypted); +} + +ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p]; - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); + const ddt_univ_phys_t *ddp = dde->dde_phys; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && + BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + return (DDT_PHYS_FLAT); + } + } else /* traditional phys */ { + for (int p = 0; p < DDT_PHYS_MAX; p++) { + if (DVA_EQUAL(BP_IDENTITY(bp), + &ddp->ddp_trad[p].ddp_dva[0]) && + BP_GET_BIRTH(bp) == + ddp->ddp_trad[p].ddp_phys_birth) { + return (p); + } + } } - return (NULL); + return (DDT_PHYS_NONE); +} + +uint64_t +ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_refcnt); + else + return (ddp->ddp_trad[v].ddp_refcnt); } uint64_t @@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) { uint64_t refcnt = 0; - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; - refcnt += dde->dde_phys[p].ddp_refcnt; + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; + } else { + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; } return (refcnt); @@ -599,24 +739,33 @@ ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", + DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", + DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void ddt_fini(void) { - kmem_cache_destroy(ddt_entry_cache); + kmem_cache_destroy(ddt_entry_trad_cache); + kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) +ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, DDT_ENTRY_SIZE); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_FLAT_SIZE); + } else { + dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_TRAD_SIZE); + } + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde) } cv_destroy(&dde->dde_cv); - kmem_cache_free(ddt_entry_cache, dde); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
+ ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void @@ -793,7 +943,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) } /* Time to make a new entry. */ - dde = ddt_alloc(&search); + dde = ddt_alloc(ddt, &search); + + /* Record the time this class was created (used by ddt prune) */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) + dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + avl_insert(&ddt->ddt_tree, dde, where); /* @@ -1206,7 +1361,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); - dde = ddt_alloc(&ddk); + dde = ddt_alloc(ddt, &ddk); ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { @@ -1222,7 +1377,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)); + memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } @@ -1265,13 +1420,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) ddt_repair_entry_done, rdde, rio->io_flags); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_phys_t *rddp = &rdde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *rddp = rdde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(ddp, v); + const dva_t *dvas, *rdvas; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dvas = ddp->ddp_flat.ddp_dva; + rdvas = rddp->ddp_flat.ddp_dva; + } else { + dvas = ddp->ddp_trad[p].ddp_dva; + rdvas = rddp->ddp_trad[p].ddp_dva; + } + + if (phys_birth == 0 || + phys_birth != ddt_phys_birth(rddp, v) || + memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, @@ -1297,7 +1465,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, + DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -1322,9 +1491,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) for (int p = 0; p < DDT_NPHYS(ddt); p++) { ASSERT(dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0) { - ASSERT0(ddp->ddp_refcnt); + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); + + if (ddt_phys_birth(ddp, v) == 0) { + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { @@ -1332,12 +1504,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * Note, we no longer create DDT-DITTO blocks, but we * don't want to leak any written by older software. 
*/ - ddt_phys_free(ddt, ddk, ddp, txg); + ddt_phys_free(ddt, ddk, ddp, v, txg); continue; } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; + if (phys_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, v, txg); + total_refcnt += phys_refcnt; } if (total_refcnt > 1) @@ -1371,7 +1543,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, &ddlwe, tx); + ddt->ddt_checksum, ddt, &ddlwe, tx); } } } @@ -1536,12 +1708,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) } if (dde->dde_type < DDT_TYPES) { - ddt_phys_t *ddp; - ASSERT3S(dde->dde_class, <, DDT_CLASSES); int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); - ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); /* * This entry already existed (dde_type is real), so it must @@ -1553,9 +1723,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * likely further action is required to fill out the DDT entry, * and this is a place that is likely to be missed in testing. */ - ASSERT3U(ddp->ddp_refcnt, >, 0); + ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(ddp); + ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 5449eca3afb1..6da77bbca5cb 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) memset(dds, 0, sizeof (*dds)); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + const ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) + if (ddt_phys_birth(ddp, v) == 0) continue; - int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? - SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + int ndvas = ddt_phys_dva_count(ddp, v, + DDK_GET_CRYPT(&dde->dde_key)); + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; + + uint64_t dsize = 0; for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d]); + + uint64_t refcnt = ddt_phys_refcnt(ddp, v); dds->dds_blocks += 1; dds->dds_lsize += lsize; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 8f1bbeeecd8d..4e01624f3684 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) static int ddt_zap_lookup(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) + const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; @@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object) static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, - const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) + const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; @@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, - ddt_phys_t *phys, size_t psize) + void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index dec0eb28dc5f..daf1bd5d637b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; const ddt_key_t *ddk = &ddlwe->ddlwe_key; @@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { - ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; - ddt_bp_create(checksum, ddk, ddp, &bp); + ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); @@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1ca71c738c8f..1f3acb9b921e 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3256,14 +3256,16 @@ zio_ddt_child_read_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); - ddp = ddt_phys_select(ddt, dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + + if (zio->io_error == 0) { + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + /* this phys variant doesn't need repair */ + ddt_phys_clear(dde->dde_phys, v); + } if (zio->io_error == 0 && 
dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; @@ -3284,21 +3286,25 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp_self = ddt_phys_select(ddt, dde, bp); + ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); + ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; - if (ddp_self == NULL) + if (v_self == DDT_PHYS_NONE) return (zio); + /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); + + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, @@ -3378,30 +3384,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (DDT_PHYS_IS_DITTO(ddt, p)) continue; + if (dde->dde_io == NULL) + continue; + zio_t *lio = dde->dde_io->dde_lead_zio[p]; + if (lio == NULL) + continue; - if (lio != NULL && do_raw) { + if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); - } else if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); - } + + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); - ddt_phys_t *ddp = &dde->dde_phys[p]; - - if (ddp->ddp_phys_birth != 0 && do_raw) { + if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) @@ -3424,13 +3432,13 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) abd_free(tmpabd); ddt_enter(ddt); return (error != 0); - } else if (ddp->ddp_phys_birth != 0) { + } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); @@ -3458,52 +3466,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } static void -zio_ddt_child_write_ready(zio_t *zio) +zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - zio_t *pio; - if (zio->io_error) - return; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); + /* we're the lead, so once we're done there's no one else outstanding */ + if (dde->dde_io->dde_lead_zio[p] == zio) + dde->dde_io->dde_lead_zio[p] = NULL; - ddt_phys_fill(ddp, zio->io_bp); + 
ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + if (zio->io_error != 0) { + /* + * The write failed, so we're about to abort the entire IO + * chain. We need to revert the entry back to what it was at + * the last time it was successfully extended. + */ + ddt_phys_copy(ddp, orig, v); + ddt_phys_clear(orig, v); + + ddt_exit(ddt); + return; + } + + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. + */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + + /* + * Add references for all dedup writes that were waiting on the + * physical one, skipping any other physical writes that are waiting. + */ + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_phys_addref(ddp, v); + } ddt_exit(ddt); } static void -zio_ddt_child_write_done(zio_t *zio) +zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (zio->io_error != 0) + return; ddt_enter(ddt); - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); - dde->dde_io->dde_lead_zio[p] = NULL; + ddt_phys_extend(dde->dde_phys, v, zio->io_bp); - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); @@ -3516,7 +3559,6 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; @@ -3537,9 +3579,6 @@ zio_ddt_write(zio_t *zio) return (zio); } - int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum @@ -3563,31 +3602,227 @@ zio_ddt_write(zio_t *zio) return (zio); } - ddt_alloc_entry_io(dde); + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; - if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_io->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); + /* + * In the common cases, at this point we have a regular BP with no + * allocated DVAs, and the corresponding DDT entry for its checksum. + * Our goal is to fill the BP with enough DVAs to satisfy its copies= + * requirement. 
+ * + * One of three things needs to happen to fulfill this: + * + * - if the DDT entry has enough DVAs to satisfy the BP, we just copy + * them out of the entry and return; + * + * - if the DDT entry has no DVAs (ie its brand new), then we have to + * issue the write as normal so that DVAs can be allocated and the + * data land on disk. We then copy the DVAs into the DDT entry on + * return. + * + * - if the DDT entry has some DVAs, but too few, we have to issue the + * write, adjusted to have allocate fewer copies. When it returns, we + * add the new DVAs to the DDT entry, and update the BP to have the + * full amount it originally requested. + * + * In all cases, if there's already a writing IO in flight, we need to + * defer the action until after the write is done. If our action is to + * write, we need to adjust our request for additional DVAs to match + * what will be in the DDT entry after it completes. In this way every + * IO can be guaranteed to recieve enough DVAs simply by joining the + * end of the chain and letting the sequence play out. + */ + + /* + * Number of DVAs in the DDT entry. If the BP is encrypted we ignore + * the third one as normal. + */ + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + + /* Number of DVAs requested bya the IO. */ + uint8_t need_dvas = zp->zp_copies; + + /* + * What we do next depends on whether or not there's IO outstanding that + * will update this entry. + */ + if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * No IO outstanding, so we only need to worry about ourselves. + */ + + /* + * Override BPs bring their own DVAs and their own problems. + */ + if (zio->io_bp_override) { + /* + * For a brand-new entry, all the work has been done + * for us, and we can just fill it out from the provided + * block and leave. + */ + if (have_dvas == 0) { + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_extend(ddp, v, bp); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * If we already have this entry, then we want to treat + * it like a regular write. To do this we just wipe + * them out and proceed like a regular write. + * + * Even if there are some DVAs in the entry, we still + * have to clear them out. We can't use them to fill + * out the dedup entry, as they are all referenced + * together by a bp already on disk, and will be freed + * as a group. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + /* + * If there are enough DVAs in the entry to service our request, + * then we can just use them as-is. + */ + if (have_dvas >= need_dvas) { + ddt_bp_fill(ddp, v, bp, txg); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * Otherwise, we have to issue IO to fill the entry up to the + * amount we need. + */ + need_dvas -= have_dvas; } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + /* + * There's a write in-flight. If there's already enough DVAs on + * the entry, then either there were already enough to start + * with, or the in-flight IO is between READY and DONE, and so + * has extended the entry with new DVAs. Either way, we don't + * need to do anything, we can just slot in behind it. 
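+		 *
+		 * For example, if this IO wants copies=2 and the entry
+		 * already holds two DVAs, we can just fill our BP from the
+		 * entry and add ourselves as a child of the lead zio, so
+		 * that we complete after it does.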
+ */ + + if (zio->io_bp_override) { + /* + * If there's a write out, then we're soon going to + * have our own copies of this block, so clear out the + * override block and treat it as a regular dedup + * write. See comment above. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + if (have_dvas >= need_dvas) { + /* + * A minor point: there might already be enough + * committed DVAs in the entry to service our request, + * but we don't know which are completed and which are + * allocated but not yet written. In this case, should + * the IO for the new DVAs fail, we will be on the end + * of the IO chain and will also recieve an error, even + * though our request could have been serviced. + * + * This is an extremely rare case, as it requires the + * original block to be copied with a request for a + * larger number of DVAs, then copied again requesting + * the same (or already fulfilled) number of DVAs while + * the first request is active, and then that first + * request errors. In return, the logic required to + * catch and handle it is complex. For now, I'm just + * not going to bother with it. + */ + + /* + * We always fill the bp here as we may have arrived + * after the in-flight write has passed READY, and so + * missed out. + */ + ddt_bp_fill(ddp, v, bp, txg); + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * There's not enough in the entry yet, so we need to look at + * the write in-flight and see how many DVAs it will have once + * it completes. + * + * The in-flight write has potentially had its copies request + * reduced (if we're filling out an existing entry), so we need + * to reach in and get the original write to find out what it is + * expecting. + * + * Note that the parent of the lead zio will always have the + * highest zp_copies of any zio in the chain, because ones that + * can be serviced without additional IO are always added to + * the back of the chain. + */ + zio_link_t *zl = NULL; + zio_t *pio = + zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); + ASSERT(pio); + uint8_t parent_dvas = pio->io_prop.zp_copies; + + if (parent_dvas >= need_dvas) { + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_io->dde_lead_zio[p] = cio; + /* + * Still not enough, so we will need to issue to get the + * shortfall. + */ + need_dvas -= parent_dvas; } + /* + * We need to write. We will create a new write with the copies + * property adjusted to match the number of DVAs we need to need to + * grow the DDT entry by to satisfy the request. + */ + zio_prop_t czp = *zp; + czp.zp_copies = need_dvas; + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, + zio_ddt_child_write_ready, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + + /* + * We are the new lead zio, because our parent has the highest + * zp_copies that has been requested for this entry so far. + */ + ddt_alloc_entry_io(dde); + if (dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * First time out, take a copy of the stable entry to revert + * to if there's an error (see zio_ddt_child_write_done()) + */ + ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); + } else { + /* + * Make the existing chain our child, because it cannot + * complete until we have. 
+ */ + zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); + } + dde->dde_io->dde_lead_zio[p] = cio; + ddt_exit(ddt); zio_nowait(cio); @@ -3603,8 +3838,7 @@ zio_ddt_free(zio_t *zio) spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; + ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); @@ -3612,9 +3846,9 @@ zio_ddt_free(zio_t *zio) ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp); if (dde) { - ddp = ddt_phys_select(ddt, dde, bp); - if (ddp) - ddt_phys_decref(ddp); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + if (v != DDT_PHYS_NONE) + ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt); From 27e9cb5f8022bef72553cbe12f7ec292535e4c0b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 15 Jun 2023 17:19:41 +1000 Subject: [PATCH 25/65] ddt: cleanup the stats & histogram code Both the API and the code were kinda mangled and I was really struggling to follow it. The worst offender was the old ddt_stat_add(); after fixing it up the rest of the changes are mostly knock-on effects and targets of opportunity. Note that the old ddt_stat_add() was safe against overflows - it could produce crazy numbers, but the compiler wouldn't do anything stupid. The assertions in ddt_stat_sub() go a lot of the way to protecting against this; getting in a position where overflows are a problem is definitely a programming error. Also expanding ddt_stat_add() and ddt_histogram_empty() produces less efficient assembly. I'm not bothered about this right now though; these should not be hot functions, and if they are we'll optimise them later. If we have to go back to the old form, we'll comment it like crazy. Finally, I've removed the assertion that the bucket will never be negative, as it will soon be possible to have entries with zero refcounts: an entry for a block that is no longer on the pool, but is on the log waiting to be synced out. It might be better to have a separate bucket for these, since they're still using real space on disk, but ultimately these stats are driving UI, and for now I've chosen to keep them matching how they've looked in the past, as well as match the operators mental model - pool usage is managed elsewhere. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. 
Closes #15895 --- cmd/zdb/zdb.c | 22 ++++----- include/sys/ddt.h | 8 ++- include/sys/ddt_impl.h | 4 -- module/zfs/ddt.c | 24 +++++++-- module/zfs/ddt_stats.c | 107 +++++++++++++++++++++++++++++------------ 5 files changed, 114 insertions(+), 51 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 142f55b299e5..250052adfb15 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7357,29 +7357,27 @@ dump_simulated_ddt(spa_t *spa) spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { - ddt_stat_t dds; uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); - dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; - dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; - dds.dds_psize = zdde->zdde_ref_psize / refcnt; - dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; - dds.dds_ref_blocks = zdde->zdde_ref_blocks; - dds.dds_ref_lsize = zdde->zdde_ref_lsize; - dds.dds_ref_psize = zdde->zdde_ref_psize; - dds.dds_ref_dsize = zdde->zdde_ref_dsize; + dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; + dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; + dds->dds_psize += zdde->zdde_ref_psize / refcnt; + dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; - ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], - &dds, 0); + dds->dds_ref_blocks += zdde->zdde_ref_blocks; + dds->dds_ref_lsize += zdde->zdde_ref_lsize; + dds->dds_ref_psize += zdde->zdde_ref_psize; + dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); - ddt_histogram_stat(&dds_total, &ddh_total); + ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 11e09eef3bcc..2dd18526dbb7 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -318,9 +318,15 @@ extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted); +extern void ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); +extern void ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe); + extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); -extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern void ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh); extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); + extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); extern uint64_t ddt_get_ddt_dsize(spa_t *spa); extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index c4e681fb117b..ce4bc559ddb5 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -77,8 +77,6 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; -extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); - /* * These are only exposed so that zdb can access them. 
Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -95,8 +93,6 @@ extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); - extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, char *name); extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 59526394bd07..f3b347326112 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -992,7 +992,18 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { - ddt_stat_update(ddt, dde, -1ULL); + /* + * The histograms only track inactive (stored) blocks. + * We've just put an entry onto the live list, so we need to + * remove its counts. When its synced back, it'll be re-added + * to the right one. + */ + ddt_histogram_t *ddh = + &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_sub_entry(ddt, ddh, &ddlwe); } /* Entry loaded, everyone can proceed now */ @@ -1527,11 +1538,18 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (total_refcnt != 0) { dde->dde_type = ntype; dde->dde_class = nclass; - ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + ddt_histogram_add_entry(ddt, ddh, &ddlwe); + /* * If the class changes, the order that we scan this bp * changes. If it decreases, we could miss it, so @@ -1540,8 +1558,6 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * traversing.) */ if (nclass < oclass) { - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, &ddlwe, tx); } diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 6da77bbca5cb..9316200f21fc 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -33,24 +33,24 @@ #include static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, + ddt_stat_t *dds) { spa_t *spa = ddt->ddt_spa; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); + uint64_t lsize = DDK_GET_LSIZE(&ddlwe->ddlwe_key); + uint64_t psize = DDK_GET_PSIZE(&ddlwe->ddlwe_key); memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - const ddt_univ_phys_t *ddp = dde->dde_phys; + for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0) continue; int ndvas = ddt_phys_dva_count(ddp, v, - DDK_GET_CRYPT(&dde->dde_key)); + DDK_GET_CRYPT(&ddlwe->ddlwe_key)); const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? 
ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; @@ -72,61 +72,108 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) } } +static void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src) +{ + dst->dds_blocks += src->dds_blocks; + dst->dds_lsize += src->dds_lsize; + dst->dds_psize += src->dds_psize; + dst->dds_dsize += src->dds_dsize; + dst->dds_ref_blocks += src->dds_ref_blocks; + dst->dds_ref_lsize += src->dds_ref_lsize; + dst->dds_ref_psize += src->dds_ref_psize; + dst->dds_ref_dsize += src->dds_ref_dsize; +} + +static void +ddt_stat_sub(ddt_stat_t *dst, const ddt_stat_t *src) +{ + /* This caught more during development than you might expect... */ + ASSERT3U(dst->dds_blocks, >=, src->dds_blocks); + ASSERT3U(dst->dds_lsize, >=, src->dds_lsize); + ASSERT3U(dst->dds_psize, >=, src->dds_psize); + ASSERT3U(dst->dds_dsize, >=, src->dds_dsize); + ASSERT3U(dst->dds_ref_blocks, >=, src->dds_ref_blocks); + ASSERT3U(dst->dds_ref_lsize, >=, src->dds_ref_lsize); + ASSERT3U(dst->dds_ref_psize, >=, src->dds_ref_psize); + ASSERT3U(dst->dds_ref_dsize, >=, src->dds_ref_dsize); + + dst->dds_blocks -= src->dds_blocks; + dst->dds_lsize -= src->dds_lsize; + dst->dds_psize -= src->dds_psize; + dst->dds_dsize -= src->dds_dsize; + dst->dds_ref_blocks -= src->dds_ref_blocks; + dst->dds_ref_lsize -= src->dds_ref_lsize; + dst->dds_ref_psize -= src->dds_ref_psize; + dst->dds_ref_dsize -= src->dds_ref_dsize; +} + void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +ddt_histogram_add_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) { - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); + ddt_stat_t dds; + int bucket; - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + ddt_stat_generate(ddt, ddlwe, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + if (bucket < 0) + return; - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; + ddt_stat_add(&ddh->ddh_stat[bucket], &dds); } void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +ddt_histogram_sub_entry(ddt_t *ddt, ddt_histogram_t *ddh, + const ddt_lightweight_entry_t *ddlwe) { ddt_stat_t dds; - ddt_histogram_t *ddh; int bucket; - ddt_stat_generate(ddt, dde, &dds); + ddt_stat_generate(ddt, ddlwe, &dds); bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT3U(bucket, >=, 0); + if (bucket < 0) + return; - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); + ddt_stat_sub(&ddh->ddh_stat[bucket], &dds); } void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) { for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h]); } void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +ddt_histogram_total(ddt_stat_t *dds, const ddt_histogram_t *ddh) { memset(dds, 0, sizeof (*dds)); for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); + ddt_stat_add(dds, &ddh->ddh_stat[h]); } boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh) { - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); + for (int h = 0; h < 64; h++) { + const ddt_stat_t *dds = &ddh->ddh_stat[h]; + + if (dds->dds_blocks == 0 && + dds->dds_lsize == 0 && + dds->dds_psize == 0 && + dds->dds_dsize == 0 && + dds->dds_ref_blocks == 0 && + dds->dds_ref_lsize == 0 && + dds->dds_ref_psize == 0 && + dds->dds_ref_dsize 
== 0) + continue; - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); + return (B_FALSE); + } return (B_TRUE); } @@ -222,7 +269,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); + ddt_histogram_total(dds_total, ddh_total); kmem_free(ddh_total, sizeof (ddt_histogram_t)); } From 592f38900dc21ff86ca9c821c72b55e4ace347af Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 11 Oct 2023 12:46:55 +1100 Subject: [PATCH 26/65] ddt: compare keys 64-bits at a time, trying to match ZAP order This yields substantial performance improvements when we only write out some small % of entries at a time, as it will cause entries that will go into "nearby" ZAP leaf nodes to be grouped closer together in the AVL, and so touch fewer blocks. Without this, the distribution is an even spread, so we touch a lot more ZAP leaf nodes for any given number of entries. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895 --- module/zfs/ddt.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index f3b347326112..26e127d61ac2 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1038,29 +1038,25 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) } /* - * Key comparison. Any struct wanting to make use of this function must have - * the key as the first element. + * ddt_key_t comparison. Any struct wanting to make use of this function must + * have the key as the first element. Casts it to N uint64_ts, and checks until + * we find there's a difference. This is intended to match how ddt_zap.c drives + * the ZAPs (first uint64_t as the key prehash), which will minimise the number + * of ZAP blocks touched when flushing logged entries from an AVL walk. This is + * not an invariant for this function though, should you wish to change it. */ -#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) - -typedef struct ddt_key_cmp { - uint16_t u16[DDT_KEY_CMP_LEN]; -} ddt_key_cmp_t; - int ddt_key_compare(const void *x1, const void *x2) { - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; - int32_t cmp = 0; + const uint64_t *k1 = (const uint64_t *)x1; + const uint64_t *k2 = (const uint64_t *)x2; - for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { - cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; - if (likely(cmp)) - break; - } + int cmp; + for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++) + if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0)) + return (cmp); - return (TREE_ISIGN(cmp)); + return (0); } /* Create the containing dir for this DDT and bump the feature count */ From cbb9ef0a4c8e04358f7d5ddae0eb99d0f703ee21 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 6 Oct 2023 17:06:34 +1100 Subject: [PATCH 27/65] ddt: tuneable to override copies= on dedup metadata objects All objects stored in the MOS get copies=3. For a large dedup table, this requires significant extra IO and disk space, when its not really necessary - the dedup table itself isn't needed to read or write data, only to keep data usage down. Losing the dedup table does not render the pool unusable, it just messes up the accounting somewhat. This adds a dmu_ddt_copies tuneable. When set to 0, the existing behaviour is used. 
When set higher, dedup table blocks (ZAP and log) will have this many
copies rather than the usual 3, while indirect blocks will have one
more again.

This is a tuneable for now mostly for testing. Losing a dedup table can
cause blocks to be leaked, and we currently have no facilities to
repair that.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Co-authored-by: Allan Jude
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15895
---
 module/zfs/dmu.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3dcf49ceb64e..b3eda8ea5097 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -95,6 +95,12 @@ uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
+/*
+ * Override copies= for dedup state objects. 0 means the traditional behaviour
+ * (ie the default for the containing objset ie 3 for the MOS).
+ */
+uint_t dmu_ddt_copies = 0;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
@@ -2272,6 +2278,28 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
+
+		if (dmu_ddt_copies > 0) {
+			/*
+			 * If this tuneable is set, and this is a write for a
+			 * dedup entry store (zap or log), then we treat it
+			 * something like ZFS_REDUNDANT_METADATA_MOST on a
+			 * regular dataset: this many copies, and one more for
+			 * "higher" indirect blocks. This specific exception is
+			 * necessary because dedup objects are stored in the
+			 * MOS, which always has the highest possible copies.
+			 */
+			dmu_object_type_t stype =
+			    dn ? dn->dn_storage_type : DMU_OT_NONE;
+			if (stype == DMU_OT_NONE)
+				stype = type;
+			if (stype == DMU_OT_DDT_ZAP) {
+				copies = dmu_ddt_copies;
+				if (level >=
+				    zfs_redundant_metadata_most_ditto_level)
+					copies++;
+			}
+		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
@@ -2824,3 +2852,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 /* CSTYLED */
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
+
+/* CSTYLED */
+ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
+	"Override copies= for dedup objects");

From cd69ba3d49cdb939cba87e7fd6814608532df92f Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Thu, 22 Jun 2023 17:46:22 +1000
Subject: [PATCH 28/65] ddt: dedup log

Adds a log/journal to dedup. At the end of txg, instead of writing the
entry directly to the ZAP, it is instead added to an in-memory tree and
appended to an on-disk object. The on-disk object is only read at
import, to reload the in-memory tree. Lookups first go to the log tree
before going to the ZAP, so recently-used entries will remain close by
in memory. This vastly reduces overhead from dedup IO, as it will not
have to do so many read/update/write cycles on ZAP leaf nodes.

A flushing facility is added at end of txg, to push logged entries out
to the ZAP. There are actually two separate "logs" (in-memory tree and
on-disk object), one active (receiving updated entries) and one
flushing (writing out to disk). These are swapped (ie flushing begins)
based on memory used by the in-memory log trees and time since we last
flushed something.
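To make the active/flushing split concrete, here is a minimal sketch of
the swap step described above. The names (toy_log_t, toy_log_swap()) are
invented for illustration and deliberately ignore locking and the
on-disk side; the real structures are the pair of ddt_log_t logs added
to ddt_t by this patch, and the real swap is the ddt_log_swap() declared
in include/sys/ddt_impl.h further down.

#include <stdint.h>

/*
 * Illustrative sketch only: two in-memory logs, one accepting new
 * entries and one being flushed out to the ZAP. Starting a new flush
 * amounts to exchanging the two pointers, once the previous flushing
 * log has been fully drained.
 */
typedef struct {
	uint64_t nentries;	/* entries still held in this log */
} toy_log_t;

typedef struct {
	toy_log_t logs[2];
	toy_log_t *active;	/* receives updated entries */
	toy_log_t *flushing;	/* being written back to the ZAP */
} toy_log_pair_t;

static void
toy_log_swap(toy_log_pair_t *lp)
{
	/* don't start a new flush until the old one has drained */
	if (lp->flushing->nentries != 0)
		return;

	toy_log_t *tmp = lp->active;
	lp->active = lp->flushing;
	lp->flushing = tmp;
}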
The flushing facility monitors the number of entries coming in and being
flushed out, and calibrates itself to try to flush enough each txg to
keep up with the ingest rate without competing too much with other IO.

Multiple tuneables are provided to control the flushing facility.

All the histograms and stats are updated to accommodate the log as a
separate entry store. zdb gains knowledge of how to count them and dump
them.

Documentation included!

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Co-authored-by: Allan Jude
Signed-off-by: Rob Norris
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Each one corresponds to specific @@ -209,6 +210,7 @@ typedef enum { /* State flags for dde_flags */ #define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ #define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */ +#define DDE_FLAG_LOGGED (1 << 2) /* loaded from log */ /* * Additional data to support entry update or repair. This is fixed size @@ -254,6 +256,19 @@ typedef struct { ddt_univ_phys_t ddlwe_phys; } ddt_lightweight_entry_t; +/* + * In-core DDT log. A separate struct to make it easier to switch between the + * appending and flushing logs. + */ +typedef struct { + avl_tree_t ddl_tree; /* logged entries */ + uint32_t ddl_flags; /* flags for this log */ + uint64_t ddl_object; /* log object id */ + uint64_t ddl_length; /* on-disk log size */ + uint64_t ddl_first_txg; /* txg log became active */ + ddt_key_t ddl_checkpoint; /* last checkpoint */ +} ddt_log_t; + /* * In-core DDT object. This covers all entries and stats for a the whole pool * for a given checksum type. @@ -262,8 +277,22 @@ typedef struct { kmutex_t ddt_lock; /* protects changes to all fields */ avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + avl_tree_t ddt_log_tree; /* logged entries */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + ddt_log_t ddt_log[2]; /* active/flushing logs */ + ddt_log_t *ddt_log_active; /* pointers into ddt_log */ + ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ + + hrtime_t ddt_flush_start; /* log flush start this txg */ + uint32_t ddt_flush_pass; /* log flush pass this txg */ + + int32_t ddt_flush_count; /* entries flushed this txg */ + int32_t ddt_flush_min; /* min rem entries to flush */ + int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ + int32_t ddt_log_flush_rate; /* rolling log flush rate */ + int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ @@ -276,13 +305,17 @@ typedef struct { /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; - /* object ids for whole-ddt and per-type/per-class stats */ + /* object ids for stored, logged and per-type/per-class stats */ uint64_t ddt_stat_object; + ddt_object_t ddt_log_stats; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + + /* log stats power-2-sized referenced blocks */ + ddt_histogram_t ddt_log_histogram; } ddt_t; /* diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index ce4bc559ddb5..6f11cd90c1d8 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -28,6 +28,7 @@ #define _SYS_DDT_IMPL_H #include +#include #ifdef __cplusplus extern "C" { @@ -50,6 +51,106 @@ extern "C" { memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) +#define DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (ddle)->ddle_key; \ + (ddlwe)->ddlwe_type = (ddle)->ddle_type; \ + (ddlwe)->ddlwe_class = (ddle)->ddle_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (ddle)->ddle_phys, DDT_PHYS_SIZE(ddt)); \ +} while (0) + +/* + * An entry on the log tree. These are "frozen", and a record of what's in + * the on-disk log. They can't be used in place, but can be "loaded" back into + * the live tree. 
+ */ +typedef struct { + ddt_key_t ddle_key; /* ddt_log_tree key */ + avl_node_t ddle_node; /* ddt_log_tree node */ + + ddt_type_t ddle_type; /* storage type */ + ddt_class_t ddle_class; /* storage class */ + + /* extra allocation for flat/trad phys */ + ddt_univ_phys_t ddle_phys[]; +} ddt_log_entry_t; + +/* On-disk log record types. */ +typedef enum { + DLR_INVALID = 0, /* end of block marker */ + DLR_ENTRY = 1, /* an entry to add or replace in the log tree */ +} ddt_log_record_type_t; + +/* On-disk log record header. */ +typedef struct { + /* + * dlr_info is a packed u64, use the DLR_GET/DLR_SET macros below to + * access it. + * + * bits 0-7: record type (ddt_log_record_type_t) + * bits 8-15: length of record header+payload + * bits 16-47: reserved, all zero + * bits 48-55: if type==DLR_ENTRY, storage type (ddt_type) + * otherwise all zero + * bits 56-63: if type==DLR_ENTRY, storage class (ddt_class) + * otherwise all zero + */ + uint64_t dlr_info; + uint8_t dlr_payload[]; +} ddt_log_record_t; + +#define DLR_GET_TYPE(dlr) BF64_GET((dlr)->dlr_info, 0, 8) +#define DLR_SET_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 0, 8, v) +#define DLR_GET_RECLEN(dlr) BF64_GET((dlr)->dlr_info, 8, 16) +#define DLR_SET_RECLEN(dlr, v) BF64_SET((dlr)->dlr_info, 8, 16, v) +#define DLR_GET_ENTRY_TYPE(dlr) BF64_GET((dlr)->dlr_info, 48, 8) +#define DLR_SET_ENTRY_TYPE(dlr, v) BF64_SET((dlr)->dlr_info, 48, 8, v) +#define DLR_GET_ENTRY_CLASS(dlr) BF64_GET((dlr)->dlr_info, 56, 8) +#define DLR_SET_ENTRY_CLASS(dlr, v) BF64_SET((dlr)->dlr_info, 56, 8, v) + +/* Payload for DLR_ENTRY. */ +typedef struct { + ddt_key_t dlre_key; + ddt_univ_phys_t dlre_phys[]; +} ddt_log_record_entry_t; + +/* Log flags (ddl_flags, dlh_flags) */ +#define DDL_FLAG_FLUSHING (1 << 0) /* this log is being flushed */ +#define DDL_FLAG_CHECKPOINT (1 << 1) /* header has a checkpoint */ + +/* On-disk log header, stored in the bonus buffer. */ +typedef struct { + /* + * dlh_info is a packed u64, use the DLH_GET/DLH_SET macros below to + * access it. + * + * bits 0-7: log version + * bits 8-15: log flags + * bits 16-63: reserved, all zero + */ + uint64_t dlh_info; + + uint64_t dlh_length; /* log size in bytes */ + uint64_t dlh_first_txg; /* txg this log went active */ + ddt_key_t dlh_checkpoint; /* last checkpoint */ +} ddt_log_header_t; + +#define DLH_GET_VERSION(dlh) BF64_GET((dlh)->dlh_info, 0, 8) +#define DLH_SET_VERSION(dlh, v) BF64_SET((dlh)->dlh_info, 0, 8, v) +#define DLH_GET_FLAGS(dlh) BF64_GET((dlh)->dlh_info, 8, 8) +#define DLH_SET_FLAGS(dlh, v) BF64_SET((dlh)->dlh_info, 8, 8, v) + +/* DDT log update state */ +typedef struct { + dmu_tx_t *dlu_tx; /* tx the update is being applied to */ + dnode_t *dlu_dn; /* log object dnode */ + dmu_buf_t **dlu_dbp; /* array of block buffer pointers */ + int dlu_ndbp; /* number of block buffer pointers */ + uint16_t dlu_reclen; /* cached length of record */ + uint64_t dlu_block; /* block for next entry */ + uint64_t dlu_offset; /* offset for next entry */ +} ddt_log_update_t; + /* * Ops vector to access a specific DDT object type. 
*/ @@ -77,6 +178,33 @@ typedef struct { extern const ddt_ops_t ddt_zap_ops; +/* Dedup log API */ +extern void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, + ddt_log_update_t *dlu); +extern void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *dde, + ddt_log_update_t *dlu); +extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); + +extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + dmu_tx_t *tx); +extern void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx); + +extern boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx); + +extern void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx); + +extern int ddt_log_load(ddt_t *ddt); +extern void ddt_log_alloc(ddt_t *ddt); +extern void ddt_log_free(ddt_t *ddt); + +extern void ddt_log_init(void); +extern void ddt_log_fini(void); + /* * These are only exposed so that zdb can access them. Try not to use them * outside of the DDT implementation proper, and if you do, consider moving @@ -89,7 +217,8 @@ extern const ddt_ops_t ddt_zap_ops; */ #define DDT_NAMELEN 32 -extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde); +extern uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, + const ddt_univ_phys_t *ddp); extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5b80dc315945..928f5f2b4fd4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -375,6 +375,7 @@ typedef struct dmu_buf { #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_LOG "DDT-log-%s-%u" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db5a9..070dc0132f2f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_log.c \ module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 45b6c338aa9e..aae3d7dfb5f6 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -974,6 +974,88 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . +.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint +Maximum number of dedup log flush passes (iterations) each transaction. +.Pp +At the start of each transaction, OpenZFS will estimate how many entries it +needs to flush out to keep up with the change rate, taking the amount and time +taken to flush on previous txgs into account (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +It will spread this amount into a number of passes. +At each pass, it will use the amount already flushed and the total time taken +by flushing and by other IO to recompute how much it should do for the remainder +of the txg. +.Pp +Reducing the max number of passes will make flushing more aggressive, flushing +out more entries on each pass. +This can be faster, but also more likely to compete with other IO. 
+Increasing the max number of passes will put fewer entries onto each pass, +keeping the overhead of dedup changes to a minimum but possibly causing a large +number of changes to be dumped on the last pass, which can blow out the txg +sync time beyond +.Sy zfs_txg_timeout . +. +.It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint +Minimum time to spend on dedup log flush each transaction. +.Pp +At least this long will be spent flushing dedup log entries each transaction, +up to +.Sy zfs_txg_timeout . +This occurs even if doing so would delay the transaction, that is, other IO +completes under this time. +. +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +Flush at least this many entries each transaction. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction to +keep up with the ingest rate (see +.Sy zfs_dedup_log_flush_flow_rate_txgs ) . +This sets the minimum for that estimate. +Raising it can force OpenZFS to flush more aggressively, keeping the log small +and so reducing pool import times, but can make it less able to back off if +log flushing would compete with other IO too much. +. +.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint +Number of transactions to use to compute the flow rate. +.Pp +OpenZFS will estimate how many entries it needs to flush each transaction by +monitoring the number of entries changed (ingest rate), number of entries +flushed (flush rate) and time spent flushing (flush time rate) and combining +these into an overall "flow rate". +It will use an exponential weighted moving average over some number of recent +transactions to compute these rates. +This sets the number of transactions to compute these averages over. +Setting it higher can help to smooth out the flow rate in the face of spiky +workloads, but will take longer for the flow rate to adjust to a sustained +change in the ingress rate. +. +.It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint +Max transactions to before starting to flush dedup logs. +.Pp +OpenZFS maintains two dedup logs, one receiving new changes, one flushing. +If there is nothing to flush, it will accumulate changes for no more than this +many transactions before switching the logs and starting to flush entries out. +. +.It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 +Max memory to use for dedup logs. +.Pp +OpenZFS will spend no more than this much memory on maintaining the in-memory +dedup log. +Flushing will begin when around half this amount is being spent on logs. +The default value of +.Sy 0 +will cause it to be set by +.Sy zfs_dedup_log_mem_max_percent +instead. +. +.It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint +Max memory to use for dedup logs, as a percentage of total memory. +.Pp +If +.Sy zfs_dedup_log_mem_max +is not set, it will be initialised as a percentage of the total memory in the +system. +. 
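As a rough illustration of the flow-rate averaging these tuneables
describe, the sketch below shows one way an exponentially weighted
moving average over roughly the last N transactions can be maintained.
The function name and the integer arithmetic are invented for the
example; they are not the actual OpenZFS implementation.

#include <stdint.h>

/*
 * Toy example: fold this txg's observation (e.g. entries ingested or
 * flushed) into a rolling average weighted over roughly the last
 * "txgs" transactions, i.e. an EWMA with alpha = 1/txgs.
 */
static uint64_t
ewma_update(uint64_t avg, uint64_t sample, uint64_t txgs)
{
	/* avg' = avg + (sample - avg) / txgs, kept unsigned-safe */
	if (sample >= avg)
		return (avg + (sample - avg) / txgs);
	return (avg - (avg - sample) / txgs);
}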
.It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of diff --git a/module/Kbuild.in b/module/Kbuild.in index 57682214dfd6..a119198dbfc0 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -322,6 +322,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index d9d31564d090..534f3257132a 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -252,6 +252,7 @@ SRCS+= abd.c \ dbuf.c \ dbuf_stats.c \ ddt.c \ + ddt_log.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ @@ -426,6 +427,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_log.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 26e127d61ac2..ce5c4efb51ed 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,28 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup log + * + * Historically, all entries modified on a txg were written back to dedup + * storage objects at the end of every txg. This could cause significant + * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, + * and so required reading the whole node, updating the entry, and writing it + * back. On busy pools, this could add serious IO and memory overheads. + * + * To address this, the dedup log was added. If the "fast_dedup" feature is + * enabled, at the end of each txg, modified entries will be copied to an + * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the + * same block is requested again, the in-memory object will be checked first, + * and if its there, the entry inflated back onto the live tree without going + * to storage. The on-disk log is only read at pool import time, to reload the + * in-memory log. + * + * Each txg, some amount of the in-memory log will be flushed out to a DDT + * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to + * keep up with the rate of change on dedup entries, but not so much that it + * would impact overall throughput, and not using too much memory. See the + * zfs_dedup_log_* tuneables in zfs(4) for more details. + * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in @@ -201,6 +223,26 @@ int zfs_dedup_prefetch = 0; uint_t dedup_class_wait_txgs = 5; +/* + * Don't do more than this many incremental flush passes per txg. + */ +uint_t zfs_dedup_log_flush_passes_max = 8; + +/* + * Minimum time to flush per txg. + */ +uint_t zfs_dedup_log_flush_min_time_ms = 1000; + +/* + * Minimum entries to flush per txg. + */ +uint_t zfs_dedup_log_flush_entries_min = 1000; + +/* + * Number of txgs to average flow rates across. 
+ */ +uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; + static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; @@ -217,7 +259,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = DDT_FLAG_FLAT, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; /* Dummy version to signal that configure is still necessary */ @@ -405,13 +447,13 @@ ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); + ddt->ddt_object[type][class], &ddlwe->ddlwe_key, + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -701,16 +743,15 @@ ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) } uint64_t -ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) +ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; - if (ddt->ddt_flags & DDT_FLAG_FLAT) { - refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; - } else { - for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) - refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; - } + if (ddt->ddt_flags & DDT_FLAG_FLAT) + refcnt = ddp->ddp_flat.ddp_refcnt; + else + for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) + refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } @@ -743,11 +784,15 @@ ddt_init(void) DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + ddt_log_init(); } void ddt_fini(void) { + ddt_log_fini(); + kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); @@ -805,6 +850,13 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + /* Entry is still in the log, so charge the entry back to it */ + if (dde->dde_flags & DDE_FLAG_LOGGED) { + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } @@ -951,6 +1003,25 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) avl_insert(&ddt->ddt_tree, dde, where); + /* If its in the log tree, we can "load" it from there */ + if (ddt->ddt_flags & DDT_FLAG_LOG) { + ddt_lightweight_entry_t ddlwe; + + if (ddt_log_take_key(ddt, ddt->ddt_log_active, + &search, &ddlwe) || + ddt_log_take_key(ddt, ddt->ddt_log_flushing, + &search, &ddlwe)) { + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + + dde->dde_type = ddlwe.ddlwe_type; + dde->dde_class = ddlwe.ddlwe_class; + memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, + DDT_PHYS_SIZE(ddt)); + + return (dde); + } + } + /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, @@ -993,10 +1064,14 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { /* - * The histograms only track inactive (stored) blocks. + * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. 
When its synced back, it'll be re-added * to the right one. + * + * We only do this when we successfully found it in the store. + * error == ENOENT means this is a new entry, and so its already + * not counted. */ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -1099,6 +1174,8 @@ ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) } } + ddt_log_destroy(ddt, tx); + uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, @@ -1241,23 +1318,26 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); - mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt_log_alloc(ddt); + return (ddt); } static void ddt_table_free(ddt_t *ddt) { + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); @@ -1310,6 +1390,10 @@ ddt_load(spa_t *spa) } } + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + /* * Seed the cached histograms. */ @@ -1483,145 +1567,447 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) } static void -ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) +{ + /* + * Count all the entries stored for each type/class, and updates the + * stats within (ddt_object_sync()). If there's no entries for the + * type/class, the whole object is removed. If all objects for the DDT + * are removed, its containing dir is removed, effectively resetting + * the entire DDT to an empty slate. + */ + uint64_t count = 0; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + uint64_t add, tcount = 0; + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); + tcount += add; + } + } + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + if (tcount == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + count += tcount; + } + + if (ddt->ddt_flags & DDT_FLAG_LOG) { + /* Include logged entries in the total count */ + count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); + count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. 
+ */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; + } + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; +} + +static void +ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; - ddt_key_t *ddk = &dde->dde_key; - ddt_type_t otype = dde->dde_type; - ddt_type_t ntype = DDT_TYPE_DEFAULT; - ddt_class_t oclass = dde->dde_class; - ddt_class_t nclass; - uint64_t total_refcnt = 0; - ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Compute the target class, so we can decide whether or not to inform + * the scrub traversal (below). Note that we don't store this in the + * entry, as it might change multiple times before finally being + * committed (if we're logging). Instead, we recompute it in + * ddt_sync_entry(). + */ + uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); + ddt_class_t nclass = + (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + + /* + * If the class changes, the order that we scan this bp changes. If it + * decreases, we could miss it, so scan it right now. (This covers both + * class changing while we are doing ddt_walk(), and when we are + * traversing.) + * + * We also do this when the refcnt goes to zero, because that change is + * only in the log so far; the blocks on disk won't be freed until + * the log is flushed, and the refcnt might increase before that. If it + * does, then we could miss it in the same way. + */ + if (refcnt == 0 || nclass < ddlwe->ddlwe_class) + dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, + ddlwe, tx); +} + +static void +ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, + ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) +{ + ddt_key_t *ddk = &ddlwe->ddlwe_key; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + uint64_t refcnt = 0; + /* + * Compute the total refcnt. Along the way, issue frees for any DVAs + * we no longer want. + */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ASSERT(dde->dde_io == NULL || - dde->dde_io->dde_lead_zio[p] == NULL); - ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT0(phys_refcnt); + ASSERT3U(phys_refcnt, ==, 0); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* - * Note, we no longer create DDT-DITTO blocks, but we - * don't want to leak any written by older software. + * We don't want to keep any obsolete slots (eg ditto), + * regardless of their refcount, but we don't want to + * leak them either. So, free them. */ - ddt_phys_free(ddt, ddk, ddp, v, txg); + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, v, txg); - total_refcnt += phys_refcnt; + /* No remaining references, free it! */ + ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); + refcnt += phys_refcnt; } - if (total_refcnt > 1) - nclass = DDT_CLASS_DUPLICATE; - else - nclass = DDT_CLASS_UNIQUE; + /* Select the best class for the entry. */ + ddt_class_t nclass = + (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; + /* + * If an existing entry changed type or class, or its refcount reached + * zero, delete it from the DDT object + */ if (otype != DDT_TYPES && - (otype != ntype || oclass != nclass || total_refcnt == 0)) { + (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); - ASSERT3U( - ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); + ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } - if (total_refcnt != 0) { - dde->dde_type = ntype; - dde->dde_class = nclass; + /* + * Add or update the entry + */ + if (refcnt != 0) { + ddt_histogram_t *ddh = + &ddt->ddt_histogram[ntype][nclass]; + + ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); + VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); + } +} - ddt_lightweight_entry_t ddlwe; - DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); +/* Calculate an exponential weighted moving average, lower limited to zero */ +static inline int32_t +_ewma(int32_t val, int32_t prev, uint32_t weight) +{ + ASSERT3U(val, >=, 0); + ASSERT3U(prev, >=, 0); + const int32_t new = + MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); + ASSERT3U(new, >=, 0); + return (new); +} - ddt_histogram_t *ddh = - &ddt->ddt_histogram[ntype][nclass]; - ddt_histogram_add_entry(ddt, ddh, &ddlwe); +/* Returns true if done for this txg */ +static boolean_t +ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) +{ + if (ddt->ddt_flush_pass == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) { + /* First run this txg, get set up */ + ddt->ddt_flush_start = gethrtime(); + ddt->ddt_flush_count = 0; + /* + * How many entries we need to flush. We want to at + * least match the ingest rate. + */ + ddt->ddt_flush_min = MAX( + ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_entries_min); + } else { + /* We already decided we're done for this txg */ + return (B_FALSE); + } + } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { /* - * If the class changes, the order that we scan this bp - * changes. If it decreases, we could miss it, so - * scan it right now. (This covers both class changing - * while we are doing ddt_walk(), and when we are - * traversing.) + * We already did some flushing on this pass, skip it. This + * happens when dsl_process_async_destroys() runs during a scan + * (on pass 1) and does an additional ddt_sync() to update + * freed blocks. */ - if (nclass < oclass) { - dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, ddt, &ddlwe, tx); - } + return (B_FALSE); } + + if (spa_sync_pass(ddt->ddt_spa) > + MAX(zfs_dedup_log_flush_passes_max, 1)) { + /* Too many passes this txg, defer until next. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* Nothing to flush, done for this txg. */ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? + MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); + + uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; + + if (elapsed_time >= target_time) { + /* Too long since we started, done for this txg. 
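Putting the per-txg setup above in one place: the flush target is at least the recent ingest rate (and never less than zfs_dedup_log_flush_entries_min), and the time budget depends on whether something is already waiting for the txg to sync. A minimal standalone sketch, assuming the default zfs_txg_timeout of 5 seconds; the names and free-standing form are illustrative, and the real pacing continues in ddt_sync_flush_log_incremental() below.

#include <stdio.h>
#include <stdint.h>

#define MSEC2NSEC(m)	((uint64_t)(m) * 1000000ULL)
#define SEC2NSEC(s)	((uint64_t)(s) * 1000000000ULL)

/* Illustrative stand-ins for the tunables and timeout referenced above. */
static uint64_t log_flush_entries_min = 1000;
static uint64_t log_flush_min_time_ms = 1000;
static uint64_t txg_timeout_s = 5;		/* default zfs_txg_timeout */

static uint64_t
flush_target_entries(uint64_t ingest_rate)
{
	/* Flush at least as fast as new entries are arriving. */
	return (ingest_rate > log_flush_entries_min ?
	    ingest_rate : log_flush_entries_min);
}

static uint64_t
flush_target_time_ns(int txg_sync_waiting)
{
	/*
	 * If something is waiting on the txg, spend only the configured
	 * minimum; otherwise the whole txg timeout is available.
	 */
	uint64_t min_time = MSEC2NSEC(log_flush_min_time_ms);
	uint64_t timeout = SEC2NSEC(txg_timeout_s);
	return (txg_sync_waiting ?
	    (min_time < timeout ? min_time : timeout) : timeout);
}

int
main(void)
{
	printf("flush >= %llu entries within %llu ms\n",
	    (unsigned long long)flush_target_entries(4200),
	    (unsigned long long)(flush_target_time_ns(1) / 1000000ULL));
	return (0);
}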
*/ + ddt->ddt_flush_pass = 0; + return (B_TRUE); + } + + ddt->ddt_flush_pass++; + ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); + + /* + * Estimate how much time we'll need to flush the remaining entries + * based on how long it normally takes. + */ + uint32_t want_time; + if (ddt->ddt_flush_pass == 1) { + /* First pass, use the average time/entries */ + if (ddt->ddt_log_flush_rate == 0) + /* Zero rate, just assume the whole time */ + want_time = target_time; + else + want_time = ddt->ddt_flush_min * + ddt->ddt_log_flush_time_rate / + ddt->ddt_log_flush_rate; + } else { + /* Later pass, calculate from this txg so far */ + want_time = ddt->ddt_flush_min * + elapsed_time / ddt->ddt_flush_count; + } + + /* Figure out how much time we have left */ + uint32_t remain_time = target_time - elapsed_time; + + /* Smear the remaining entries over the remaining passes. */ + uint32_t nentries = ddt->ddt_flush_min / + (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); + if (want_time > remain_time) { + /* + * We're behind; try to catch up a bit by doubling the amount + * this pass. If we're behind that means we're in a later + * pass and likely have most of the remaining time to + * ourselves. If we're in the last couple of passes, then + * doubling might just take us over the timeout, but probably + * not be much, and it stops us falling behind. If we're + * in the middle passes, there'll be more to do, but it + * might just help us catch up a bit and we'll recalculate on + * the next pass anyway. + */ + nentries = MIN(ddt->ddt_flush_min, nentries*2); + } + + ddt_lightweight_entry_t ddlwe; + uint32_t count = 0; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End this pass if we've synced as much as we need to. */ + if (++count >= nentries) + break; + } + ddt->ddt_flush_count += count; + ddt->ddt_flush_min -= count; + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + ddt_log_truncate(ddt, tx); + /* No more passes needed this txg */ + ddt->ddt_flush_pass = 0; + } else + /* More to do next time, save checkpoint */ + ddt_log_checkpoint(ddt, &ddlwe, tx); + + ddt_sync_update_stats(ddt, tx); + + return (ddt->ddt_flush_pass == 0); } static void -ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { - spa_t *spa = ddt->ddt_spa; - ddt_entry_t *dde; - void *cookie = NULL; + ASSERT(avl_is_empty(&ddt->ddt_tree)); - if (avl_numnodes(&ddt->ddt_tree) == 0) + /* Don't do any flushing when the pool is ready to shut down */ + if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) return; - ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); + /* Try to flush some. */ + if (!ddt_sync_flush_log_incremental(ddt, tx)) + /* More to do next time */ + return; - if (spa->spa_ddt_stat_object == 0) { - spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, - DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_DDT_STATS, tx); + /* No more flushing this txg, so we can do end-of-txg housekeeping */ + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { + /* + * No more to flush, and the active list has stuff, so + * try to swap the logs for next time. + */ + (void) ddt_log_swap(ddt, tx); } - if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) - ddt_create_dir(ddt, tx); + /* + * Update flush rate. 
This is an exponential weighted moving average of + * the number of entries flushed over recent txgs. + */ + ddt->ddt_log_flush_rate = _ewma( + ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + zfs_dedup_log_flush_flow_rate_txgs); - while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { - ddt_sync_entry(ddt, dde, tx, txg); - ddt_free(ddt, dde); - } + /* + * Update flush time rate. This is an exponential weighted moving + * average of the total time taken to flush over recent txgs. + */ + ddt->ddt_log_flush_time_rate = _ewma( + ddt->ddt_log_flush_time_rate, + ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + zfs_dedup_log_flush_flow_rate_txgs); +} - uint64_t count = 0; - for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, tcount = 0; - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (ddt_object_exists(ddt, type, class)) { - ddt_object_sync(ddt, type, class, tx); - VERIFY0(ddt_object_count(ddt, type, class, - &add)); - tcount += add; - } +static void +ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) +{ + uint64_t count = avl_numnodes(&ddt->ddt_tree); + + if (count > 0) { + ddt_log_update_t dlu = {0}; + ddt_log_begin(ddt, count, tx, &dlu); + + ddt_entry_t *dde; + void *cookie = NULL; + ddt_lightweight_entry_t ddlwe; + while ((dde = + avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_log_entry(ddt, &ddlwe, &dlu); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } - for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (tcount == 0 && ddt_object_exists(ddt, type, class)) - ddt_object_destroy(ddt, type, class, tx); + + ddt_log_commit(ddt, &dlu); + + /* + * Sync the stats for the store objects. Even though we haven't + * modified anything on those objects, they're no longer the + * source of truth for entries that are now in the log, and we + * need the on-disk counts to reflect that, otherwise we'll + * miscount later when importing. + */ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; + class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) + ddt_object_sync(ddt, type, class, tx); + } } - count += tcount; + + memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, + sizeof (ddt->ddt_histogram)); + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } - if (count == 0) { + if (spa_sync_pass(ddt->ddt_spa) == 1) /* - * No entries left on the DDT, so reset the version for next - * time. This allows us to handle the feature being changed - * since the DDT was originally created. New entries should get - * whatever the feature currently demands. + * Update ingest rate. This is an exponential weighted moving + * average of the number of entries changed over recent txgs. + * The ramp-up cost shouldn't matter too much because the + * flusher will be trying to take at least the minimum anyway. 
*/ - if (ddt->ddt_version == DDT_VERSION_FDT) - ddt_destroy_dir(ddt, tx); + ddt->ddt_log_ingest_rate = _ewma( + count, ddt->ddt_log_ingest_rate, + zfs_dedup_log_flush_flow_rate_txgs); +} - ddt->ddt_version = DDT_VERSION_UNCONFIGURED; - ddt->ddt_flags = 0; +static void +ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) +{ + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ddt_entry_t *dde; + void *cookie = NULL; + while ((dde = avl_destroy_nodes( + &ddt->ddt_tree, &cookie)) != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + + ddt_lightweight_entry_t ddlwe; + DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); + ddt_sync_flush_entry(ddt, &ddlwe, + dde->dde_type, dde->dde_class, tx); + ddt_sync_scan_entry(ddt, &ddlwe, tx); + ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); - spa->spa_dedup_dspace = ~0ULL; - spa->spa_dedup_dsize = ~0ULL; + ddt->ddt_spa->spa_dedup_dspace = ~0ULL; + ddt->ddt_spa->spa_dedup_dsize = ~0ULL; + ddt_sync_update_stats(ddt, tx); +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + + if (ddt->ddt_version == UINT64_MAX) + return; + + if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + return; + } + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_table_log(ddt, tx); + else + ddt_sync_table_flush(ddt, tx); } void @@ -1651,7 +2037,9 @@ ddt_sync(spa_t *spa, uint64_t txg) ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; - ddt_sync_table(ddt, tx, txg); + ddt_sync_table(ddt, tx); + if (ddt->ddt_flags & DDT_FLAG_LOG) + ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } @@ -1719,9 +2107,12 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } - if (dde->dde_type < DDT_TYPES) { - ASSERT3S(dde->dde_class, <, DDT_CLASSES); - + if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { + /* + * This entry was either synced to a store object (dde_type is + * real) or was logged. It must be properly on disk at this + * point, so we can just bump its refcount. + */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -1748,7 +2139,6 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * we may have a block with the DEDUP set, but which doesn't * have a corresponding entry in the DDT. Be ready. 
*/ - ASSERT3S(dde->dde_class, ==, DDT_CLASSES); ddt_remove(ddt, dde); result = B_FALSE; } @@ -1761,3 +2151,15 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, + "Max number of incremental dedup log flush passes per transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, + "Min time to spend on incremental dedup log flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, + "Min number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, + "Number of txgs to average flow rates across"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c new file mode 100644 index 000000000000..7e7ff9e5b89f --- /dev/null +++ b/module/zfs/ddt_log.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, Klara Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * No more than this many txgs before swapping logs. + */ +uint_t zfs_dedup_log_txg_max = 8; + +/* + * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module + * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. + */ +uint64_t zfs_dedup_log_mem_max = 0; +uint_t zfs_dedup_log_mem_max_percent = 1; + + +static kmem_cache_t *ddt_log_entry_flat_cache; +static kmem_cache_t *ddt_log_entry_trad_cache; + +#define DDT_LOG_ENTRY_FLAT_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_LOG_ENTRY_TRAD_SIZE \ + (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_LOG_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) + +void +ddt_log_init(void) +{ + ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", + DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", + DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * Max memory for log AVL entries. At least 1M, because we need + * something (that's ~3800 entries per tree). They can say 100% if they + * want; it just means they're at the mercy of the the txg flush limit. 
+ */ + if (zfs_dedup_log_mem_max == 0) { + zfs_dedup_log_mem_max_percent = + MIN(zfs_dedup_log_mem_max_percent, 100); + zfs_dedup_log_mem_max = (physmem * PAGESIZE) * + zfs_dedup_log_mem_max_percent / 100; + } + zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); +} + +void +ddt_log_fini(void) +{ + kmem_cache_destroy(ddt_log_entry_trad_cache); + kmem_cache_destroy(ddt_log_entry_flat_cache); +} + +static void +ddt_log_name(ddt_t *ddt, char *name, uint_t n) +{ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, + zio_checksum_table[ddt->ddt_checksum].ci_name, n); +} + +static void +ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) +{ + dmu_buf_t *db; + VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + + ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; + DLH_SET_VERSION(hdr, 1); + DLH_SET_FLAGS(hdr, ddl->ddl_flags); + hdr->dlh_length = ddl->ddl_length; + hdr->dlh_first_txg = ddl->ddl_first_txg; + hdr->dlh_checkpoint = ddl->ddl_checkpoint; + + dmu_buf_rele(db, FTAG); +} + +static void +ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + ASSERT3U(ddl->ddl_object, ==, 0); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &ddl->ddl_object, tx)); + ddl->ddl_length = 0; + ddl->ddl_first_txg = tx->tx_txg; + ddt_log_update_header(ddt, ddl, tx); +} + +static void +ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, >, 0); + + if (ddl->ddl_object == 0) + return; + + ASSERT0(ddl->ddl_length); + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); + VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); + + ddl->ddl_object = 0; +} + +void +ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); + ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); +} + +static void +ddt_log_update_stats(ddt_t *ddt) +{ + /* + * Log object stats. We count the number of live entries in the log + * tree, even if there are more than on disk, and even if the same + * entry is on both append and flush trees, because that's more what + * the user expects to see. This does mean the on-disk size is not + * really correlated with the number of entries, but I don't think + * that's reasonable to expect anyway. 
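Returning to the cap set up at the top of ddt_log_init() above, a worked example may help; the machine size is hypothetical and the program below is purely illustrative.

#include <stdio.h>
#include <stdint.h>

/*
 * Worked example of the default dedup log memory cap computed in
 * ddt_log_init() above, assuming a hypothetical 16 GiB machine and the
 * default zfs_dedup_log_mem_max_percent of 1.
 */
int
main(void)
{
	uint64_t physmem_bytes = 16ULL * 1024 * 1024 * 1024;
	uint64_t pct = 1;

	uint64_t cap = physmem_bytes * pct / 100;
	if (cap < 1ULL * 1024 * 1024)	/* never below the 1 MiB floor */
		cap = 1ULL * 1024 * 1024;

	/* ~163 MiB here; flushing starts when about half of it is in use. */
	printf("dedup log memory cap: %llu MiB\n",
	    (unsigned long long)(cap >> 20));
	return (0);
}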
+ */ + dmu_object_info_t doi; + uint64_t nblocks; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); + nblocks = doi.doi_physical_blocks_512; + dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); + nblocks += doi.doi_physical_blocks_512; + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo->ddo_count = + avl_numnodes(&ddt->ddt_log_active->ddl_tree) + + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); + ddo->ddo_dspace = nblocks << 9; +} + +void +ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) +{ + ASSERT3U(nentries, >, 0); + ASSERT3P(dlu->dlu_dbp, ==, NULL); + + if (ddt->ddt_log_active->ddl_object == 0) + ddt_log_create(ddt, tx); + + /* + * We want to store as many entries as we can in a block, but never + * split an entry across block boundaries. + */ + size_t reclen = P2ALIGN_TYPED( + sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); + ASSERT3U(reclen, <=, UINT16_MAX); + dlu->dlu_reclen = reclen; + + VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, + &dlu->dlu_dn)); + dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); + + uint64_t nblocks = howmany(nentries, + dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); + uint64_t offset = ddt->ddt_log_active->ddl_length; + uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; + + VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, + B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, + DMU_READ_NO_PREFETCH)); + + dlu->dlu_tx = tx; + dlu->dlu_block = dlu->dlu_offset = 0; +} + +static ddt_log_entry_t * +ddt_log_alloc_entry(ddt_t *ddt) +{ + ddt_log_entry_t *ddle; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); + } else { + ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); + memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); + } + + return (ddle); +} + +static void +ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + /* Create the log tree entry from a live or stored entry */ + avl_index_t where; + ddt_log_entry_t *ddle = + avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); + if (ddle == NULL) { + ddle = ddt_log_alloc_entry(ddt); + ddle->ddle_key = ddlwe->ddlwe_key; + avl_insert(&ddl->ddl_tree, ddle, where); + } + ddle->ddle_type = ddlwe->ddlwe_type; + ddle->ddle_class = ddlwe->ddlwe_class; + memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); +} + +void +ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + + ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + /* Get our block */ + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; + + /* + * If this would take us past the end of the block, finish it and + * move to the next one. + */ + if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { + ASSERT3U(dlu->dlu_offset, >, 0); + dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); + dlu->dlu_block++; + dlu->dlu_offset = 0; + ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); + db = dlu->dlu_dbp[dlu->dlu_block]; + } + + /* + * If this is the first time touching the block, inform the DMU that + * we will fill it, and zero it out. 
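Stepping back to the sizing in ddt_log_begin() above: records are rounded up to 8-byte alignment and never split across data blocks, so the number of blocks appended per txg follows directly from the entry count. A sketch with made-up sizes (the real record length depends on DDT_PHYS_SIZE() for the table, so all numbers below are assumptions):

#include <stdio.h>
#include <stddef.h>

int
main(void)
{
	/* All three sizes below are hypothetical, for illustration only. */
	size_t datablksz = 128 * 1024;	/* dnode data block size */
	size_t reclen = 136;		/* aligned log record length */
	size_t nentries = 5000;		/* entries dirtied this txg */

	size_t per_block = datablksz / reclen;	/* records never straddle blocks */
	size_t nblocks = (nentries + per_block - 1) / per_block; /* howmany() */

	printf("%zu records per block, %zu blocks appended\n",
	    per_block, nblocks);
	return (0);
}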
+ */ + if (dlu->dlu_offset == 0) { + dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); + memset(db->db_data, 0, db->db_size); + } + + /* Create the log record directly in the buffer */ + ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); + DLR_SET_TYPE(dlr, DLR_ENTRY); + DLR_SET_RECLEN(dlr, dlu->dlu_reclen); + DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); + DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)&dlr->dlr_payload; + dlre->dlre_key = ddlwe->ddlwe_key; + memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); + + /* Advance offset for next record. */ + dlu->dlu_offset += dlu->dlu_reclen; +} + +void +ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) +{ + ASSERT3U(dlu->dlu_dbp, !=, NULL); + ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); + ASSERT3U(dlu->dlu_offset, >, 0); + + /* + * Close out the last block. Whatever we haven't used will be zeroed, + * which matches DLR_INVALID, so we can detect this during load. + */ + dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); + + dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); + + ddt->ddt_log_active->ddl_length += + dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; + dnode_rele(dlu->dlu_dn, FTAG); + + ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); + + memset(dlu, 0, sizeof (ddt_log_update_t)); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +boolean_t +ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); + if (ddle == NULL) + return (B_FALSE); + + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + + avl_remove(&ddl->ddl_tree, ddle); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + + return (B_TRUE); +} + +void +ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + ASSERT3U(ddl->ddl_object, !=, 0); + +#ifdef ZFS_DEBUG + /* + * There should not be any entries on the log tree before the given + * checkpoint. Assert that this is the case. 
+ */ + ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); + if (ddle != NULL) + VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), + >, 0); +#endif + + ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; + ddl->ddl_checkpoint = ddlwe->ddlwe_key; + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +void +ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) +{ + ddt_log_t *ddl = ddt->ddt_log_flushing; + + if (ddl->ddl_object == 0) + return; + + ASSERT(avl_is_empty(&ddl->ddl_tree)); + + /* Eject the entire object */ + dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); + + ddl->ddl_length = 0; + ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; + memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); + ddt_log_update_header(ddt, ddl, tx); + + ddt_log_update_stats(ddt); +} + +boolean_t +ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) +{ + /* Swap the logs. The old flushing one must be empty */ + VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); + + /* + * If there are still blocks on the flushing log, truncate it first. + * This can happen if there were entries on the flushing log that were + * removed in memory via ddt_lookup(); their vestigal remains are + * on disk. + */ + if (ddt->ddt_log_flushing->ddl_length > 0) + ddt_log_truncate(ddt, tx); + + /* + * Swap policy. We swap the logs (and so begin flushing) when the + * active tree grows too large, or when we haven't swapped it in + * some amount of time. + */ + + /* + * The log tree is too large if the memory usage of its entries is over + * half of the memory limit. This effectively gives each log tree half + * the available memory. + */ + const boolean_t too_large = + (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * + DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); + + const boolean_t too_old = + tx->tx_txg >= + (ddt->ddt_log_active->ddl_first_txg + + MAX(1, zfs_dedup_log_txg_max)); + + if (!(too_large || too_old)) + return (B_FALSE); + + ddt_log_t *swap = ddt->ddt_log_active; + ddt->ddt_log_active = ddt->ddt_log_flushing; + ddt->ddt_log_flushing = swap; + + ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); + ddt->ddt_log_active->ddl_flags &= + ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); + + ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; + + ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; + + ddt_log_update_header(ddt, ddt->ddt_log_active, tx); + ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); + + ddt_log_update_stats(ddt); + + return (B_TRUE); +} + +static inline void +ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, + const ddt_key_t *checkpoint) +{ + ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); + + ddt_log_record_entry_t *dlre = + (ddt_log_record_entry_t *)dlr->dlr_payload; + if (checkpoint != NULL && + ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { + /* Skip pre-checkpoint entries; they're already flushed. 
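To make the swap policy in ddt_log_swap() above concrete, here is the same decision with example numbers plugged in. The entry size is an assumption consistent with the ~3800-entries-per-tree figure quoted in ddt_log_init(), and the flushing side must already be empty before any of this is considered.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Illustrative numbers only. */
	uint64_t mem_max = 1 * 1024 * 1024;	/* the 1 MiB floor */
	uint64_t entry_size = 136;		/* assumed flat log entry size */
	uint64_t active_entries = 4000;
	uint64_t txg = 120, first_txg = 110, txg_max = 8;

	int too_large = (active_entries * entry_size) >= (mem_max >> 1);
	int too_old = txg >= first_txg + (txg_max > 1 ? txg_max : 1);

	/* Either condition alone is enough to swap and begin flushing. */
	printf("too_large=%d too_old=%d -> swap=%d\n",
	    too_large, too_old, too_large || too_old);
	return (0);
}

With these defaults even a mostly idle pool swaps within zfs_dedup_log_txg_max transactions, so changes never sit in the active log indefinitely.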
*/ + return; + } + + ddt_lightweight_entry_t ddlwe; + ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); + ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); + + ddlwe.ddlwe_key = dlre->dlre_key; + memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); + + ddt_log_update_entry(ddt, ddl, &ddlwe); +} + +static void +ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) +{ + void *cookie = NULL; + ddt_log_entry_t *ddle; + IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); + while ((ddle = + avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); + } + ASSERT(avl_is_empty(&ddl->ddl_tree)); +} + +static int +ddt_log_load_one(ddt_t *ddt, uint_t n) +{ + ASSERT3U(n, <, 2); + + ddt_log_t *ddl = &ddt->ddt_log[n]; + + char name[DDT_NAMELEN]; + ddt_log_name(ddt, name, n); + + uint64_t obj; + int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, + sizeof (uint64_t), 1, &obj); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + dnode_t *dn; + err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); + if (err != 0) + return (err); + + ddt_log_header_t hdr; + dmu_buf_t *db; + err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + return (err); + } + memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); + dmu_buf_rele(db, FTAG); + + if (DLH_GET_VERSION(&hdr) != 1) { + dnode_rele(dn, FTAG); + zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " + "unknown version=%llu", spa_name(ddt->ddt_spa), name, + (u_longlong_t)DLH_GET_VERSION(&hdr)); + return (SET_ERROR(EINVAL)); + } + + ddt_key_t *checkpoint = NULL; + if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { + /* + * If the log has a checkpoint, then we can ignore any entries + * that have already been flushed. + */ + ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); + checkpoint = &hdr.dlh_checkpoint; + } + + if (hdr.dlh_length > 0) { + dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, + ZIO_PRIORITY_SYNC_READ); + + for (uint64_t offset = 0; offset < hdr.dlh_length; + offset += dn->dn_datablksz) { + err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, + DMU_READ_PREFETCH); + if (err != 0) { + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (err); + } + + uint64_t boffset = 0; + while (boffset < db->db_size) { + ddt_log_record_t *dlr = + (ddt_log_record_t *)(db->db_data + boffset); + + /* Partially-filled block, skip the rest */ + if (DLR_GET_TYPE(dlr) == DLR_INVALID) + break; + + switch (DLR_GET_TYPE(dlr)) { + case DLR_ENTRY: + ddt_log_load_entry(ddt, ddl, dlr, + checkpoint); + break; + + default: + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + ddt_log_empty(ddt, ddl); + return (SET_ERROR(EINVAL)); + } + + boffset += DLR_GET_RECLEN(dlr); + } + + dmu_buf_rele(db, FTAG); + } + } + + dnode_rele(dn, FTAG); + + ddl->ddl_object = obj; + ddl->ddl_flags = DLH_GET_FLAGS(&hdr); + ddl->ddl_length = hdr.dlh_length; + ddl->ddl_first_txg = hdr.dlh_first_txg; + + if (ddl->ddl_flags & DDL_FLAG_FLUSHING) + ddt->ddt_log_flushing = ddl; + else + ddt->ddt_log_active = ddl; + + return (0); +} + +int +ddt_log_load(ddt_t *ddt) +{ + int err; + + if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { + /* + * The DDT is going to be freed again in a moment, so there's + * no point loading the log; it'll just slow down import. 
+ */ + return (0); + } + + ASSERT0(ddt->ddt_log[0].ddl_object); + ASSERT0(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist + * yet, then the log object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, UINT64_MAX); + return (SET_ERROR(ENOENT)); + } + + if ((err = ddt_log_load_one(ddt, 0)) != 0) + return (err); + if ((err = ddt_log_load_one(ddt, 1)) != 0) + return (err); + + VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING)); + VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT)); + VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING); + + /* + * We have two finalisation tasks: + * + * - rebuild the histogram. We do this at the end rather than while + * we're loading so we don't need to uncount and recount entries that + * appear multiple times in the log. + * + * - remove entries from the flushing tree that are on both trees. This + * happens when ddt_lookup() rehydrates an entry from the flushing + * tree, as ddt_log_take_key() removes the entry from the in-memory + * tree but doesn't remove it from disk. + */ + + /* + * We don't technically need a config lock here, since there shouldn't + * be pool config changes during DDT load. dva_get_dsize_sync() via + * ddt_stat_generate() is expecting it though, and it won't hurt + * anything, so we take it. + */ + spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER); + + avl_tree_t *al = &ddt->ddt_log_active->ddl_tree; + avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree; + ddt_log_entry_t *ae = avl_first(al); + ddt_log_entry_t *fe = avl_first(fl); + while (ae != NULL || fe != NULL) { + ddt_log_entry_t *ddle; + if (ae == NULL) { + /* active exhausted, take flushing */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else if (fe == NULL) { + /* flushing exuhausted, take active */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else { + /* compare active and flushing */ + int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key); + if (c < 0) { + /* active behind, take and advance */ + ddle = ae; + ae = AVL_NEXT(al, ae); + } else if (c > 0) { + /* flushing behind, take and advance */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + } else { + /* match. 
remove from flushing, take active */ + ddle = fe; + fe = AVL_NEXT(fl, fe); + avl_remove(fl, ddle); + + ddle = ae; + ae = AVL_NEXT(al, ae); + } + } + + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); + } + + spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); + + ddt_log_update_stats(ddt); + + return (0); +} + +void +ddt_log_alloc(ddt_t *ddt) +{ + ASSERT3P(ddt->ddt_log_active, ==, NULL); + ASSERT3P(ddt->ddt_log_flushing, ==, NULL); + + avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, + sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); + ddt->ddt_log_active = &ddt->ddt_log[0]; + ddt->ddt_log_flushing = &ddt->ddt_log[1]; + ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; +} + +void +ddt_log_free(ddt_t *ddt) +{ + ddt_log_empty(ddt, &ddt->ddt_log[0]); + ddt_log_empty(ddt, &ddt->ddt_log[1]); + avl_destroy(&ddt->ddt_log[0].ddl_tree); + avl_destroy(&ddt->ddt_log[1].ddl_tree); +} + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, + "Max transactions before starting to flush dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, + "Max memory for dedup logs"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, + "Max memory for dedup logs, as % of total memory"); diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 9316200f21fc..8f55bc24f0f5 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -42,7 +42,7 @@ ddt_stat_generate(ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, memset(dds, 0, sizeof (*dds)); - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { + for (int p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); @@ -222,6 +222,11 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) ddo_total->ddo_mspace += ddo->ddo_mspace; } } + + ddt_object_t *ddo = &ddt->ddt_log_stats; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; } /* @@ -259,6 +264,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) &ddt->ddt_histogram_cache[type][class]); } } + + ddt_histogram_add(ddh, &ddt->ddt_log_histogram); } } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 3de316a12504..96943421f84c 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -31,6 +31,7 @@ DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special +DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh index 83c4d7c8e2aa..4f6e5805bb3a 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_create.ksh @@ -29,9 +29,16 @@ log_assert "basic dedup (FDT) 
operations work" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh index f0f20671b95d..259eaddc0843 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_import.ksh @@ -29,9 +29,16 @@ log_assert "dedup (FDT) retains version after import" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh index 049ccaae3dca..114cf0266e12 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_mixed.ksh @@ -30,9 +30,16 @@ log_assert "legacy and FDT dedup tables on the same pool can happily coexist" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh index d563fade88af..c36463134fde 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_legacy_fdt_upgrade.ksh @@ -30,9 +30,16 @@ log_assert "legacy dedup tables work after upgrade; new dedup tables created as FDT" +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_TXG_MAX } log_onexit cleanup diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 5b83a1ca396f..326152b510a9 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -51,6 +51,12 @@ POOL="dedup_pool" save_tunable TXG_TIMEOUT +# we set the dedup log txg interval to 1, to get a log flush every txg, +# effectively disabling the log. 
without this it's hard to predict when and +# where things appear on-disk +log_must save_tunable DEDUP_LOG_TXG_MAX +log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 + function cleanup { if poolexists $POOL ; then @@ -58,6 +64,7 @@ function cleanup fi log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable TXG_TIMEOUT + log_must restore_tunable DEDUP_LOG_TXG_MAX } @@ -206,10 +213,15 @@ function ddt_dedup_vdev_limit # # With no DDT quota in place, the above workload will produce over - # 800,000 entries by using space in the normal class. With a quota, - # it will be well below 500,000 entries. + # 800,000 entries by using space in the normal class. With a quota, it + # should be well under 500,000. However, logged entries are hard to + # account for because they can appear on both logs, and can also + # represent an eventual removal. This isn't easily visible from + # outside, and even internally can result in going slightly over quota. + # For here, we just set the entry count a little higher than what we + # expect to allow for some instability. # - log_must test $(ddt_entries) -le 500000 + log_must test $(ddt_entries) -le 600000 do_clean } From a1902f49509b66a475c7b4b0d081792f33f1dc52 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 16 Oct 2023 11:52:17 +1100 Subject: [PATCH 29/65] ddt: block scan until log is flushed, and flush aggressively The dedup log does not have a stable cursor, so its not possible to persist our current scan location within it across pool reloads. Beccause of this, when walking (scanning), we can't treat it like just another source of dedup entries. Instead, when a scan is wanted, we switch to an aggressive flushing mode, pushing out entries older than the scan start txg as fast as we can, before starting the scan proper. Entries after the scan start txg will be handled via other methods; the DDT ZAPs and logs will be written as normal, and blocks not seen yet will be offered to the scan machinery as normal. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Co-authored-by: Allan Jude Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895 --- include/sys/ddt.h | 5 ++ module/zfs/ddt.c | 89 +++++++++++++++++++ module/zfs/ddt_log.c | 8 +- module/zfs/dsl_scan.c | 25 +++++- .../zpool_prefetch/zpool_prefetch_001_pos.ksh | 4 + 5 files changed, 125 insertions(+), 6 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 2fc798725eda..a7920e658062 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -294,6 +294,8 @@ typedef struct { int32_t ddt_log_flush_rate; /* rolling log flush rate */ int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ + uint64_t ddt_flush_force_txg; /* flush hard before this txg */ + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ objset_t *ddt_os; /* ddt objset (always MOS) */ @@ -393,6 +395,9 @@ extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); + +extern void ddt_walk_init(spa_t *spa, uint64_t txg); +extern boolean_t ddt_walk_ready(spa_t *spa); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index ce5c4efb51ed..051005f137bd 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -183,6 +183,12 @@ * position on the object even if the object changes, the pool is exported, or * OpenZFS is upgraded. 
* + * If the "fast_dedup" feature is enabled and the table has a log, the scan + * cannot begin until entries on the log are flushed, as the on-disk log has no + * concept of a "stable position". Instead, the log flushing process will enter + * a more aggressive mode, to flush out as much as is necesary as soon as + * possible, in order to begin the scan as soon as possible. + * * ## Interaction with block cloning * * If block cloning and dedup are both enabled on a pool, BRT will look for the @@ -1746,6 +1752,16 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) ddt->ddt_flush_min = MAX( ddt->ddt_log_ingest_rate, zfs_dedup_log_flush_entries_min); + + /* + * If we've been asked to flush everything in a hurry, + * try to dump as much as possible on this txg. In + * this case we're only limited by time, not amount. + */ + if (ddt->ddt_flush_force_txg > 0) + ddt->ddt_flush_min = + MAX(ddt->ddt_flush_min, avl_numnodes( + &ddt->ddt_log_flushing->ddl_tree)); } else { /* We already decided we're done for this txg */ return (B_FALSE); @@ -1856,6 +1872,40 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) return (ddt->ddt_flush_pass == 0); } +static inline void +ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) +{ + /* + * If we're not forcing flush, and not being asked to start, then + * there's nothing more to do. + */ + if (txg == 0) { + /* Update requested, are we currently forcing flush? */ + if (ddt->ddt_flush_force_txg == 0) + return; + txg = ddt->ddt_flush_force_txg; + } + + /* + * If either of the logs have entries unflushed entries before + * the wanted txg, set the force txg, otherwise clear it. + */ + + if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) && + ddt->ddt_log_active->ddl_first_txg <= txg) || + (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && + ddt->ddt_log_flushing->ddl_first_txg <= txg)) { + ddt->ddt_flush_force_txg = txg; + return; + } + + /* + * Nothing to flush behind the given txg, so we can clear force flush + * state. + */ + ddt->ddt_flush_force_txg = 0; +} + static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { @@ -1881,6 +1931,9 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) (void) ddt_log_swap(ddt, tx); } + /* If force flush is no longer necessary, turn it off. */ + ddt_flush_force_update_txg(ddt, 0); + /* * Update flush rate. This is an exponential weighted moving average of * the number of entries flushed over recent txgs. 
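Restating the gate maintained by ddt_flush_force_update_txg() above: once a scan has named its start txg, aggressive flushing continues while either log still holds entries born at or before that txg. A standalone sketch; the names and free function form are illustrative only.

#include <stdio.h>
#include <stdint.h>

/*
 * Condensed restatement of the decision in ddt_flush_force_update_txg():
 * keep forcing aggressive flushing while either log still holds entries
 * whose first txg is at or before the scan's start txg.
 */
static int
still_need_force_flush(uint64_t active_entries, uint64_t active_first_txg,
    uint64_t flushing_entries, uint64_t flushing_first_txg,
    uint64_t scan_start_txg)
{
	if (active_entries > 0 && active_first_txg <= scan_start_txg)
		return (1);
	if (flushing_entries > 0 && flushing_first_txg <= scan_start_txg)
		return (1);
	return (0);	/* nothing old enough remains; the walk can begin */
}

int
main(void)
{
	/* Scan requested at txg 500; flushing log still has txg-490 entries. */
	printf("force=%d\n",
	    still_need_force_flush(1200, 505, 3000, 490, 500));
	return (0);
}

Once this becomes false for every checksum's DDT, ddt_walk_ready() lets the traversal proceed, as shown in the dsl_scan changes below.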
@@ -2049,6 +2102,38 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_commit(tx); } +void +ddt_walk_init(spa_t *spa, uint64_t txg) +{ + if (txg == 0) + txg = spa_syncing_txg(spa); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) + continue; + + ddt_enter(ddt); + ddt_flush_force_update_txg(ddt, txg); + ddt_exit(ddt); + } +} + +boolean_t +ddt_walk_ready(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) + continue; + + if (ddt->ddt_flush_force_txg > 0) + return (B_FALSE); + } + + return (B_TRUE); +} + int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) { @@ -2058,6 +2143,10 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; if (ddt == NULL) continue; + + if (ddt->ddt_flush_force_txg > 0) + return (EAGAIN); + int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index 7e7ff9e5b89f..a367d0cd02f8 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -435,7 +435,8 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) /* * Swap policy. We swap the logs (and so begin flushing) when the * active tree grows too large, or when we haven't swapped it in - * some amount of time. + * some amount of time, or if something has requested the logs be + * flushed ASAP (see ddt_walk_init()). */ /* @@ -452,7 +453,10 @@ ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) (ddt->ddt_log_active->ddl_first_txg + MAX(1, zfs_dedup_log_txg_max)); - if (!(too_large || too_old)) + const boolean_t force = + ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg; + + if (!(too_large || too_old || force)) return (B_FALSE); ddt_log_t *swap = ddt->ddt_log_active; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index daf1bd5d637b..9d040e146308 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -630,6 +630,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zap_cursor_fini(&zc); } + ddt_walk_init(spa, scn->scn_phys.scn_max_txg); + spa_scan_stat_init(spa); vdev_scan_stat_init(spa->spa_root_vdev); @@ -951,6 +953,8 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); + ddt_walk_init(spa, scn->scn_phys.scn_max_txg); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, @@ -1636,7 +1640,8 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) txg_sync_waiting(scn->scn_dp) || NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || spa_shutting_down(scn->scn_dp->dp_spa) || - (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn)) || + !ddt_walk_ready(scn->scn_dp->dp_spa)) { if (zb && zb->zb_level == ZB_ROOT_LEVEL) { dprintf("suspending at first available bookmark " "%llx/%llx/%llx/%llx\n", @@ -3029,9 +3034,21 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) break; } - zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; " - "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name, - (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); + if (error == EAGAIN) { + dsl_scan_check_suspend(scn, NULL); + error = 0; + + zfs_dbgmsg("waiting for ddt to become ready for scan " + "on %s with class_max = %u; suspending=%u", + 
scn->scn_dp->dp_spa->spa_name, + (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_suspending); + } else + zfs_dbgmsg("scanned %llu ddt entries on %s with " + "class_max = %u; suspending=%u", (longlong_t)n, + scn->scn_dp->dp_spa->spa_name, + (int)scn->scn_phys.scn_ddt_class_max, + (int)scn->scn_suspending); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh index a96a38ff178a..474f41eae8f3 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh @@ -95,6 +95,10 @@ while (( i < 16384 )); do done ((i += 1)) done + +# Force the DDT logs to disk with a scrub so they can be prefetched +log_must zpool scrub -w $TESTPOOL + log_note "Dataset generation completed." typeset -A generated From 0d2707815d34177ffa79e3c78512bb1d4237b1ad Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 25 Sep 2023 11:02:46 +1000 Subject: [PATCH 30/65] ddt: lookup and log stats Adds per-DDT stats counting lookups and where they were serviced from (either log or backing zap), number of log entries in memory, and flow rates. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15895 --- include/sys/ddt.h | 2 + module/zfs/ddt.c | 163 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 159 insertions(+), 6 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index a7920e658062..93abad85af44 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -296,6 +296,8 @@ typedef struct { uint64_t ddt_flush_force_txg; /* flush hard before this txg */ + kstat_t *ddt_ksp; /* kstats context */ + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ spa_t *ddt_spa; /* pool this ddt is on */ objset_t *ddt_os; /* ddt objset (always MOS) */ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 051005f137bd..bd1941f43adf 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -271,6 +271,78 @@ static const uint64_t ddt_version_flags[] = { /* Dummy version to signal that configure is still necessary */ #define DDT_VERSION_UNCONFIGURED (UINT64_MAX) +#ifdef _KERNEL +/* per-DDT kstats */ +typedef struct { + /* total lookups and whether they returned new or existing entries */ + kstat_named_t dds_lookup; + kstat_named_t dds_lookup_new; + kstat_named_t dds_lookup_existing; + + /* entries found on live tree, and if we had to wait for load */ + kstat_named_t dds_lookup_live_hit; + kstat_named_t dds_lookup_live_wait; + kstat_named_t dds_lookup_live_miss; + + /* entries found on log trees */ + kstat_named_t dds_lookup_log_hit; + kstat_named_t dds_lookup_log_active_hit; + kstat_named_t dds_lookup_log_flushing_hit; + kstat_named_t dds_lookup_log_miss; + + /* entries found on store objects */ + kstat_named_t dds_lookup_stored_hit; + kstat_named_t dds_lookup_stored_miss; + + /* number of entries on log trees */ + kstat_named_t dds_log_active_entries; + kstat_named_t dds_log_flushing_entries; + + /* avg updated/flushed entries per txg */ + kstat_named_t dds_log_ingest_rate; + kstat_named_t dds_log_flush_rate; + kstat_named_t dds_log_flush_time_rate; +} ddt_kstats_t; + +static const ddt_kstats_t ddt_kstats_template = { + { "lookup", KSTAT_DATA_UINT64 }, + { "lookup_new", KSTAT_DATA_UINT64 }, + { 
"lookup_existing", KSTAT_DATA_UINT64 }, + { "lookup_live_hit", KSTAT_DATA_UINT64 }, + { "lookup_live_wait", KSTAT_DATA_UINT64 }, + { "lookup_live_miss", KSTAT_DATA_UINT64 }, + { "lookup_log_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_active_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 }, + { "lookup_log_miss", KSTAT_DATA_UINT64 }, + { "lookup_stored_hit", KSTAT_DATA_UINT64 }, + { "lookup_stored_miss", KSTAT_DATA_UINT64 }, + { "log_active_entries", KSTAT_DATA_UINT64 }, + { "log_flushing_entries", KSTAT_DATA_UINT64 }, + { "log_ingest_rate", KSTAT_DATA_UINT32 }, + { "log_flush_rate", KSTAT_DATA_UINT32 }, + { "log_flush_time_rate", KSTAT_DATA_UINT32 }, +}; + +#define _DDT_KSTAT_STAT(ddt, stat) \ + &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 +#define DDT_KSTAT_BUMP(ddt, stat) \ + do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0) +#define DDT_KSTAT_ADD(ddt, stat, val) \ + do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_SUB(ddt, stat, val) \ + do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_SET(ddt, stat, val) \ + do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) +#define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0) +#else +#define DDT_KSTAT_BUMP(ddt, stat) do {} while (0) +#define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_SET(ddt, stat, val) do {} while (0) +#define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) +#endif /* _KERNEL */ + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -969,6 +1041,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); } + DDT_KSTAT_BUMP(ddt, dds_lookup); + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -979,11 +1053,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) return (NULL); /* If it's already loaded, we can just return it. */ + DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); if (dde->dde_flags & DDE_FLAG_LOADED) return (dde); /* Someone else is loading it, wait for it. */ dde->dde_waiters++; + DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait); while (!(dde->dde_flags & DDE_FLAG_LOADED)) cv_wait(&dde->dde_cv, &ddt->ddt_lock); dde->dde_waiters--; @@ -997,8 +1073,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) return (NULL); } + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); return (dde); - } + } else + DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); /* Time to make a new entry. 
*/ dde = ddt_alloc(ddt, &search); @@ -1012,11 +1090,19 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* If its in the log tree, we can "load" it from there */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; + boolean_t found = B_FALSE; if (ddt_log_take_key(ddt, ddt->ddt_log_active, - &search, &ddlwe) || - ddt_log_take_key(ddt, ddt->ddt_log_flushing, &search, &ddlwe)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); + found = B_TRUE; + } else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing, + &search, &ddlwe)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); + found = B_TRUE; + } + + if (found) { dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; dde->dde_type = ddlwe.ddlwe_type; @@ -1024,8 +1110,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); + DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); + return (dde); } + + DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); } /* @@ -1069,6 +1160,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); + DDT_KSTAT_BUMP(ddt, dds_lookup_existing); + /* * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to @@ -1085,6 +1179,9 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_histogram_sub_entry(ddt, ddh, &ddlwe); + } else { + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss); + DDT_KSTAT_BUMP(ddt, dds_lookup_new); } /* Entry loaded, everyone can proceed now */ @@ -1317,6 +1414,30 @@ ddt_configure(ddt_t *ddt, boolean_t new) return (0); } +static void +ddt_table_alloc_kstats(ddt_t *ddt) +{ +#ifdef _KERNEL + char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); + char *name = kmem_asprintf("ddt_stats_%s", + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED, + sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (ddt->ddt_ksp != NULL) { + ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP); + memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t)); + ddt->ddt_ksp->ks_data = dds; + kstat_install(ddt->ddt_ksp); + } + + kmem_strfree(name); + kmem_strfree(mod); +#else + (void) ddt; +#endif /* _KERNEL */ +} + static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { @@ -1336,6 +1457,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_version = DDT_VERSION_UNCONFIGURED; ddt_log_alloc(ddt); + ddt_table_alloc_kstats(ddt); return (ddt); } @@ -1343,6 +1465,14 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { +#ifdef _KERNEL + if (ddt->ddt_ksp != NULL) { + kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); + ddt->ddt_ksp->ks_data = NULL; + kstat_delete(ddt->ddt_ksp); + } +#endif /* _KERNEL */ + ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); @@ -1400,6 +1530,11 @@ ddt_load(spa_t *spa) if (error != 0 && error != ENOENT) return (error); + DDT_KSTAT_SET(ddt, dds_log_active_entries, + avl_numnodes(&ddt->ddt_log_active->ddl_tree)); + DDT_KSTAT_SET(ddt, dds_log_flushing_entries, + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); + /* * Seed the cached histograms. 
*/ @@ -1860,12 +1995,15 @@ ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx) if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { /* We emptied it, so truncate on-disk */ + DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); ddt_log_truncate(ddt, tx); /* No more passes needed this txg */ ddt->ddt_flush_pass = 0; - } else + } else { /* More to do next time, save checkpoint */ + DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); ddt_log_checkpoint(ddt, &ddlwe, tx); + } ddt_sync_update_stats(ddt, tx); @@ -1928,7 +2066,11 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) * No more to flush, and the active list has stuff, so * try to swap the logs for next time. */ - (void) ddt_log_swap(ddt, tx); + if (ddt_log_swap(ddt, tx)) { + DDT_KSTAT_ZERO(ddt, dds_log_active_entries); + DDT_KSTAT_SET(ddt, dds_log_flushing_entries, + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); + } } /* If force flush is no longer necessary, turn it off. */ @@ -1941,6 +2083,7 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) ddt->ddt_log_flush_rate = _ewma( ddt->ddt_flush_count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); /* * Update flush time rate. This is an exponential weighted moving @@ -1950,6 +2093,8 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) ddt->ddt_log_flush_time_rate, ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, + ddt->ddt_log_flush_time_rate); } static void @@ -1975,6 +2120,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) ddt_log_commit(ddt, &dlu); + DDT_KSTAT_SET(ddt, dds_log_active_entries, + avl_numnodes(&ddt->ddt_log_active->ddl_tree)); + /* * Sync the stats for the store objects. Even though we haven't * modified anything on those objects, they're no longer the @@ -1996,7 +2144,7 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } - if (spa_sync_pass(ddt->ddt_spa) == 1) + if (spa_sync_pass(ddt->ddt_spa) == 1) { /* * Update ingest rate. This is an exponential weighted moving * average of the number of entries changed over recent txgs. @@ -2006,6 +2154,9 @@ ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) ddt->ddt_log_ingest_rate = _ewma( count, ddt->ddt_log_ingest_rate, zfs_dedup_log_flush_flow_rate_txgs); + DDT_KSTAT_SET(ddt, dds_log_ingest_rate, + ddt->ddt_log_ingest_rate); + } } static void From a60e15d6b980c7c029c4c3da1f922a39ea24eac5 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Tue, 23 Jul 2024 20:51:01 +0000 Subject: [PATCH 31/65] Man page updates for dmu_ddt_copies Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Allan Jude Closes #15895 --- man/man4/zfs.4 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index aae3d7dfb5f6..075641872675 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -77,6 +77,17 @@ the array is dynamically sized based on total system memory. dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . +.It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint +Controls the number of copies stored for DeDup Table +.Pq DDT +objects. +Reducing the number of copies to 1 from the previous default of 3 +can reduce the write inflation caused by deduplication. +This assumes redundancy for this data is provided by the vdev layer. 
+If the DDT is damaged, space may be leaked
+.Pq not freed
+when the DDT can not report the correct reference count.
+.
 .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
 Limit the amount we can prefetch with one call to this amount in bytes.
 This helps to limit the amount of memory that can be used by prefetching.

From 77a797a3823c12268e7f1d73f5e024a77b2f582a Mon Sep 17 00:00:00 2001
From: shodanshok
Date: Fri, 16 Aug 2024 22:34:07 +0200
Subject: [PATCH 32/65] Enable L2 cache of all (MRU+MFU) metadata but MFU data
 only

`l2arc_mfuonly` was added to avoid wasting L2 ARC on read-once MRU data
and metadata. However it can be useful to cache as much metadata as
possible while, at the same time, restricting data cache to MFU buffers
only.

This patch allows for such behavior by setting `l2arc_mfuonly` to 2
(or higher). The list of possible values is the following:
0: cache both MRU and MFU for both data and metadata;
1: cache only MFU for both data and metadata;
2: cache both MRU and MFU for metadata, but only MFU for data.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Gionatan Danti
Closes #16343
Closes #16402
---
 man/man4/zfs.4   | 14 ++++++++++----
 module/zfs/arc.c | 11 ++++++++---
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 075641872675..2be3a8414aac 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -132,20 +132,26 @@ Controls whether buffers present on special vdevs are eligible for caching
 into L2ARC.
 If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
 .
-.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int
+.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
 Controls whether only MFU metadata and data are cached from ARC into L2ARC.
 This may be desired to avoid wasting space on L2ARC when reading/writing large
 amounts of data that are not expected to be accessed more than once.
 .Pp
-The default is off,
+The default is 0,
 meaning both MRU and MFU data and metadata are cached.
-When turning off this feature, some MRU buffers will still be present
-in ARC and eventually cached on L2ARC.
+When turning off this feature (setting it to 0), some MRU buffers will
+still be present in ARC and eventually cached on L2ARC.
 .No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
 some prefetched buffers will be cached to L2ARC, and those might later
 transition to MRU, in which case the
 .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
 .Pp
+Setting it to 1 means to L2 cache only MFU data and metadata.
+.Pp
+Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
+only MFU data (i.e., MRU data are not cached). This can be the right setting
+to cache as much metadata as possible even with high data turnover.
+.Pp
 Regardless of
 .Sy l2arc_noprefetch ,
 some MFU buffers might be evicted from ARC,
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 78c2cf8ec5c3..3c657c979cdc 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -9158,12 +9158,17 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
-		 * If pass == 1 or 3, we cache MRU metadata and data
-		 * respectively.
+ * pass == 0: MFU meta + * pass == 1: MRU meta + * pass == 2: MFU data + * pass == 3: MRU data */ - if (l2arc_mfuonly) { + if (l2arc_mfuonly == 1) { if (pass == 1 || pass == 3) continue; + } else if (l2arc_mfuonly > 1) { + if (pass == 3) + continue; } uint64_t passed_sz = 0; From 06a7b123acaaedc36926ab45b3cf61396702dc1d Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Mon, 19 Aug 2024 09:42:17 -0700 Subject: [PATCH 33/65] Skip ro check for snaps when multi-mount Skip ro check for snapshots since they are always ro regardless if ro flag is passed by mount or not. This allows multi-mounting snapshots without requiring to specify ro flag. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Chunwei Chen Closes #16299 --- module/os/linux/zfs/zpl_super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c index d98d32c1f9fb..0a82b8858eb8 100644 --- a/module/os/linux/zfs/zpl_super.c +++ b/module/os/linux/zfs/zpl_super.c @@ -292,6 +292,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) { struct super_block *s; objset_t *os; + boolean_t issnap = B_FALSE; int err; err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); @@ -323,6 +324,7 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) if (zpl_enter(zfsvfs, FTAG) == 0) { if (os != zfsvfs->z_os) err = -SET_ERROR(EBUSY); + issnap = zfsvfs->z_issnap; zpl_exit(zfsvfs, FTAG); } else { err = -SET_ERROR(EBUSY); @@ -346,7 +348,11 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) return (ERR_PTR(err)); } s->s_flags |= SB_ACTIVE; - } else if ((flags ^ s->s_flags) & SB_RDONLY) { + } else if (!issnap && ((flags ^ s->s_flags) & SB_RDONLY)) { + /* + * Skip ro check for snap since snap is always ro regardless + * ro flag is passed by mount or not. + */ deactivate_locked_super(s); return (ERR_PTR(-EBUSY)); } From f0ad031cd9236e0b8d9a42ea6b61c14a512a9b70 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 7 May 2024 10:18:22 +1000 Subject: [PATCH 34/65] spl-generic: bring up kstats subsystem before taskq For spl-taskq to use the kstats infrastructure, it has to be available first. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. 
Sponsored-by: Syneto Closes #16171 --- module/os/linux/spl/spl-generic.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index 986db1518456..6ee0236d289a 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -868,16 +868,16 @@ spl_init(void) if ((rc = spl_tsd_init())) goto out2; - if ((rc = spl_taskq_init())) + if ((rc = spl_proc_init())) goto out3; - if ((rc = spl_kmem_cache_init())) + if ((rc = spl_kstat_init())) goto out4; - if ((rc = spl_proc_init())) + if ((rc = spl_taskq_init())) goto out5; - if ((rc = spl_kstat_init())) + if ((rc = spl_kmem_cache_init())) goto out6; if ((rc = spl_zlib_init())) @@ -891,13 +891,13 @@ spl_init(void) out8: spl_zlib_fini(); out7: - spl_kstat_fini(); + spl_kmem_cache_fini(); out6: - spl_proc_fini(); + spl_taskq_fini(); out5: - spl_kmem_cache_fini(); + spl_kstat_fini(); out4: - spl_taskq_fini(); + spl_proc_fini(); out3: spl_tsd_fini(); out2: @@ -913,10 +913,10 @@ spl_fini(void) { spl_zone_fini(); spl_zlib_fini(); - spl_kstat_fini(); - spl_proc_fini(); spl_kmem_cache_fini(); spl_taskq_fini(); + spl_kstat_fini(); + spl_proc_fini(); spl_tsd_fini(); spl_kvmem_fini(); spl_random_fini(); From db40fe4cf6254e59459c7c9969a204c540523192 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 3 May 2024 14:42:51 +1000 Subject: [PATCH 35/65] spl-taskq: per-taskq kstats This exposes a variety of per-taskq stats under /proc/spl/kstat/taskq, one file per taskq, named for the taskq name.instance. These include a small amount of info about the taskq config, the current state of the threads and queues, and various counters for thread and queue activity since the taskq was created. To assist with decrementing queue size counters, the list an entry is on is encoded in spare bits in the entry flags. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Syneto Closes #16171 --- include/os/linux/spl/sys/taskq.h | 42 +++++ module/os/linux/spl/spl-taskq.c | 314 +++++++++++++++++++++++++++++-- 2 files changed, 342 insertions(+), 14 deletions(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index b73dab631e04..8051de36ba82 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -20,6 +20,10 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ +/* + * Copyright (c) 2024, Klara Inc. 
+ * Copyright (c) 2024, Syneto + */ #ifndef _SPL_TASKQ_H #define _SPL_TASKQ_H @@ -33,6 +37,9 @@ #include #include #include +#include + +typedef struct kstat_s kstat_t; #define TASKQ_NAMELEN 31 @@ -74,6 +81,32 @@ typedef enum tq_lock_role { typedef unsigned long taskqid_t; typedef void (task_func_t)(void *); +typedef struct taskq_sums { + /* gauges (inc/dec counters, current value) */ + wmsum_t tqs_threads_active; /* threads running a task */ + wmsum_t tqs_threads_idle; /* threads waiting for work */ + wmsum_t tqs_threads_total; /* total threads */ + wmsum_t tqs_tasks_pending; /* tasks waiting to execute */ + wmsum_t tqs_tasks_priority; /* hi-pri tasks waiting */ + wmsum_t tqs_tasks_total; /* total waiting tasks */ + wmsum_t tqs_tasks_delayed; /* tasks deferred to future */ + wmsum_t tqs_entries_free; /* task entries on free list */ + + /* counters (inc only, since taskq creation) */ + wmsum_t tqs_threads_created; /* threads created */ + wmsum_t tqs_threads_destroyed; /* threads destroyed */ + wmsum_t tqs_tasks_dispatched; /* tasks dispatched */ + wmsum_t tqs_tasks_dispatched_delayed; /* tasks delayed to future */ + wmsum_t tqs_tasks_executed_normal; /* normal pri tasks executed */ + wmsum_t tqs_tasks_executed_priority; /* high pri tasks executed */ + wmsum_t tqs_tasks_executed; /* total tasks executed */ + wmsum_t tqs_tasks_delayed_requeued; /* delayed tasks requeued */ + wmsum_t tqs_tasks_cancelled; /* tasks cancelled before run */ + wmsum_t tqs_thread_wakeups; /* total thread wakeups */ + wmsum_t tqs_thread_wakeups_nowork; /* thread woken but no tasks */ + wmsum_t tqs_thread_sleeps; /* total thread sleeps */ +} taskq_sums_t; + typedef struct taskq { spinlock_t tq_lock; /* protects taskq_t */ char *tq_name; /* taskq name */ @@ -105,6 +138,8 @@ typedef struct taskq { struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; unsigned long lastspawnstop; /* when to purge dynamic */ + taskq_sums_t tq_sums; + kstat_t *tq_ksp; } taskq_t; typedef struct taskq_ent { @@ -123,6 +158,13 @@ typedef struct taskq_ent { #define TQENT_FLAG_PREALLOC 0x1 #define TQENT_FLAG_CANCEL 0x2 +/* bits 2-3 are which list tqent is on */ +#define TQENT_LIST_NONE 0x0 +#define TQENT_LIST_PENDING 0x4 +#define TQENT_LIST_PRIORITY 0x8 +#define TQENT_LIST_DELAY 0xc +#define TQENT_LIST_MASK 0xc + typedef struct taskq_thread { struct list_head tqt_thread_list; struct list_head tqt_active_list; diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index e7b812c3b5b5..61012bfb36d3 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -22,16 +22,98 @@ * * Solaris Porting Layer (SPL) Task Queue Implementation. */ +/* + * Copyright (c) 2024, Klara Inc. 
+ * Copyright (c) 2024, Syneto + */ #include #include #include #include #include +#include +#include +#include #ifdef HAVE_CPU_HOTPLUG #include #endif +typedef struct taskq_kstats { + /* static values, for completeness */ + kstat_named_t tqks_threads_max; + kstat_named_t tqks_entry_pool_min; + kstat_named_t tqks_entry_pool_max; + + /* gauges (inc/dec counters, current value) */ + kstat_named_t tqks_threads_active; + kstat_named_t tqks_threads_idle; + kstat_named_t tqks_threads_total; + kstat_named_t tqks_tasks_pending; + kstat_named_t tqks_tasks_priority; + kstat_named_t tqks_tasks_total; + kstat_named_t tqks_tasks_delayed; + kstat_named_t tqks_entries_free; + + /* counters (inc only, since taskq creation) */ + kstat_named_t tqks_threads_created; + kstat_named_t tqks_threads_destroyed; + kstat_named_t tqks_tasks_dispatched; + kstat_named_t tqks_tasks_dispatched_delayed; + kstat_named_t tqks_tasks_executed_normal; + kstat_named_t tqks_tasks_executed_priority; + kstat_named_t tqks_tasks_executed; + kstat_named_t tqks_tasks_delayed_requeued; + kstat_named_t tqks_tasks_cancelled; + kstat_named_t tqks_thread_wakeups; + kstat_named_t tqks_thread_wakeups_nowork; + kstat_named_t tqks_thread_sleeps; +} taskq_kstats_t; + +static taskq_kstats_t taskq_kstats_template = { + { "threads_max", KSTAT_DATA_UINT64 }, + { "entry_pool_min", KSTAT_DATA_UINT64 }, + { "entry_pool_max", KSTAT_DATA_UINT64 }, + { "threads_active", KSTAT_DATA_UINT64 }, + { "threads_idle", KSTAT_DATA_UINT64 }, + { "threads_total", KSTAT_DATA_UINT64 }, + { "tasks_pending", KSTAT_DATA_UINT64 }, + { "tasks_priority", KSTAT_DATA_UINT64 }, + { "tasks_total", KSTAT_DATA_UINT64 }, + { "tasks_delayed", KSTAT_DATA_UINT64 }, + { "entries_free", KSTAT_DATA_UINT64 }, + + { "threads_created", KSTAT_DATA_UINT64 }, + { "threads_destroyed", KSTAT_DATA_UINT64 }, + { "tasks_dispatched", KSTAT_DATA_UINT64 }, + { "tasks_dispatched_delayed", KSTAT_DATA_UINT64 }, + { "tasks_executed_normal", KSTAT_DATA_UINT64 }, + { "tasks_executed_priority", KSTAT_DATA_UINT64 }, + { "tasks_executed", KSTAT_DATA_UINT64 }, + { "tasks_delayed_requeued", KSTAT_DATA_UINT64 }, + { "tasks_cancelled", KSTAT_DATA_UINT64 }, + { "thread_wakeups", KSTAT_DATA_UINT64 }, + { "thread_wakeups_nowork", KSTAT_DATA_UINT64 }, + { "thread_sleeps", KSTAT_DATA_UINT64 }, +}; + +#define TQSTAT_INC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, 1) +#define TQSTAT_DEC(tq, stat) wmsum_add(&tq->tq_sums.tqs_##stat, -1) + +#define _TQSTAT_MOD_LIST(mod, tq, t) do { \ + switch (t->tqent_flags & TQENT_LIST_MASK) { \ + case TQENT_LIST_NONE: ASSERT(list_empty(&t->tqent_list)); break;\ + case TQENT_LIST_PENDING: mod(tq, tasks_pending); break; \ + case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break; \ + case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break; \ + } \ +} while (0) +#define TQSTAT_INC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_INC, tq, t) +#define TQSTAT_DEC_LIST(tq, t) _TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t) + +#define TQENT_SET_LIST(t, l) \ + t->tqent_flags = (t->tqent_flags & ~TQENT_LIST_MASK) | l; + static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); @@ -134,6 +216,7 @@ task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) ASSERT(!timer_pending(&t->tqent_timer)); list_del_init(&t->tqent_list); + TQSTAT_DEC(tq, entries_free); return (t); } @@ -204,12 +287,11 @@ task_done(taskq_t *tq, taskq_ent_t *t) { ASSERT(tq); ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); /* Wake tasks blocked 
in taskq_wait_id() */ wake_up_all(&t->tqent_waitq); - list_del_init(&t->tqent_list); - if (tq->tq_nalloc <= tq->tq_minalloc) { t->tqent_id = TASKQID_INVALID; t->tqent_func = NULL; @@ -217,6 +299,7 @@ task_done(taskq_t *tq, taskq_ent_t *t) t->tqent_flags = 0; list_add_tail(&t->tqent_list, &tq->tq_free_list); + TQSTAT_INC(tq, entries_free); } else { task_free(tq, t); } @@ -263,6 +346,8 @@ task_expire_impl(taskq_ent_t *t) spin_unlock_irqrestore(&tq->tq_lock, flags); wake_up(&tq->tq_work_waitq); + + TQSTAT_INC(tq, tasks_delayed_requeued); } static void @@ -534,7 +619,10 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) t = taskq_find(tq, id); if (t && t != ERR_PTR(-EBUSY)) { list_del_init(&t->tqent_list); + TQSTAT_DEC_LIST(tq, t); + t->tqent_flags |= TQENT_FLAG_CANCEL; + TQSTAT_INC(tq, tasks_cancelled); /* * When canceling the lowest outstanding task id we @@ -604,13 +692,19 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_lock(&t->tqent_lock); /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ - if (flags & TQ_NOQUEUE) + if (flags & TQ_NOQUEUE) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add(&t->tqent_list, &tq->tq_prio_list); /* Queue to the priority list instead of the pending list */ - else if (flags & TQ_FRONT) + } else if (flags & TQ_FRONT) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else + } else { + TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); + } + TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; @@ -629,6 +723,8 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) wake_up(&tq->tq_work_waitq); + TQSTAT_INC(tq, tasks_dispatched); + /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -662,6 +758,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, /* Queue to the delay list for subsequent execution */ list_add_tail(&t->tqent_list, &tq->tq_delay_list); + TQENT_SET_LIST(t, TQENT_LIST_DELAY); + TQSTAT_INC_LIST(tq, t); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; @@ -676,6 +774,8 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, spin_unlock(&t->tqent_lock); + TQSTAT_INC(tq, tasks_dispatched_delayed); + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -724,10 +824,15 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, t->tqent_flags |= TQENT_FLAG_PREALLOC; /* Queue to the priority list instead of the pending list */ - if (flags & TQ_FRONT) + if (flags & TQ_FRONT) { + TQENT_SET_LIST(t, TQENT_LIST_PRIORITY); list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else + } else { + TQENT_SET_LIST(t, TQENT_LIST_PENDING); list_add_tail(&t->tqent_list, &tq->tq_pend_list); + } + TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = tq->tq_next_id; tq->tq_next_id++; @@ -742,6 +847,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, wake_up(&tq->tq_work_waitq); + TQSTAT_INC(tq, tasks_dispatched); + /* Spawn additional taskq threads if required. 
*/ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); @@ -908,6 +1015,8 @@ taskq_thread(void *args) wake_up(&tq->tq_wait_waitq); set_current_state(TASK_INTERRUPTIBLE); + TQSTAT_INC(tq, threads_total); + while (!kthread_should_stop()) { if (list_empty(&tq->tq_pend_list) && @@ -919,9 +1028,15 @@ taskq_thread(void *args) add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); + TQSTAT_INC(tq, thread_sleeps); + TQSTAT_INC(tq, threads_idle); + schedule(); seq_tasks = 0; + TQSTAT_DEC(tq, threads_idle); + TQSTAT_INC(tq, thread_wakeups); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); remove_wait_queue(&tq->tq_work_waitq, &wait); @@ -931,6 +1046,8 @@ taskq_thread(void *args) if ((t = taskq_next_ent(tq)) != NULL) { list_del_init(&t->tqent_list); + TQSTAT_DEC_LIST(tq, t); + TQSTAT_DEC(tq, tasks_total); /* * A TQENT_FLAG_PREALLOC task may be reused or freed @@ -955,6 +1072,7 @@ taskq_thread(void *args) tq->tq_nactive++; spin_unlock_irqrestore(&tq->tq_lock, flags); + TQSTAT_INC(tq, threads_active); DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); /* Perform the requested task */ @@ -962,8 +1080,17 @@ taskq_thread(void *args) DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); + TQSTAT_DEC(tq, threads_active); + if ((t->tqent_flags & TQENT_LIST_MASK) == + TQENT_LIST_PENDING) + TQSTAT_INC(tq, tasks_executed_normal); + else + TQSTAT_INC(tq, tasks_executed_priority); + TQSTAT_INC(tq, tasks_executed); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_nactive--; list_del_init(&tqt->tqt_active_list); tqt->tqt_task = NULL; @@ -989,7 +1116,8 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } + } else + TQSTAT_INC(tq, thread_wakeups_nowork); set_current_state(TASK_INTERRUPTIBLE); @@ -998,6 +1126,10 @@ taskq_thread(void *args) __set_current_state(TASK_RUNNING); tq->tq_nthreads--; list_del_init(&tqt->tqt_thread_list); + + TQSTAT_DEC(tq, threads_total); + TQSTAT_INC(tq, threads_destroyed); + error: kmem_free(tqt, sizeof (taskq_thread_t)); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -1037,9 +1169,156 @@ taskq_thread_create(taskq_t *tq) wake_up_process(tqt->tqt_thread); + TQSTAT_INC(tq, threads_created); + return (tqt); } +static void +taskq_stats_init(taskq_t *tq) +{ + taskq_sums_t *tqs = &tq->tq_sums; + wmsum_init(&tqs->tqs_threads_active, 0); + wmsum_init(&tqs->tqs_threads_idle, 0); + wmsum_init(&tqs->tqs_threads_total, 0); + wmsum_init(&tqs->tqs_tasks_pending, 0); + wmsum_init(&tqs->tqs_tasks_priority, 0); + wmsum_init(&tqs->tqs_tasks_total, 0); + wmsum_init(&tqs->tqs_tasks_delayed, 0); + wmsum_init(&tqs->tqs_entries_free, 0); + wmsum_init(&tqs->tqs_threads_created, 0); + wmsum_init(&tqs->tqs_threads_destroyed, 0); + wmsum_init(&tqs->tqs_tasks_dispatched, 0); + wmsum_init(&tqs->tqs_tasks_dispatched_delayed, 0); + wmsum_init(&tqs->tqs_tasks_executed_normal, 0); + wmsum_init(&tqs->tqs_tasks_executed_priority, 0); + wmsum_init(&tqs->tqs_tasks_executed, 0); + wmsum_init(&tqs->tqs_tasks_delayed_requeued, 0); + wmsum_init(&tqs->tqs_tasks_cancelled, 0); + wmsum_init(&tqs->tqs_thread_wakeups, 0); + wmsum_init(&tqs->tqs_thread_wakeups_nowork, 0); + wmsum_init(&tqs->tqs_thread_sleeps, 0); +} + +static void +taskq_stats_fini(taskq_t *tq) +{ + taskq_sums_t *tqs = &tq->tq_sums; + wmsum_fini(&tqs->tqs_threads_active); + wmsum_fini(&tqs->tqs_threads_idle); + wmsum_fini(&tqs->tqs_threads_total); + wmsum_fini(&tqs->tqs_tasks_pending); + 
wmsum_fini(&tqs->tqs_tasks_priority); + wmsum_fini(&tqs->tqs_tasks_total); + wmsum_fini(&tqs->tqs_tasks_delayed); + wmsum_fini(&tqs->tqs_entries_free); + wmsum_fini(&tqs->tqs_threads_created); + wmsum_fini(&tqs->tqs_threads_destroyed); + wmsum_fini(&tqs->tqs_tasks_dispatched); + wmsum_fini(&tqs->tqs_tasks_dispatched_delayed); + wmsum_fini(&tqs->tqs_tasks_executed_normal); + wmsum_fini(&tqs->tqs_tasks_executed_priority); + wmsum_fini(&tqs->tqs_tasks_executed); + wmsum_fini(&tqs->tqs_tasks_delayed_requeued); + wmsum_fini(&tqs->tqs_tasks_cancelled); + wmsum_fini(&tqs->tqs_thread_wakeups); + wmsum_fini(&tqs->tqs_thread_wakeups_nowork); + wmsum_fini(&tqs->tqs_thread_sleeps); +} + +static int +taskq_kstats_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + return (EACCES); + + taskq_t *tq = ksp->ks_private; + taskq_kstats_t *tqks = ksp->ks_data; + + tqks->tqks_threads_max.value.ui64 = tq->tq_maxthreads; + tqks->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc; + tqks->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc; + + taskq_sums_t *tqs = &tq->tq_sums; + + tqks->tqks_threads_active.value.ui64 = + wmsum_value(&tqs->tqs_threads_active); + tqks->tqks_threads_idle.value.ui64 = + wmsum_value(&tqs->tqs_threads_idle); + tqks->tqks_threads_total.value.ui64 = + wmsum_value(&tqs->tqs_threads_total); + tqks->tqks_tasks_pending.value.ui64 = + wmsum_value(&tqs->tqs_tasks_pending); + tqks->tqks_tasks_priority.value.ui64 = + wmsum_value(&tqs->tqs_tasks_priority); + tqks->tqks_tasks_total.value.ui64 = + wmsum_value(&tqs->tqs_tasks_total); + tqks->tqks_tasks_delayed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_delayed); + tqks->tqks_entries_free.value.ui64 = + wmsum_value(&tqs->tqs_entries_free); + tqks->tqks_threads_created.value.ui64 = + wmsum_value(&tqs->tqs_threads_created); + tqks->tqks_threads_destroyed.value.ui64 = + wmsum_value(&tqs->tqs_threads_destroyed); + tqks->tqks_tasks_dispatched.value.ui64 = + wmsum_value(&tqs->tqs_tasks_dispatched); + tqks->tqks_tasks_dispatched_delayed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_dispatched_delayed); + tqks->tqks_tasks_executed_normal.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed_normal); + tqks->tqks_tasks_executed_priority.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed_priority); + tqks->tqks_tasks_executed.value.ui64 = + wmsum_value(&tqs->tqs_tasks_executed); + tqks->tqks_tasks_delayed_requeued.value.ui64 = + wmsum_value(&tqs->tqs_tasks_delayed_requeued); + tqks->tqks_tasks_cancelled.value.ui64 = + wmsum_value(&tqs->tqs_tasks_cancelled); + tqks->tqks_thread_wakeups.value.ui64 = + wmsum_value(&tqs->tqs_thread_wakeups); + tqks->tqks_thread_wakeups_nowork.value.ui64 = + wmsum_value(&tqs->tqs_thread_wakeups_nowork); + tqks->tqks_thread_sleeps.value.ui64 = + wmsum_value(&tqs->tqs_thread_sleeps); + + return (0); +} + +static void +taskq_kstats_init(taskq_t *tq) +{ + char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ + snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance); + + kstat_t *ksp = kstat_create("taskq", 0, name, "misc", + KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + ksp->ks_private = tq; + ksp->ks_update = taskq_kstats_update; + ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP); + memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t)); + kstat_install(ksp); + + tq->tq_ksp = ksp; +} + +static void +taskq_kstats_fini(taskq_t *tq) +{ + if (tq->tq_ksp == NULL) + return; + + kmem_free(tq->tq_ksp->ks_data, 
sizeof (taskq_kstats_t)); + kstat_delete(tq->tq_ksp); + + tq->tq_ksp = NULL; +} + taskq_t * taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) @@ -1104,6 +1383,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, init_waitqueue_head(&tq->tq_wait_waitq); tq->tq_lock_class = TQ_LOCK_GENERAL; INIT_LIST_HEAD(&tq->tq_taskqs); + taskq_stats_init(tq); if (flags & TASKQ_PREPOPULATE) { spin_lock_irqsave_nested(&tq->tq_lock, irqflags, @@ -1137,14 +1417,17 @@ taskq_create(const char *name, int threads_arg, pri_t pri, if (rc) { taskq_destroy(tq); - tq = NULL; - } else { - down_write(&tq_list_sem); - tq->tq_instance = taskq_find_by_name(name) + 1; - list_add_tail(&tq->tq_taskqs, &tq_list); - up_write(&tq_list_sem); + return (NULL); } + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + + /* Install kstats late, because the name includes tq_instance */ + taskq_kstats_init(tq); + return (tq); } EXPORT_SYMBOL(taskq_create); @@ -1177,6 +1460,8 @@ taskq_destroy(taskq_t *tq) taskq_wait(tq); + taskq_kstats_fini(tq); + /* remove taskq from global list used by the kstats */ down_write(&tq_list_sem); list_del(&tq->tq_taskqs); @@ -1230,6 +1515,7 @@ taskq_destroy(taskq_t *tq) spin_unlock_irqrestore(&tq->tq_lock, flags); + taskq_stats_fini(tq); kmem_strfree(tq->tq_name); kmem_free(tq, sizeof (taskq_t)); } From 3f8fd3cae081fc13608e30e25b2e9df73fc59de9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 7 May 2024 10:26:20 +1000 Subject: [PATCH 36/65] spl-taskq: summary stats for all taskqs This adds /proc/spl/kstats/taskq/summary, which attempts to show a useful subset of stats for all taskqs in the system. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Syneto Closes #16171 --- module/os/linux/spl/spl-taskq.c | 98 +++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 61012bfb36d3..29b8f5426502 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -1557,6 +1557,100 @@ taskq_create_synced(const char *name, int nthreads, pri_t pri, } EXPORT_SYMBOL(taskq_create_synced); +static kstat_t *taskq_summary_ksp = NULL; + +static int +spl_taskq_kstat_headers(char *buf, size_t size) +{ + size_t n = snprintf(buf, size, + "%-20s | %-17s | %-23s\n" + "%-20s | %-17s | %-23s\n" + "%-20s | %-17s | %-23s\n", + "", "threads", "tasks on queue", + "taskq name", "tot [act idl] max", " pend [ norm high] dly", + "--------------------", "-----------------", + "-----------------------"); + return (n >= size ? 
ENOMEM : 0); +} + +static int +spl_taskq_kstat_data(char *buf, size_t size, void *data) +{ + struct list_head *tql = NULL; + taskq_t *tq; + char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */ + char threads[25]; + char tasks[30]; + size_t n; + int err = 0; + + down_read(&tq_list_sem); + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + + mutex_enter(tq->tq_ksp->ks_lock); + taskq_kstats_update(tq->tq_ksp, KSTAT_READ); + taskq_kstats_t *tqks = tq->tq_ksp->ks_data; + + snprintf(name, sizeof (name), "%s.%d", tq->tq_name, + tq->tq_instance); + snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu", + tqks->tqks_threads_total.value.ui64, + tqks->tqks_threads_active.value.ui64, + tqks->tqks_threads_idle.value.ui64, + tqks->tqks_threads_max.value.ui64); + snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu", + tqks->tqks_tasks_total.value.ui64, + tqks->tqks_tasks_pending.value.ui64, + tqks->tqks_tasks_priority.value.ui64, + tqks->tqks_tasks_delayed.value.ui64); + + mutex_exit(tq->tq_ksp->ks_lock); + + n = snprintf(buf, size, "%-20s | %-17s | %-23s\n", + name, threads, tasks); + if (n >= size) { + err = ENOMEM; + break; + } + + buf = &buf[n]; + size -= n; + } + + up_read(&tq_list_sem); + + return (err); +} + +static void +spl_taskq_kstat_init(void) +{ + kstat_t *ksp = kstat_create("taskq", 0, "summary", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return; + + ksp->ks_data = (void *)(uintptr_t)1; + ksp->ks_ndata = 1; + kstat_set_raw_ops(ksp, spl_taskq_kstat_headers, + spl_taskq_kstat_data, NULL); + kstat_install(ksp); + + taskq_summary_ksp = ksp; +} + +static void +spl_taskq_kstat_fini(void) +{ + if (taskq_summary_ksp == NULL) + return; + + kstat_delete(taskq_summary_ksp); + taskq_summary_ksp = NULL; +} + static unsigned int spl_taskq_kick = 0; /* @@ -1737,12 +1831,16 @@ spl_taskq_init(void) */ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; + spl_taskq_kstat_init(); + return (0); } void spl_taskq_fini(void) { + spl_taskq_kstat_fini(); + taskq_destroy(dynamic_taskq); dynamic_taskq = NULL; From 816d2b2bfc2591b951f32aeb7c00e14e27ee624c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 7 May 2024 10:17:12 +1000 Subject: [PATCH 37/65] spl-proc: remove old taskq stats These had minimal useful information for the admin, didn't work properly in some places, and knew far too much about taskq internals. With the new stats available, these should never be needed anymore. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Syneto Closes #16171 --- man/man4/spl.4 | 11 -- module/os/linux/spl/spl-proc.c | 268 --------------------------------- 2 files changed, 279 deletions(-) diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 5cc12764e18c..22832c492db8 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -175,17 +175,6 @@ Increasing this value will result in a slower thread creation rate which may be preferable for some configurations. . -.It Sy spl_max_show_tasks Ns = Ns Sy 512 Pq uint -The maximum number of tasks per pending list in each taskq shown in -.Pa /proc/spl/taskq{,-all} . -Write -.Sy 0 -to turn off the limit. -The proc file will walk the lists with lock held, -reading it could cause a lock-up if the list grow too large -without limiting the output. -"(truncated)" will be shown if the list is larger than the limit. -. 
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint Minimum idle threads exit interval for dynamic taskqs. Smaller values allow idle threads exit more often and potentially be diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 2c0cdd9febf5..9fefcd03c410 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -63,8 +62,6 @@ static struct ctl_table_header *spl_kstat = NULL; static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; -static struct proc_dir_entry *proc_spl_taskq_all = NULL; -static struct proc_dir_entry *proc_spl_taskq = NULL; struct proc_dir_entry *proc_spl_kstat = NULL; #ifdef DEBUG_KMEM @@ -177,195 +174,6 @@ proc_dohostid(CONST_CTL_TABLE *table, int write, return (0); } -static void -taskq_seq_show_headers(struct seq_file *f) -{ - seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", - "taskq", "act", "nthr", "spwn", "maxt", "pri", - "mina", "maxa", "cura", "flags"); -} - -/* indices into the lheads array below */ -#define LHEAD_PEND 0 -#define LHEAD_PRIO 1 -#define LHEAD_DELAY 2 -#define LHEAD_WAIT 3 -#define LHEAD_ACTIVE 4 -#define LHEAD_SIZE 5 - -static unsigned int spl_max_show_tasks = 512; -/* CSTYLED */ -module_param(spl_max_show_tasks, uint, 0644); -MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); - -static int -taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) -{ - taskq_t *tq = p; - taskq_thread_t *tqt = NULL; - spl_wait_queue_entry_t *wq; - struct task_struct *tsk; - taskq_ent_t *tqe; - char name[100]; - struct list_head *lheads[LHEAD_SIZE], *lh; - static char *list_names[LHEAD_SIZE] = - {"pend", "prio", "delay", "wait", "active" }; - int i, j, have_lheads = 0; - unsigned long wflags, flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); - - /* get the various lists and check whether they're empty */ - lheads[LHEAD_PEND] = &tq->tq_pend_list; - lheads[LHEAD_PRIO] = &tq->tq_prio_list; - lheads[LHEAD_DELAY] = &tq->tq_delay_list; -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; -#else - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; -#endif - lheads[LHEAD_ACTIVE] = &tq->tq_active_list; - - for (i = 0; i < LHEAD_SIZE; ++i) { - if (list_empty(lheads[i])) - lheads[i] = NULL; - else - ++have_lheads; - } - - /* early return in non-"all" mode if lists are all empty */ - if (!allflag && !have_lheads) { - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - return (0); - } - - /* unlock the waitq quickly */ - if (!lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - - /* show the base taskq contents */ - snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); - seq_printf(f, "%-25s ", name); - seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", - tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, - tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, - tq->tq_nalloc, tq->tq_flags); - - /* show the active list */ - if (lheads[LHEAD_ACTIVE]) { - j = 0; - list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { - if (j == 0) - seq_printf(f, "\t%s:", - list_names[LHEAD_ACTIVE]); - else if (j == 2) { - seq_printf(f, "\n\t "); - j = 0; - } - seq_printf(f, 
" [%d]%pf(%ps)", - tqt->tqt_thread->pid, - tqt->tqt_task->tqent_func, - tqt->tqt_task->tqent_arg); - ++j; - } - seq_printf(f, "\n"); - } - - for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) - if (lheads[i]) { - j = 0; - list_for_each(lh, lheads[i]) { - if (spl_max_show_tasks != 0 && - j >= spl_max_show_tasks) { - seq_printf(f, "\n\t(truncated)"); - break; - } - /* show the wait waitq list */ - if (i == LHEAD_WAIT) { -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - wq = list_entry(lh, - spl_wait_queue_entry_t, entry); -#else - wq = list_entry(lh, - spl_wait_queue_entry_t, task_list); -#endif - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 8 == 0) - seq_printf(f, "\n\t "); - - tsk = wq->private; - seq_printf(f, " %d", tsk->pid); - /* pend, prio and delay lists */ - } else { - tqe = list_entry(lh, taskq_ent_t, - tqent_list); - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 2 == 0) - seq_printf(f, "\n\t "); - - seq_printf(f, " %pf(%ps)", - tqe->tqent_func, - tqe->tqent_arg); - } - ++j; - } - seq_printf(f, "\n"); - } - if (lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (0); -} - -static int -taskq_all_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_TRUE)); -} - -static int -taskq_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_FALSE)); -} - -static void * -taskq_seq_start(struct seq_file *f, loff_t *pos) -{ - struct list_head *p; - loff_t n = *pos; - - down_read(&tq_list_sem); - if (!n) - taskq_seq_show_headers(f); - - p = tq_list.next; - while (n--) { - p = p->next; - if (p == &tq_list) - return (NULL); - } - - return (list_entry(p, taskq_t, tq_taskqs)); -} - -static void * -taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - taskq_t *tq = p; - - ++*pos; - return ((tq->tq_taskqs.next == &tq_list) ? 
- NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); -} - static void slab_seq_show_headers(struct seq_file *f) { @@ -501,66 +309,6 @@ static const kstat_proc_op_t proc_slab_operations = { #endif }; -static void -taskq_seq_stop(struct seq_file *f, void *v) -{ - up_read(&tq_list_sem); -} - -static const struct seq_operations taskq_all_seq_ops = { - .show = taskq_all_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static const struct seq_operations taskq_seq_ops = { - .show = taskq_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static int -proc_taskq_all_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_all_seq_ops)); -} - -static int -proc_taskq_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_seq_ops)); -} - -static const kstat_proc_op_t proc_taskq_all_operations = { -#ifdef HAVE_PROC_OPS_STRUCT - .proc_open = proc_taskq_all_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -#else - .open = proc_taskq_all_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -#endif -}; - -static const kstat_proc_op_t proc_taskq_operations = { -#ifdef HAVE_PROC_OPS_STRUCT - .proc_open = proc_taskq_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = seq_release, -#else - .open = proc_taskq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -#endif -}; - static struct ctl_table spl_kmem_table[] = { #ifdef DEBUG_KMEM { @@ -677,8 +425,6 @@ static void spl_proc_cleanup(void) remove_proc_entry("kstat", proc_spl); remove_proc_entry("slab", proc_spl_kmem); remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); #ifndef HAVE_REGISTER_SYSCTL_TABLE @@ -761,20 +507,6 @@ spl_proc_init(void) goto out; } - proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, - &proc_taskq_all_operations, NULL); - if (proc_spl_taskq_all == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, - &proc_taskq_operations, NULL); - if (proc_spl_taskq == NULL) { - rc = -EUNATCH; - goto out; - } - proc_spl_kmem = proc_mkdir("kmem", proc_spl); if (proc_spl_kmem == NULL) { rc = -EUNATCH; From 8e6a9aabb1e4038b1893d5eba5ebc2318988bd9c Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 20 Aug 2024 01:30:57 +0500 Subject: [PATCH 38/65] linux/zvol_os.c: Fix max_discard_sectors limit for 6.8+ kernel In kernels 6.8 and later, the zvol block device is allocated with qlimits passed during initialization. However, the zvol driver does not set `max_hw_discard_sectors`, which is necessary to properly initialize `max_discard_sectors`. This causes the `zvol_misc_trim` test to fail on 6.8+ kernels when invoking the `blkdiscard` command. Setting `max_hw_discard_sectors` in the `HAVE_BLK_ALLOC_DISK_2ARG` case resolve the issue. 
Reviewed-by: Brian Behlendorf
Reviewed-by: Tony Hutter
Reviewed-by: Rob Norris
Signed-off-by: Ameer Hamza
Closes #16462
---
 module/os/linux/zfs/zvol_os.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index e04f64e232a6..1ac079cc686d 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1213,6 +1213,7 @@ zvol_queue_limits_convert(zvol_queue_limits_t *limits,
 	qlimits->io_opt = limits->zql_io_opt;
 	qlimits->physical_block_size = limits->zql_physical_block_size;
 	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
 	qlimits->discard_granularity = limits->zql_discard_granularity;
 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
 	qlimits->features =

From a2c4e95cfdf60e8350884ff77a0df00d5ecdd275 Mon Sep 17 00:00:00 2001
From: Ameer Hamza
Date: Tue, 20 Aug 2024 18:45:26 +0500
Subject: [PATCH 39/65] linux/zvol_os.c: cleanup limits for non-blk mq case

Rob Norris suggested that we could clean up redundant limits for the
non-blk mq case.

Reviewed-by: Brian Behlendorf
Reviewed-by: Tony Hutter
Reviewed-by: Rob Norris
Signed-off-by: Ameer Hamza
Closes #16462
---
 module/os/linux/zfs/zvol_os.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 1ac079cc686d..d1e3061b50e6 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1252,7 +1252,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
@@ -1266,10 +1265,6 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
 
-#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
-	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
-#endif
-
 #else
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)

From bbe8512a93b0078c43fb5aa6f265059376647bc7 Mon Sep 17 00:00:00 2001
From: shodanshok
Date: Wed, 21 Aug 2024 19:00:33 +0200
Subject: [PATCH 40/65] Ignore zfs_arc_shrinker_limit in direct reclaim mode

zfs_arc_shrinker_limit (default: 10000) avoids ARC collapse due to
excessive memory reclaim. However, when the kernel is in direct reclaim
mode (i.e., low on memory), limiting ARC reclaim increases OOM risk.
This is especially true on systems without (or with inadequate) swap.

This patch ignores zfs_arc_shrinker_limit when the kernel is in direct
reclaim mode, avoiding most OOM. It also restores
"echo 3 > /proc/sys/vm/drop_caches" ability to correctly drop (almost)
all ARC.

Reviewed-by: Brian Behlendorf
Reviewed-by: Adam Moss
Signed-off-by: Gionatan Danti
Closes #16313
---
 man/man4/zfs.4               | 1 +
 module/os/linux/zfs/arc_os.c | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index 2be3a8414aac..20bb95c1aeea 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -838,6 +838,7 @@ This is a limit on how many pages the ARC shrinker makes available for
 eviction in response to one page allocation attempt.
 Note that in practice, the kernel's shrinker can ask us to evict
 up to about four times this for one allocation attempt.
+To reduce OOM risk, this limit is applied for kswapd reclaims only.
.Pp The default limit of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 75a9ea53225e..c6b9cb2ddb3f 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -201,9 +201,9 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) * See also the comment above zfs_arc_shrinker_limit. */ int64_t can_free = btop(arc_evictable_memory()); - int64_t limit = zfs_arc_shrinker_limit != 0 ? - zfs_arc_shrinker_limit : INT64_MAX; - return (MIN(can_free, limit)); + if (current_is_kswapd() && zfs_arc_shrinker_limit) + can_free = MIN(can_free, zfs_arc_shrinker_limit); + return (can_free); } static unsigned long From b3f4e4e1ec930be85ebdf3c7d23f0be23800491c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 7 Jun 2024 18:00:31 +1000 Subject: [PATCH 41/65] abd: remove ABD_FLAG_ZEROS Nothing ever checks it. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- include/sys/abd.h | 3 +-- module/os/freebsd/zfs/abd_os.c | 2 +- module/os/linux/zfs/abd_os.c | 2 +- module/zfs/abd.c | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 7b7d84b528cd..daa247e0cb19 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -44,8 +44,7 @@ typedef enum abd_flags { ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ - ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ - ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */ + ABD_FLAG_ALLOCD = 1 << 8, /* we allocated the abd_t */ } abd_flags_t; typedef struct abd { diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index fb5c46ecf7c2..ce8c30025f3d 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -250,7 +250,7 @@ abd_alloc_zero_scatter(void) n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; ABD_SCATTER(abd_zero_scatter).abd_offset = 0; diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index f7af20c619a4..c4cc2ce01d61 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -509,7 +509,7 @@ abd_alloc_zero_scatter(void) ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { sg_set_page(sg, abd_zero_page, PAGESIZE, 0); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 94f492522f0d..f1df6082f045 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -113,7 +113,7 @@ abd_verify(abd_t *abd) ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ALLOCD)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & 
ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { From 2b7d9a786346f70799fdc043f2455b870e924330 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 4 Jun 2024 13:13:05 -0400 Subject: [PATCH 42/65] zio: no alloc canary in userspace Makes it harder to use memory debuggers like valgrind directly, because they can't see canary overruns. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- module/zfs/zio.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1f3acb9b921e..73252c2da970 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -299,10 +299,13 @@ zio_fini(void) * ========================================================================== */ -#ifdef ZFS_DEBUG -static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#if defined(ZFS_DEBUG) && defined(_KERNEL) +#define ZFS_ZIO_BUF_CANARY 1 #endif +#ifdef ZFS_ZIO_BUF_CANARY +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; + /* * Use empty space after the buffer to detect overflows. * @@ -314,7 +317,6 @@ static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; static void zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { -#ifdef ZFS_DEBUG size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; @@ -323,13 +325,11 @@ zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) asize = (c + 2) << SPA_MINBLOCKSHIFT; for (; off < asize; canary++, off += sizeof (ulong_t)) *canary = zio_buf_canary; -#endif } static void zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) { -#ifdef ZFS_DEBUG size_t off = P2ROUNDUP(size, sizeof (ulong_t)); ulong_t *canary = p + off / sizeof (ulong_t); size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; @@ -343,8 +343,8 @@ zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) *canary, zio_buf_canary); } } -#endif } +#endif /* * Use zio_buf_alloc to allocate ZFS metadata. 
This data will appear in a @@ -363,7 +363,9 @@ zio_buf_alloc(size_t size) #endif void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_buf_cache, c); +#endif return (p); } @@ -381,7 +383,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_put_canary(p, size, zio_data_buf_cache, c); +#endif return (p); } @@ -395,7 +399,9 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_buf_cache, c); +#endif kmem_cache_free(zio_buf_cache[c], buf); } @@ -406,7 +412,9 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#ifdef ZFS_ZIO_BUF_CANARY zio_buf_check_canary(buf, size, zio_data_buf_cache, c); +#endif kmem_cache_free(zio_data_buf_cache[c], buf); } From 7a5b4355e2e3b3cdedcc75300323db35c98e78df Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 25 Dec 2023 22:25:48 +1100 Subject: [PATCH 43/65] abd_os: split userspace and Linux kernel code The Linux abd_os.c serves double-duty as the userspace scatter abd implementation, by carrying an emulation of kernel scatterlists. This commit lifts common and userspace-specific parts out into a separate abd_os.c for libzpool. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- include/sys/abd.h | 2 + lib/libzpool/Makefile.am | 2 +- lib/libzpool/abd_os.c | 492 +++++++++++++++++++++++++++++++++++ module/os/linux/zfs/abd_os.c | 151 +---------- 4 files changed, 498 insertions(+), 149 deletions(-) create mode 100644 lib/libzpool/abd_os.c diff --git a/include/sys/abd.h b/include/sys/abd.h index daa247e0cb19..ed008465c891 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -68,7 +68,9 @@ typedef struct abd { } abd_scatter; struct abd_linear { void *abd_buf; +#if defined(__linux__) && defined(_KERNEL) struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ +#endif } abd_linear; struct abd_gang { list_t abd_gang_chain; diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 070dc0132f2f..eb0dd0ace1fc 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -9,6 +9,7 @@ lib_LTLIBRARIES += libzpool.la CPPCHECKTARGETS += libzpool.la dist_libzpool_la_SOURCES = \ + %D%/abd_os.c \ %D%/kernel.c \ %D%/taskq.c \ %D%/util.c @@ -39,7 +40,6 @@ nodist_libzpool_la_SOURCES = \ module/lua/lvm.c \ module/lua/lzio.c \ \ - module/os/linux/zfs/abd_os.c \ module/os/linux/zfs/arc_os.c \ module/os/linux/zfs/trace.c \ module/os/linux/zfs/vdev_file.c \ diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c new file mode 100644 index 000000000000..de93f99a556a --- /dev/null +++ b/lib/libzpool/abd_os.c @@ -0,0 +1,492 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we are using HIGHMEM (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) If we are not using HIGHMEM, then all physical memory is always + * mapped into the kernel's address space, so we also avoid the map / + * unmap costs on each ABD access. + * + * If we are not using HIGHMEM, scattered buffers which have only one chunk + * can be treated as linear buffers, because they are contiguous in the + * kernel's virtual address space. See abd_alloc_chunks() for details. + */ + +#include +#include +#include +#include +#include +#include + + +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +/* + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's. Smaller allocations will use linear ABD's which uses + * zio_[data_]buf_alloc(). + * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. This can + * improve memory efficiency, but it also makes it much harder for ARC + * evictions to actually free pages, because all the buffers on one slab need + * to be freed in order for the slab (and underlying pages) to be freed. + * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's + * possible for them to actually waste more memory than scatter (one page per + * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). + * + * Spill blocks are typically 512B and are heavily used on systems running + * selinux with the default dnode size and the `xattr=sa` property set. + * + * By default we use linear allocations for 512B and 1KB, and scatter + * allocations for larger (1.5KB and up). + */ +static int zfs_abd_scatter_min_size = 512 * 3; + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are + * just a single zero'd page. This allows us to conserve memory by + * only using a single zero page for the scatterlist. 
+ */ +abd_t *abd_zero_scatter = NULL; + +struct page; +/* + * abd_zero_page will be allocated with a zero'ed PAGESIZE buffer, which is + * assigned to each of the pages of abd_zero_scatter. + */ +static struct page *abd_zero_page = NULL; + +static kmem_cache_t *abd_cache = NULL; + +static uint_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +abd_t * +abd_alloc_struct_impl(size_t size) +{ + /* + * In Linux we do not use the size passed in during ABD + * allocation, so we just ignore it. + */ + (void) size; + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + + return (abd); +} + +void +abd_free_struct_impl(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); +} + +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +/* + * This must be called if any of the sg_table allocation functions + * are called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +void +abd_free_chunks(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + struct page *p = nth_page(sg_page(sg), 0); + umem_free_aligned(p, PAGESIZE); + } + abd_free_sg_table(abd); +} + +static void +abd_alloc_zero_scatter(void) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct scatterlist *sg; + int i; + + abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + memset(abd_zero_page, 0, PAGESIZE); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + + sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, abd_zero_page, PAGESIZE, 0); + } +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (!zfs_abd_scatter_enabled || size < 
zfs_abd_scatter_min_size); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; + if (op == ABDSTAT_INCR) { + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + (void) abd; + (void) op; + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +static void +abd_free_zero_scatter(void) +{ + abd_free_sg_table(abd_zero_scatter); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + ASSERT3P(abd_zero_page, !=, NULL); + umem_free_aligned(abd_zero_page, PAGESIZE); +} + +void +abd_init(void) +{ + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +void +abd_free_linear_page(abd_t *abd) +{ + (void) abd; + __builtin_unreachable(); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, + size_t size) +{ + (void) size; + int i = 0; + struct scatterlist *sg = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + if (abd == NULL) + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + + return (abd); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); + aiter->iter_abd = abd; + if (!abd_is_linear(abd)) { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. 
+ */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = sg_page(aiter->iter_sg); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ +} diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index c4cc2ce01d61..60287ccdda98 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -58,22 +58,16 @@ #include #include #include -#ifdef _KERNEL #include #include #include #include -#endif -#ifdef _KERNEL #if defined(MAX_ORDER) #define ABD_MAX_ORDER (MAX_ORDER) #elif defined(MAX_PAGE_ORDER) #define ABD_MAX_ORDER (MAX_PAGE_ORDER) #endif -#else -#define ABD_MAX_ORDER (1) -#endif typedef struct abd_stats { kstat_named_t abdstat_struct_size; @@ -193,11 +187,9 @@ abd_t *abd_zero_scatter = NULL; struct page; /* - * _KERNEL - Will point to ZERO_PAGE if it is available or it will be - * an allocated zero'd PAGESIZE buffer. - * Userspace - Will be an allocated zero'ed PAGESIZE buffer. - * - * abd_zero_page is assigned to each of the pages of abd_zero_scatter. + * abd_zero_page is assigned to each of the pages of abd_zero_scatter. 
It will + * point to ZERO_PAGE if it is available or it will be an allocated zero'd + * PAGESIZE buffer. */ static struct page *abd_zero_page = NULL; @@ -232,7 +224,6 @@ abd_free_struct_impl(abd_t *abd) ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); } -#ifdef _KERNEL static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; /* @@ -520,134 +511,6 @@ abd_alloc_zero_scatter(void) ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); } -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -#define zfs_kmap_local(chunk) ((void *)chunk) -#define zfs_kunmap_local(addr) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -/* - * This must be called if any of the sg_table allocation functions - * are called. - */ -static void -abd_free_sg_table(abd_t *abd) -{ - int nents = ABD_SCATTER(abd).abd_nents; - vmem_free(ABD_SCATTER(abd).abd_sgl, - nents * sizeof (struct scatterlist)); -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -void -abd_alloc_chunks(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); - } - ABD_SCATTER(abd).abd_nents = nr_pages; -} - -void -abd_free_chunks(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - struct page *p = nth_page(sg_page(sg), 0); - umem_free_aligned(p, PAGESIZE); - } - abd_free_sg_table(abd); -} - -static void -abd_alloc_zero_scatter(void) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - struct scatterlist *sg; - int i; - - abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - memset(abd_zero_page, 0, PAGESIZE); - abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; - ABD_SCATTER(abd_zero_scatter).abd_offset = 0; - ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; - abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - - sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); - - abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { - sg_set_page(sg, abd_zero_page, PAGESIZE, 0); - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - 
ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); -} - -#endif /* _KERNEL */ - boolean_t abd_size_alloc_linear(size_t size) { @@ -712,14 +575,10 @@ abd_free_zero_scatter(void) abd_free_struct(abd_zero_scatter); abd_zero_scatter = NULL; ASSERT3P(abd_zero_page, !=, NULL); -#if defined(_KERNEL) #if defined(HAVE_ZERO_PAGE_GPL_ONLY) abd_unmark_zfs_page(abd_zero_page); __free_page(abd_zero_page); #endif /* HAVE_ZERO_PAGE_GPL_ONLY */ -#else - umem_free_aligned(abd_zero_page, PAGESIZE); -#endif /* _KERNEL */ } static int @@ -1014,8 +873,6 @@ abd_cache_reap_now(void) { } -#if defined(_KERNEL) - /* * This is abd_iter_page(), the function underneath abd_iterate_page_func(). * It yields the next page struct and data offset and size within it, without @@ -1297,5 +1154,3 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); - -#endif /* _KERNEL */ From 5b9e69539249bb823de65c182dd225e8edaf408b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 1 Apr 2024 15:37:34 +1100 Subject: [PATCH 44/65] abd_os: break out platform-specific header parts Removing the platform #ifdefs from shared headers in favour of per-platform headers. Makes abd_t much leaner, among other things. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- config/Rules.am | 3 +- include/os/freebsd/Makefile.am | 2 + include/os/freebsd/zfs/sys/abd_impl_os.h | 41 ++++++++++++++++ include/os/freebsd/zfs/sys/abd_os.h | 46 ++++++++++++++++++ include/os/linux/Makefile.am | 2 + include/os/linux/zfs/sys/abd_impl_os.h | 41 ++++++++++++++++ include/os/linux/zfs/sys/abd_os.h | 62 ++++++++++++++++++++++++ include/sys/abd.h | 35 ++----------- include/sys/abd_impl.h | 14 +----- lib/libzpool/Makefile.am | 2 + lib/libzpool/include/Makefile.am | 4 ++ lib/libzpool/include/sys/abd_impl_os.h | 41 ++++++++++++++++ lib/libzpool/include/sys/abd_os.h | 47 ++++++++++++++++++ module/os/freebsd/zfs/abd_os.c | 2 - 14 files changed, 294 insertions(+), 48 deletions(-) create mode 100644 include/os/freebsd/zfs/sys/abd_impl_os.h create mode 100644 include/os/freebsd/zfs/sys/abd_os.h create mode 100644 include/os/linux/zfs/sys/abd_impl_os.h create mode 100644 include/os/linux/zfs/sys/abd_os.h create mode 100644 lib/libzpool/include/Makefile.am create mode 100644 lib/libzpool/include/sys/abd_impl_os.h create mode 100644 lib/libzpool/include/sys/abd_os.h diff --git a/config/Rules.am b/config/Rules.am index 00ac890e2303..b462826e2c89 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -10,7 +10,8 @@ AM_CPPFLAGS = \ -I$(top_srcdir)/include \ -I$(top_srcdir)/module/icp/include \ -I$(top_srcdir)/lib/libspl/include \ - -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ + -I$(top_srcdir)/lib/libspl/include/os/@ac_system_l@ \ + -I$(top_srcdir)/lib/libzpool/include AM_LIBTOOLFLAGS = --silent diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 292f79b8ce72..d975c4fe69fa 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -77,6 +77,8 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/abd_os.h \ + %D%/zfs/sys/abd_impl_os.h \ %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ %D%/zfs/sys/freebsd_event.h \ diff --git a/include/os/freebsd/zfs/sys/abd_impl_os.h 
b/include/os/freebsd/zfs/sys/abd_impl_os.h new file mode 100644 index 000000000000..309e77110d3c --- /dev/null +++ b/include/os/freebsd/zfs/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/include/os/freebsd/zfs/sys/abd_os.h b/include/os/freebsd/zfs/sys/abd_os.h new file mode 100644 index 000000000000..57122ee83e8d --- /dev/null +++ b/include/os/freebsd/zfs/sys/abd_os.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + void *abd_chunks[1]; /* actually variable-length */ +}; + +struct abd_linear { + void *abd_buf; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index f31ae50b96af..9100aebb541e 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -20,6 +20,8 @@ kernel_linux_HEADERS = \ kernel_sysdir = $(kerneldir)/sys kernel_sys_HEADERS = \ + %D%/zfs/sys/abd_os.h \ + %D%/zfs/sys/abd_impl_os.h \ %D%/zfs/sys/policy.h \ %D%/zfs/sys/trace_acl.h \ %D%/zfs/sys/trace_arc.h \ diff --git a/include/os/linux/zfs/sys/abd_impl_os.h b/include/os/linux/zfs/sys/abd_impl_os.h new file mode 100644 index 000000000000..8192522cd229 --- /dev/null +++ b/include/os/linux/zfs/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h new file mode 100644 index 000000000000..ce4f5a2bdf9b --- /dev/null +++ b/include/os/linux/zfs/sys/abd_os.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + uint_t abd_nents; + struct scatterlist *abd_sgl; +}; + +struct abd_linear { + void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ +}; + +typedef struct abd abd_t; + +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); + +/* + * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. + */ +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/sys/abd.h b/include/sys/abd.h index ed008465c891..67bf5e802c88 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -57,21 +58,8 @@ typedef struct abd { #endif kmutex_t abd_mtx; union { - struct abd_scatter { - uint_t abd_offset; -#if defined(__FreeBSD__) && defined(_KERNEL) - void *abd_chunks[1]; /* actually variable-length */ -#else - uint_t abd_nents; - struct scatterlist *abd_sgl; -#endif - } abd_scatter; - struct abd_linear { - void *abd_buf; -#if defined(__linux__) && defined(_KERNEL) - struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ -#endif - } abd_linear; + struct abd_scatter abd_scatter; + struct abd_linear abd_linear; struct abd_gang { list_t abd_gang_chain; } abd_gang; @@ -80,9 +68,6 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); -#if defined(__linux__) && defined(_KERNEL) -typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -#endif extern int zfs_abd_scatter_enabled; @@ -129,10 +114,6 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); -#if defined(__linux__) && defined(_KERNEL) -int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, - void *); -#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); @@ -226,16 +207,6 @@ abd_get_size(abd_t *abd) void abd_init(void); void abd_fini(void); -/* - * Linux ABD bio functions - * Note: these are only needed to support vdev_classic. See comment in - * vdev_disk.c. 
- */ -#if defined(__linux__) && defined(_KERNEL) -unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif - #ifdef __cplusplus } #endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index f88ea25e245d..1eb25d94adc5 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -28,6 +28,7 @@ #define _ABD_IMPL_H #include +#include #include #ifdef __cplusplus @@ -111,19 +112,6 @@ void abd_iter_page(struct abd_iter *); #define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) #define ABD_GANG(abd) (abd->abd_u.abd_gang) -#if defined(_KERNEL) -#if defined(__FreeBSD__) -#define abd_enter_critical(flags) critical_enter() -#define abd_exit_critical(flags) critical_exit() -#else -#define abd_enter_critical(flags) local_irq_save(flags) -#define abd_exit_critical(flags) local_irq_restore(flags) -#endif -#else /* !_KERNEL */ -#define abd_enter_critical(flags) ((void)0) -#define abd_exit_critical(flags) ((void)0) -#endif - #ifdef __cplusplus } #endif diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index eb0dd0ace1fc..6989fefc6662 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -1,3 +1,5 @@ +include $(srcdir)/%D%/include/Makefile.am + libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS) libzpool_la_CFLAGS += $(ZLIB_CFLAGS) diff --git a/lib/libzpool/include/Makefile.am b/lib/libzpool/include/Makefile.am new file mode 100644 index 000000000000..2e0c4c5610be --- /dev/null +++ b/lib/libzpool/include/Makefile.am @@ -0,0 +1,4 @@ +libzpooldir = $(includedir)/libzpool +libzpool_HEADERS = \ + %D%/sys/abd_os.h \ + %D%/sys/abd_impl_os.h diff --git a/lib/libzpool/include/sys/abd_impl_os.h b/lib/libzpool/include/sys/abd_impl_os.h new file mode 100644 index 000000000000..3137346f3bb2 --- /dev/null +++ b/lib/libzpool/include/sys/abd_impl_os.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
+ */ + +#ifndef _ABD_IMPL_OS_H +#define _ABD_IMPL_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_OS_H */ diff --git a/lib/libzpool/include/sys/abd_os.h b/lib/libzpool/include/sys/abd_os.h new file mode 100644 index 000000000000..67f7e5606bec --- /dev/null +++ b/lib/libzpool/include/sys/abd_os.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_OS_H +#define _ABD_OS_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd_scatter { + uint_t abd_offset; + uint_t abd_nents; + struct scatterlist *abd_sgl; +}; + +struct abd_linear { + void *abd_buf; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index ce8c30025f3d..f24ea3dc7685 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -95,14 +95,12 @@ struct { */ static size_t zfs_abd_scatter_min_size = PAGE_SIZE + 1; -#if defined(_KERNEL) SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_scatter_min_size, CTLFLAG_RWTUN, &zfs_abd_scatter_min_size, 0, "Minimum size of scatter allocations."); -#endif kmem_cache_t *abd_chunk_cache; static kstat_t *abd_ksp; From b69bebb535572ef905b065182d8c80d2fff5a8b4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 21 Apr 2024 16:37:06 +1000 Subject: [PATCH 45/65] libzpool/abd_os: iovec-based scatter abd This is intended to be a simple userspace scatter abd based on struct iovec. It's not very sophisticated as-is, but sets a base for something much more interesting. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16253 --- lib/libzpool/abd_os.c | 469 +++++++++++------------------- lib/libzpool/include/sys/abd_os.h | 4 +- 2 files changed, 173 insertions(+), 300 deletions(-) diff --git a/lib/libzpool/abd_os.c b/lib/libzpool/abd_os.c index de93f99a556a..5a91605b2fe3 100644 --- a/lib/libzpool/abd_os.c +++ b/lib/libzpool/abd_os.c @@ -24,34 +24,6 @@ * Copyright (c) 2023, 2024, Klara Inc. */ -/* - * See abd.c for a general overview of the arc buffered data (ABD). 
- * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. - * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we are using HIGHMEM (see next point) we - * wouldn't need to worry about finding a contiguous address range. - * - * (3) If we are not using HIGHMEM, then all physical memory is always - * mapped into the kernel's address space, so we also avoid the map / - * unmap costs on each ABD access. - * - * If we are not using HIGHMEM, scattered buffers which have only one chunk - * can be treated as linear buffers, because they are contiguous in the - * kernel's virtual address space. See abd_alloc_chunks() for details. - */ - #include #include #include @@ -59,199 +31,112 @@ #include #include - -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) - -/* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). - */ -static int zfs_abd_scatter_min_size = 512 * 3; - /* - * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are - * just a single zero'd page. This allows us to conserve memory by - * only using a single zero page for the scatterlist. + * We're simulating scatter/gather with 4K allocations, since that's more like + * what a typical kernel does. */ -abd_t *abd_zero_scatter = NULL; +#define ABD_PAGESIZE (4096) +#define ABD_PAGESHIFT (12) +#define ABD_PAGEMASK (ABD_PAGESIZE-1) -struct page; /* - * abd_zero_page will be allocated with a zero'ed PAGESIZE buffer, which is - * assigned to each of the pages of abd_zero_scatter. + * See rationale in module/os/linux/zfs/abd_os.c, but in userspace this is + * mostly useful to get a mix of linear and scatter ABDs for testing. 
*/ -static struct page *abd_zero_page = NULL; +#define ABD_SCATTER_MIN_SIZE (512 * 3) -static kmem_cache_t *abd_cache = NULL; +abd_t *abd_zero_scatter = NULL; static uint_t -abd_chunkcnt_for_bytes(size_t size) +abd_iovcnt_for_bytes(size_t size) { - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); + /* + * Each iovec points to a 4K page. There's no real reason to do this + * in userspace, but our whole point here is to make it feel a bit + * more like a real paged memory model. + */ + return (P2ROUNDUP(size, ABD_PAGESIZE) / ABD_PAGESIZE); } abd_t * abd_alloc_struct_impl(size_t size) { /* - * In Linux we do not use the size passed in during ABD - * allocation, so we just ignore it. + * Zero-sized means it will be used for a linear or gang abd, so just + * allocate the abd itself and return. */ - (void) size; - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); + if (size == 0) + return (umem_alloc(sizeof (abd_t), UMEM_NOFAIL)); + /* + * Allocating for a scatter abd, so compute how many ABD_PAGESIZE + * iovecs we will need to hold this size. Append that allocation to the + * end. Note that struct abd_scatter has includes abd_iov[1], so we + * allocate one less iovec than we need. + * + * Note we're not allocating the pages proper, just the iovec pointers. + * That's down in abd_alloc_chunks. We _could_ do it here in a single + * allocation, but it's fiddly and harder to read for no real gain. + */ + uint_t n = abd_iovcnt_for_bytes(size); + abd_t *abd = umem_alloc(sizeof (abd_t) + (n-1) * sizeof (struct iovec), + UMEM_NOFAIL); + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_iovcnt = n; return (abd); } void abd_free_struct_impl(abd_t *abd) { - kmem_cache_free(abd_cache, abd); -} - -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -/* - * This must be called if any of the sg_table allocation functions - * are called. - */ -static void -abd_free_sg_table(abd_t *abd) -{ - int nents = ABD_SCATTER(abd).abd_nents; - vmem_free(ABD_SCATTER(abd).abd_sgl, - nents * sizeof (struct scatterlist)); -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); + /* For scatter, compute the extra amount we need to free */ + uint_t iovcnt = + abd_is_linear(abd) || abd_is_gang(abd) ? + 0 : (ABD_SCATTER(abd).abd_iovcnt - 1); + umem_free(abd, sizeof (abd_t) + iovcnt * sizeof (struct iovec)); } void abd_alloc_chunks(abd_t *abd, size_t size) { - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + /* + * We've already allocated the iovec array; ensure that the wanted size + * actually matches, otherwise the caller has made a mistake somewhere. 
+ */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, ==, abd_iovcnt_for_bytes(size)); - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); + /* + * Allocate a ABD_PAGESIZE region for each iovec. + */ + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + iov[i].iov_len = ABD_PAGESIZE; } - ABD_SCATTER(abd).abd_nents = nr_pages; } void abd_free_chunks(abd_t *abd) { - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - struct page *p = nth_page(sg_page(sg), 0); - umem_free_aligned(p, PAGESIZE); - } - abd_free_sg_table(abd); -} - -static void -abd_alloc_zero_scatter(void) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); - struct scatterlist *sg; - int i; - - abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - memset(abd_zero_page, 0, PAGESIZE); - abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); - abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; - abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK; - ABD_SCATTER(abd_zero_scatter).abd_offset = 0; - ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; - abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; - ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - - sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); - - abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { - sg_set_page(sg, abd_zero_page, PAGESIZE, 0); - } + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + struct iovec *iov = ABD_SCATTER(abd).abd_iov; + for (int i = 0; i < n; i++) + umem_free_aligned(iov[i].iov_base, ABD_PAGESIZE); } boolean_t abd_size_alloc_linear(size_t size) { - return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size); + return (size < ABD_SCATTER_MIN_SIZE); } void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); - int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; + int waste = P2ROUNDUP(abd->abd_size, ABD_PAGESIZE) - abd->abd_size; if (op == ABDSTAT_INCR) { arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); } else { @@ -270,67 +155,72 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); +#ifdef ZFS_DEBUG + /* + * scatter abds shall have: + * - at least one iovec + * - all iov_base point somewhere + * - all iov_len are ABD_PAGESIZE + * - offset set within the abd pages somewhere + */ + uint_t n = ABD_SCATTER(abd).abd_iovcnt; + ASSERT3U(n, >, 0); + + uint_t len = 0; + for (int i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_iov[i].iov_base, !=, NULL); + ASSERT3U(ABD_SCATTER(abd).abd_iov[i].iov_len, ==, ABD_PAGESIZE); + len += ABD_PAGESIZE; } -} -static void -abd_free_zero_scatter(void) -{ - abd_free_sg_table(abd_zero_scatter); - abd_free_struct(abd_zero_scatter); - abd_zero_scatter = NULL; - ASSERT3P(abd_zero_page, !=, NULL); - umem_free_aligned(abd_zero_page, PAGESIZE); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, len); +#endif } void abd_init(void) { - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, 
NULL, NULL, NULL, NULL, NULL, 0); + /* + * Create the "zero" scatter abd. This is always the size of the + * largest possible block, but only actually has a single allocated + * page, which all iovecs in the abd point to. + */ + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + + void *zero = + umem_alloc_aligned(ABD_PAGESIZE, ABD_PAGESIZE, UMEM_NOFAIL); + memset(zero, 0, ABD_PAGESIZE); - abd_alloc_zero_scatter(); + uint_t n = abd_iovcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct iovec *iov = ABD_SCATTER(abd_zero_scatter).abd_iov; + for (int i = 0; i < n; i++) { + iov[i].iov_base = zero; + iov[i].iov_len = ABD_PAGESIZE; + } } void abd_fini(void) { - abd_free_zero_scatter(); - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } + umem_free_aligned( + ABD_SCATTER(abd_zero_scatter).abd_iov[0].iov_base, ABD_PAGESIZE); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; } void abd_free_linear_page(abd_t *abd) { + /* + * LINEAR_PAGE is specific to the Linux kernel; we never set this + * flag, so this will never be called. + */ (void) abd; - __builtin_unreachable(); + PANIC("unreachable"); } -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. - */ abd_t * abd_alloc_for_io(size_t size, boolean_t is_metadata) { @@ -338,43 +228,60 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) } abd_t * -abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, - size_t size) +abd_get_offset_scatter(abd_t *dabd, abd_t *sabd, size_t off, size_t size) { - (void) size; - int i = 0; - struct scatterlist *sg = NULL; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - if (abd == NULL) - abd = abd_alloc_struct(0); + /* + * Create a new scatter dabd by borrowing data pages from sabd to cover + * off+size. + * + * sabd is an existing scatter abd with a set of iovecs, each covering + * an ABD_PAGESIZE (4K) allocation. It's "zero" is at abd_offset. + * + * [........][........][........][........] + * ^- sabd_offset + * + * We want to produce a new abd, referencing those allocations at the + * given offset. + * + * [........][........][........][........] + * ^- dabd_offset = sabd_offset + off + * ^- dabd_offset + size + * + * In this example, dabd needs three iovecs. The first iovec is offset + * 0, so the final dabd_offset is masked back into the first iovec. + * + * [........][........][........] + * ^- dabd_offset + */ + size_t soff = ABD_SCATTER(sabd).abd_offset + off; + size_t doff = soff & ABD_PAGEMASK; + size_t iovcnt = abd_iovcnt_for_bytes(doff + size); /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. + * If the passed-in abd has enough allocated iovecs already, reuse it. + * Otherwise, make a new one. 
The caller will free the original if the + * one it gets back is not the same. + * + * Note that it's ok if we reuse an abd with more iovecs than we need. + * abd_size has the usable amount of data, and the abd does not own the + * pages referenced by the iovecs. At worst, they're holding dangling + * pointers that we'll never use anyway. */ + if (dabd == NULL || ABD_SCATTER(dabd).abd_iovcnt < iovcnt) + dabd = abd_alloc_struct(iovcnt << ABD_PAGESHIFT); - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; - } + /* Set offset into first page in view */ + ABD_SCATTER(dabd).abd_offset = doff; - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + /* Copy the wanted iovecs from the source to the dest */ + memcpy(&ABD_SCATTER(dabd).abd_iov[0], + &ABD_SCATTER(sabd).abd_iov[soff >> ABD_PAGESHIFT], + iovcnt * sizeof (struct iovec)); - return (abd); + return (dabd); } -/* - * Initialize the abd_iter. - */ void abd_iter_init(struct abd_iter *aiter, abd_t *abd) { @@ -382,16 +289,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) abd_verify(abd); memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - if (!abd_is_linear(abd)) { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } } -/* - * This is just a helper function to see if we have exhausted the - * abd_iter and reached the end. - */ boolean_t abd_iter_at_end(struct abd_iter *aiter) { @@ -399,83 +298,57 @@ abd_iter_at_end(struct abd_iter *aiter) return (aiter->iter_pos == aiter->iter_abd->abd_size); } -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ void abd_iter_advance(struct abd_iter *aiter, size_t amount) { - /* - * Ensure that last chunk is not in use. abd_iterate_*() must clear - * this state (directly or abd_iter_unmap()) before advancing. - */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - ASSERT3P(aiter->iter_page, ==, NULL); - ASSERT0(aiter->iter_page_doff); - ASSERT0(aiter->iter_page_dsize); - /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } - } - } + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); } -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ void abd_iter_map(struct abd_iter *aiter) { - void *paddr; - size_t offset = 0; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); - /* There's nothing left to iterate over, so do nothing */ if (abd_iter_at_end(aiter)) return; if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = ABD_LINEAR_BUF(aiter->iter_abd); - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = sg_page(aiter->iter_sg); + aiter->iter_mapaddr = + ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + aiter->iter_mapsize = + aiter->iter_abd->abd_size - aiter->iter_pos; + return; } - aiter->iter_mapaddr = (char *)paddr + offset; + /* + * For scatter, we index into the appropriate iovec, and return the + * smaller of the amount requested, or up to the end of the page. + */ + size_t poff = aiter->iter_pos + ABD_SCATTER(aiter->iter_abd).abd_offset; + + ASSERT3U(poff >> ABD_PAGESHIFT, <=, + ABD_SCATTER(aiter->iter_abd).abd_iovcnt); + struct iovec *iov = &ABD_SCATTER(aiter->iter_abd). + abd_iov[poff >> ABD_PAGESHIFT]; + + aiter->iter_mapsize = MIN(ABD_PAGESIZE - (poff & ABD_PAGEMASK), + aiter->iter_abd->abd_size - aiter->iter_pos); + ASSERT3U(aiter->iter_mapsize, <=, ABD_PAGESIZE); + + aiter->iter_mapaddr = iov->iov_base + (poff & ABD_PAGEMASK); } -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ void abd_iter_unmap(struct abd_iter *aiter) { - /* There's nothing left to unmap, so do nothing */ if (abd_iter_at_end(aiter)) return; diff --git a/lib/libzpool/include/sys/abd_os.h b/lib/libzpool/include/sys/abd_os.h index 67f7e5606bec..8ff6aa1e9e4f 100644 --- a/lib/libzpool/include/sys/abd_os.h +++ b/lib/libzpool/include/sys/abd_os.h @@ -32,8 +32,8 @@ extern "C" { struct abd_scatter { uint_t abd_offset; - uint_t abd_nents; - struct scatterlist *abd_sgl; + uint_t abd_iovcnt; + struct iovec abd_iov[1]; /* actually variable-length */ }; struct abd_linear { From 9e15877dfb3e80021551301aac71976216b3fe1b Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 21 Aug 2024 17:38:06 -0700 Subject: [PATCH 46/65] Linux 6.10 compat: META Update the META file to reflect compatibility with the 6.10 kernel. Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #16466 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 7aac80c541ba..76ca22cbae00 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.9 +Linux-Maximum: 6.10 Linux-Minimum: 3.10 From ba2209ec9e2166dd9c6d80b61b4aed3dd457be4b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 11 Jul 2024 07:37:30 +1000 Subject: [PATCH 47/65] abd_get_from_buf_struct: wrap existing buf with ABD stored on stack This allows a simple "wrapping" ABD for an existing linear buffer to be allocated on the stack, avoiding an allocation. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
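As an illustrative sketch only (not part of the diff): with this helper, a
caller holding an existing linear buffer can get a wrapping ABD with no heap
allocation. Here `buf' and `len' are placeholders for the caller's own
buffer; later patches in this series use exactly this shape.

    abd_t abd;
    abd_get_from_buf_struct(&abd, buf, len);
    /* ... hand &abd to any ABD-consuming routine ... */
    abd_free(&abd);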
Signed-off-by: Rob Norris --- include/sys/abd.h | 1 + module/zfs/abd.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 67bf5e802c88..567b88c0fc01 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -93,6 +93,7 @@ abd_t *abd_get_offset_size(abd_t *, size_t, size_t); abd_t *abd_get_offset_struct(abd_t *, abd_t *, size_t, size_t); abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); +abd_t *abd_get_from_buf_struct(abd_t *, void *, size_t); void abd_cache_reap_now(void); /* diff --git a/module/zfs/abd.c b/module/zfs/abd.c index f1df6082f045..c8c4d2270fae 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -603,13 +603,11 @@ abd_get_zeros(size_t size) } /* - * Allocate a linear ABD structure for buf. + * Create a linear ABD for an existing buf. */ -abd_t * -abd_get_from_buf(void *buf, size_t size) +static abd_t * +abd_get_from_buf_impl(abd_t *abd, void *buf, size_t size) { - abd_t *abd = abd_alloc_struct(0); - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); /* @@ -625,6 +623,20 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(0); + return (abd_get_from_buf_impl(abd, buf, size)); +} + +abd_t * +abd_get_from_buf_struct(abd_t *abd, void *buf, size_t size) +{ + abd_init_struct(abd); + return (abd_get_from_buf_impl(abd, buf, size)); +} + /* * Get the raw buffer associated with a linear ABD. */ From 5eede0d5fde556107321fae6b41d6f83eeaf28a1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Jul 2024 16:11:12 +1000 Subject: [PATCH 48/65] compress: rework callers to always use the zio_compress calls This will make future refactoring easier. There are two we can't change for the moment, because zio_compress_data does hole detection & collapsing which zio_decompress_data does not actually know how to handle. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- module/zfs/blkptr.c | 7 +++++-- module/zfs/ddt_zap.c | 13 +++++++++---- module/zfs/dsl_dataset.c | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index d85f0737f6f6..6a6f06c73578 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -142,8 +142,11 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { uint8_t dstbuf[BPE_PAYLOAD_SIZE]; decode_embedded_bp_compressed(bp, dstbuf); - VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), - dstbuf, buf, psize, buflen, NULL)); + abd_t dstabd; + abd_get_from_buf_struct(&dstabd, dstbuf, psize); + VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &dstabd, + buf, psize, buflen, NULL)); + abd_free(&dstabd); } else { ASSERT3U(lsize, ==, psize); decode_embedded_bp_compressed(bp, buf); diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 4e01624f3684..8e78ec3277c7 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -52,6 +52,7 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ + /* Call compress function directly to avoid hole detection. 
*/ c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, ci->ci_level); @@ -72,12 +73,16 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) { uchar_t version = *src++; int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else + if (zio_compress_table[cpfunc].ci_decompress == NULL) { memcpy(dst, src, d_len); + return; + } + + abd_t sabd; + abd_get_from_buf_struct(&sabd, src, s_len); + VERIFY0(zio_decompress_data(cpfunc, &sabd, dst, s_len, d_len, NULL)); + abd_free(&sabd); if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != (ZFS_HOST_BYTEORDER != 0)) diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 45d8a290d67d..28e07259ddda 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2425,6 +2425,7 @@ get_receive_resume_token_impl(dsl_dataset_t *ds) fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); + /* Call compress function directly to avoid hole detection. */ compressed_size = gzip_compress(packed, compressed, packed_size, packed_size, 6); From b4d81b1a6a8c1254910f7e8b48e2f58fe77b769a Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 5 Jul 2024 15:01:57 +1000 Subject: [PATCH 49/65] zstream: use zio_compress calls for compression This is updating zstream to use the zio_compress calls rather than using its own dispatch. Since that was fairly entangled, some refactoring included. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- cmd/zstream/zstream_decompress.c | 123 +++++++++++++------------------ cmd/zstream/zstream_recompress.c | 93 +++++++++++------------ 2 files changed, 98 insertions(+), 118 deletions(-) diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c index 0cef36c0441f..f5f66080d060 100644 --- a/cmd/zstream/zstream_decompress.c +++ b/cmd/zstream/zstream_decompress.c @@ -22,6 +22,8 @@ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2024, Klara, Inc. */ #include @@ -257,83 +259,64 @@ zstream_do_decompress(int argc, char *argv[]) ENTRY e = {.key = key}; p = hsearch(e, FIND); - if (p != NULL) { - zio_decompress_func_t *xfunc = NULL; - switch ((enum zio_compress)(intptr_t)p->data) { - case ZIO_COMPRESS_OFF: - xfunc = NULL; - break; - case ZIO_COMPRESS_LZJB: - xfunc = lzjb_decompress; - break; - case ZIO_COMPRESS_GZIP_1: - xfunc = gzip_decompress; - break; - case ZIO_COMPRESS_ZLE: - xfunc = zle_decompress; - break; - case ZIO_COMPRESS_LZ4: - xfunc = lz4_decompress_zfs; - break; - case ZIO_COMPRESS_ZSTD: - xfunc = zfs_zstd_decompress; - break; - default: - assert(B_FALSE); - } - - + if (p == NULL) { /* - * Read and decompress the block + * Read the contents of the block unaltered */ - char *lzbuf = safe_calloc(payload_size); - (void) sfread(lzbuf, payload_size, stdin); - if (xfunc == NULL) { - memcpy(buf, lzbuf, payload_size); - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - if (verbose) - fprintf(stderr, "Resetting " - "compression type to off " - "for ino %llu offset " - "%llu\n", - (u_longlong_t) - drrw->drr_object, - (u_longlong_t) - drrw->drr_offset); - } else if (0 != xfunc(lzbuf, buf, - payload_size, payload_size, 0)) { - /* - * The block must not be compressed, - * at least not with this compression - * type, possibly because it gets - * written multiple times in this - * stream. 
- */ - warnx("decompression failed for " - "ino %llu offset %llu", - (u_longlong_t)drrw->drr_object, - (u_longlong_t)drrw->drr_offset); - memcpy(buf, lzbuf, payload_size); - } else if (verbose) { - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - fprintf(stderr, "successfully " - "decompressed ino %llu " - "offset %llu\n", + (void) sfread(buf, payload_size, stdin); + break; + } + + /* + * Read and decompress the block + */ + enum zio_compress c = + (enum zio_compress)(intptr_t)p->data; + + if (c == ZIO_COMPRESS_OFF) { + (void) sfread(buf, payload_size, stdin); + drrw->drr_compressiontype = ZIO_COMPRESS_OFF; + if (verbose) + fprintf(stderr, + "Resetting compression type to " + "off for ino %llu offset %llu\n", (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); - } else { - drrw->drr_compressiontype = - ZIO_COMPRESS_OFF; - } - free(lzbuf); - } else { + break; + } + + char *lzbuf = safe_calloc(payload_size); + (void) sfread(lzbuf, payload_size, stdin); + + abd_t sabd; + abd_get_from_buf_struct(&sabd, lzbuf, payload_size); + int err = zio_decompress_data(c, &sabd, buf, + payload_size, payload_size, NULL); + abd_free(&sabd); + + if (err != 0) { /* - * Read the contents of the block unaltered + * The block must not be compressed, at least + * not with this compression type, possibly + * because it gets written multiple times in + * this stream. */ - (void) sfread(buf, payload_size, stdin); + warnx("decompression failed for " + "ino %llu offset %llu", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + memcpy(buf, lzbuf, payload_size); + } else if (verbose) { + drrw->drr_compressiontype = ZIO_COMPRESS_OFF; + fprintf(stderr, "successfully decompressed " + "ino %llu offset %llu\n", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + } else { + drrw->drr_compressiontype = ZIO_COMPRESS_OFF; } + + free(lzbuf); break; } diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index f9e01d1aa4c4..0e5cc9cd8158 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -22,10 +22,9 @@ /* * Copyright 2022 Axcient. All rights reserved. * Use is subject to license terms. - */ - -/* + * * Copyright (c) 2022 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara, Inc. 
*/ #include @@ -72,7 +71,7 @@ zstream_do_recompress(int argc, char *argv[]) dmu_replay_record_t *drr = &thedrr; zio_cksum_t stream_cksum; int c; - int level = -1; + int level = 0; while ((c = getopt(argc, argv, "l:")) != -1) { switch (c) { @@ -97,34 +96,22 @@ zstream_do_recompress(int argc, char *argv[]) if (argc != 1) zstream_usage(); - int type = 0; - zio_compress_info_t *cinfo = NULL; - if (0 == strcmp(argv[0], "off")) { - type = ZIO_COMPRESS_OFF; - cinfo = &zio_compress_table[type]; - } else if (0 == strcmp(argv[0], "inherit") || - 0 == strcmp(argv[0], "empty") || - 0 == strcmp(argv[0], "on")) { - // Fall through to invalid compression type case + + enum zio_compress ctype; + if (strcmp(argv[0], "off") == 0) { + ctype = ZIO_COMPRESS_OFF; } else { - for (int i = 0; i < ZIO_COMPRESS_FUNCTIONS; i++) { - if (0 == strcmp(zio_compress_table[i].ci_name, - argv[0])) { - cinfo = &zio_compress_table[i]; - type = i; + for (ctype = 0; ctype < ZIO_COMPRESS_FUNCTIONS; ctype++) { + if (strcmp(argv[0], + zio_compress_table[ctype].ci_name) == 0) break; - } } - } - if (cinfo == NULL) { - fprintf(stderr, "Invalid compression type %s.\n", - argv[0]); - exit(2); - } - - if (cinfo->ci_compress == NULL) { - type = 0; - cinfo = &zio_compress_table[0]; + if (ctype == ZIO_COMPRESS_FUNCTIONS || + zio_compress_table[ctype].ci_compress == NULL) { + fprintf(stderr, "Invalid compression type %s.\n", + argv[0]); + exit(2); + } } if (isatty(STDIN_FILENO)) { @@ -135,6 +122,7 @@ zstream_do_recompress(int argc, char *argv[]) exit(1); } + abd_init(); fletcher_4_init(); zio_init(); zstd_init(); @@ -247,53 +235,60 @@ zstream_do_recompress(int argc, char *argv[]) (void) sfread(buf, payload_size, stdin); break; } - if (drrw->drr_compressiontype >= - ZIO_COMPRESS_FUNCTIONS) { + enum zio_compress dtype = drrw->drr_compressiontype; + if (dtype >= ZIO_COMPRESS_FUNCTIONS) { fprintf(stderr, "Invalid compression type in " - "stream: %d\n", drrw->drr_compressiontype); + "stream: %d\n", dtype); exit(3); } - zio_compress_info_t *dinfo = - &zio_compress_table[drrw->drr_compressiontype]; + if (zio_compress_table[dtype].ci_decompress == NULL) + dtype = ZIO_COMPRESS_OFF; /* Set up buffers to minimize memcpys */ char *cbuf, *dbuf; - if (cinfo->ci_compress == NULL) + if (ctype == ZIO_COMPRESS_OFF) dbuf = buf; else dbuf = safe_calloc(bufsz); - if (dinfo->ci_decompress == NULL) + if (dtype == ZIO_COMPRESS_OFF) cbuf = dbuf; else cbuf = safe_calloc(payload_size); /* Read and decompress the payload */ (void) sfread(cbuf, payload_size, stdin); - if (dinfo->ci_decompress != NULL) { - if (0 != dinfo->ci_decompress(cbuf, dbuf, - payload_size, MIN(bufsz, - drrw->drr_logical_size), dinfo->ci_level)) { + if (dtype != ZIO_COMPRESS_OFF) { + abd_t cabd; + abd_get_from_buf_struct(&cabd, + cbuf, payload_size); + if (zio_decompress_data(dtype, &cabd, dbuf, + payload_size, + MIN(bufsz, drrw->drr_logical_size), + NULL) != 0) { warnx("decompression type %d failed " "for ino %llu offset %llu", - type, + dtype, (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); exit(4); } payload_size = drrw->drr_logical_size; + abd_free(&cabd); free(cbuf); } /* Recompress the payload */ - if (cinfo->ci_compress != NULL) { - payload_size = P2ROUNDUP(cinfo->ci_compress( - dbuf, buf, drrw->drr_logical_size, - MIN(payload_size, bufsz), (level == -1 ? 
- cinfo->ci_level : level)), + if (ctype != ZIO_COMPRESS_OFF) { + abd_t dabd; + abd_get_from_buf_struct(&dabd, + dbuf, drrw->drr_logical_size); + payload_size = P2ROUNDUP(zio_compress_data( + ctype, &dabd, (void **)&buf, + drrw->drr_logical_size, level), SPA_MINBLOCKSIZE); if (payload_size != drrw->drr_logical_size) { - drrw->drr_compressiontype = type; + drrw->drr_compressiontype = ctype; drrw->drr_compressed_size = payload_size; } else { @@ -301,9 +296,10 @@ zstream_do_recompress(int argc, char *argv[]) drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; } + abd_free(&dabd); free(dbuf); } else { - drrw->drr_compressiontype = type; + drrw->drr_compressiontype = ctype; drrw->drr_compressed_size = 0; } break; @@ -371,6 +367,7 @@ zstream_do_recompress(int argc, char *argv[]) fletcher_4_fini(); zio_fini(); zstd_fini(); + abd_fini(); return (0); } From e119483a95e8fece4097419689c7803754ca5c75 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 29 Jun 2024 11:47:16 +1000 Subject: [PATCH 50/65] compress: remove zio_decompress_data_buf Nothing uses it anymore! Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zio_compress.h | 2 -- module/zfs/zio_compress.c | 24 ++++++++++-------------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 691d7b624488..d0caee279f8e 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -187,8 +187,6 @@ extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, uint8_t level); extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len, uint8_t *level); -extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, - size_t s_len, size_t d_len, uint8_t *level); extern int zio_compress_to_feature(enum zio_compress comp); #ifdef __cplusplus diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index c3bceababa38..1d448b002615 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -161,26 +161,22 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, } int -zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len, uint8_t *level) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); - if (ci->ci_decompress_level != NULL && level != NULL) - return (ci->ci_decompress_level(src, dst, s_len, d_len, level)); + void *sbuf = abd_borrow_buf_copy(src, s_len); - return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); -} + int err; + if (ci->ci_decompress_level != NULL && level != NULL) + err = ci->ci_decompress_level(sbuf, dst, s_len, d_len, level); + else + err = ci->ci_decompress(sbuf, dst, s_len, d_len, ci->ci_level); -int -zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, - size_t s_len, size_t d_len, uint8_t *level) -{ - void *tmp = abd_borrow_buf_copy(src, s_len); - int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level); - abd_return_buf(src, tmp, s_len); + abd_return_buf(src, sbuf, s_len); /* * Decompression shouldn't fail, because we've already verified @@ -189,9 +185,9 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, */ if (zio_decompress_fail_fraction != 0 && random_in_range(zio_decompress_fail_fraction) == 0) - ret = 
SET_ERROR(EINVAL); + err = SET_ERROR(EINVAL); - return (ret); + return (err); } int From dd0c08f9c65ccf9d9c0c08a29de9fc21e136c47d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 29 Jun 2024 11:19:10 +1000 Subject: [PATCH 51/65] compress: remove unused abd compress prototype Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zio_compress.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index d0caee279f8e..a7d19b633efa 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -130,17 +130,7 @@ typedef int zio_decompress_func_t(void *src, void *dst, /* Common signature for all zio decompress and get level functions. */ typedef int zio_decompresslevel_func_t(void *src, void *dst, size_t s_len, size_t d_len, uint8_t *level); -/* Common signature for all zio get-compression-level functions. */ -typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level); - -/* - * Common signature for all zio decompress functions using an ABD as input. - * This is helpful if you have both compressed ARC and scatter ABDs enabled, - * but is not a requirement for all compression algorithms. - */ -typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, - size_t s_len, size_t d_len, int); /* * Information about each compression function. */ From 522816498c0ea0d8dfa449cd18e2032b8ac0a9b8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 29 Jun 2024 11:16:50 +1000 Subject: [PATCH 52/65] compress: standardise names of compression functions This is mostly to make searching easier. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- include/sys/zio_compress.h | 32 +++++----- include/sys/zstd/zstd.h | 2 - module/zfs/dsl_dataset.c | 2 +- module/zfs/gzip.c | 6 +- module/zfs/lz4_zfs.c | 4 +- module/zfs/lzjb.c | 6 +- module/zfs/zio_compress.c | 48 +++++++++------ module/zfs/zle.c | 6 +- module/zstd/zfs_zstd.c | 122 ++++++++++++++++++------------------- 9 files changed, 122 insertions(+), 106 deletions(-) diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index a7d19b633efa..56376fdd10a0 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -153,22 +153,22 @@ extern void lz4_fini(void); /* * Compression routines. 
*/ -extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); -extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, - int level); +extern size_t zfs_lzjb_compress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_lzjb_decompress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_gzip_compress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_gzip_decompress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_zle_compress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_zle_decompress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern size_t zfs_lz4_compress(void *src, void *dst, size_t s_len, + size_t d_len, int level); +extern int zfs_lz4_decompress(void *src, void *dst, size_t s_len, + size_t d_len, int level); /* * Compress and decompress data if necessary. diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h index d8c3fa86dce3..f9e7ac0b32ea 100644 --- a/include/sys/zstd/zstd.h +++ b/include/sys/zstd/zstd.h @@ -92,8 +92,6 @@ void zstd_fini(void); size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int level); -size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, - size_t d_len, int level); int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, size_t d_len, uint8_t *level); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 28e07259ddda..e62ecdb259f7 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2426,7 +2426,7 @@ get_receive_resume_token_impl(dsl_dataset_t *ds) compressed = kmem_alloc(packed_size, KM_SLEEP); /* Call compress function directly to avoid hole detection. 
*/ - compressed_size = gzip_compress(packed, compressed, + compressed_size = zfs_gzip_compress(packed, compressed, packed_size, packed_size, 6); zio_cksum_t cksum; diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c index f3b19446352a..0ca66c2bd657 100644 --- a/module/zfs/gzip.c +++ b/module/zfs/gzip.c @@ -48,7 +48,8 @@ typedef uLongf zlen_t; #endif size_t -gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_gzip_compress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { int ret; zlen_t dstlen = d_len; @@ -83,7 +84,8 @@ gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) } int -gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { (void) n; zlen_t dstlen = d_len; diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c index a3b9e7070373..698ed69956e3 100644 --- a/module/zfs/lz4_zfs.c +++ b/module/zfs/lz4_zfs.c @@ -53,7 +53,7 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, static kmem_cache_t *lz4_cache; size_t -lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, +zfs_lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -81,7 +81,7 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, } int -lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, +zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c index a24f17e0fe74..b246693120ae 100644 --- a/module/zfs/lzjb.c +++ b/module/zfs/lzjb.c @@ -46,7 +46,8 @@ #define LEMPEL_SIZE 1024 size_t -lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { (void) n; uchar_t *src = s_start; @@ -101,7 +102,8 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) } int -lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_lzjb_decompress(void *s_start, void *d_start, + size_t s_len, size_t d_len, int n) { (void) s_len, (void) n; uchar_t *src = s_start; diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 1d448b002615..9553a9377c3d 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -54,24 +54,36 @@ static unsigned long zio_decompress_fail_fraction = 0; * PART OF THE ON-DISK FORMAT. 
*/ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { - {"inherit", 0, NULL, NULL, NULL}, - {"on", 0, NULL, NULL, NULL}, - {"uncompressed", 0, NULL, NULL, NULL}, - {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, - {"empty", 0, NULL, NULL, NULL}, - {"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, - {"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, - {"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, - {"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, - {"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, - {"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, - {"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, - {"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, - {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, - {"zle", 64, zle_compress, zle_decompress, NULL}, - {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, - {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap, - zfs_zstd_decompress, zfs_zstd_decompress_level}, + {"inherit", 0, NULL, NULL, NULL}, + {"on", 0, NULL, NULL, NULL}, + {"uncompressed", 0, NULL, NULL, NULL}, + {"lzjb", 0, + zfs_lzjb_compress, zfs_lzjb_decompress, NULL}, + {"empty", 0, NULL, NULL, NULL}, + {"gzip-1", 1, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-2", 2, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-3", 3, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-4", 4, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-5", 5, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-6", 6, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-7", 7, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-8", 8, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"gzip-9", 9, + zfs_gzip_compress, zfs_gzip_decompress, NULL}, + {"zle", 64, + zfs_zle_compress, zfs_zle_decompress, NULL}, + {"lz4", 0, + zfs_lz4_compress, zfs_lz4_decompress, NULL}, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, + zfs_zstd_compress, zfs_zstd_decompress, zfs_zstd_decompress_level}, }; uint8_t diff --git a/module/zfs/zle.c b/module/zfs/zle.c index 1483a65af803..32b5fe18cec6 100644 --- a/module/zfs/zle.c +++ b/module/zfs/zle.c @@ -35,7 +35,8 @@ #include size_t -zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_zle_compress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; @@ -65,7 +66,8 @@ zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) } int -zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +zfs_zle_decompress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n) { uchar_t *src = s_start; uchar_t *dst = d_start; diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 05120d27b8d6..34ab8fd8a424 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -429,68 +429,9 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) return (1); } - -size_t -zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level) -{ - int16_t zstd_level; - if (zstd_enum_to_level(level, &zstd_level)) { - ZSTDSTAT_BUMP(zstd_stat_com_inval); - return (s_len); - } - /* - * A zstd early abort heuristic. - * - * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently - * 128k), don't try any of this, just go. 
- * (because experimentally that was a reasonable cutoff for a perf win - * with tiny ratio change) - * - First, we try LZ4 compression, and if it doesn't early abort, we - * jump directly to whatever compression level we intended to try. - * - Second, we try zstd-1 - if that errors out (usually, but not - * exclusively, if it would overflow), we give up early. - * - * If it works, instead we go on and compress anyway. - * - * Why two passes? LZ4 alone gets you a lot of the way, but on highly - * compressible data, it was losing up to 8.5% of the compressed - * savings versus no early abort, and all the zstd-fast levels are - * worse indications on their own than LZ4, and don't improve the LZ4 - * pass noticably if stacked like this. - */ - size_t actual_abort_size = zstd_abort_size; - if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && - s_len >= actual_abort_size) { - int pass_len = 1; - pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0); - if (pass_len < d_len) { - ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); - goto keep_trying; - } - ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); - - pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len, - ZIO_ZSTD_LEVEL_1); - if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { - ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); - return (s_len); - } - ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); - } else { - ZSTDSTAT_BUMP(zstd_stat_passignored); - if (s_len < actual_abort_size) { - ZSTDSTAT_BUMP(zstd_stat_passignored_size); - } - } -keep_trying: - return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level)); - -} - /* Compress block using zstd */ -size_t -zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, +static size_t +zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { size_t c_len; @@ -594,6 +535,65 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, return (c_len + sizeof (*hdr)); } + +size_t +zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level) +{ + int16_t zstd_level; + if (zstd_enum_to_level(level, &zstd_level)) { + ZSTDSTAT_BUMP(zstd_stat_com_inval); + return (s_len); + } + /* + * A zstd early abort heuristic. + * + * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently + * 128k), don't try any of this, just go. + * (because experimentally that was a reasonable cutoff for a perf win + * with tiny ratio change) + * - First, we try LZ4 compression, and if it doesn't early abort, we + * jump directly to whatever compression level we intended to try. + * - Second, we try zstd-1 - if that errors out (usually, but not + * exclusively, if it would overflow), we give up early. + * + * If it works, instead we go on and compress anyway. + * + * Why two passes? LZ4 alone gets you a lot of the way, but on highly + * compressible data, it was losing up to 8.5% of the compressed + * savings versus no early abort, and all the zstd-fast levels are + * worse indications on their own than LZ4, and don't improve the LZ4 + * pass noticably if stacked like this. 
+ */ + size_t actual_abort_size = zstd_abort_size; + if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && + s_len >= actual_abort_size) { + int pass_len = 1; + pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); + if (pass_len < d_len) { + ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); + goto keep_trying; + } + ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected); + + pass_len = zfs_zstd_compress_impl(s_start, d_start, s_len, + d_len, ZIO_ZSTD_LEVEL_1); + if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) { + ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected); + return (s_len); + } + ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed); + } else { + ZSTDSTAT_BUMP(zstd_stat_passignored); + if (s_len < actual_abort_size) { + ZSTDSTAT_BUMP(zstd_stat_passignored_size); + } + } +keep_trying: + return (zfs_zstd_compress_impl(s_start, d_start, s_len, d_len, level)); + +} + /* Decompress block using zstd and return its stored level */ int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, From d3c12383c95cf7988ac00234a42a4da7989c9034 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 30 Jun 2024 12:10:00 +1000 Subject: [PATCH 53/65] compress: change compression providers API to use ABDs This commit changes the provider compress and decompress API to take ABD pointers instead of buffer pointers for both data source and destination. It then updates all providers to match. This doesn't actually change the providers to do chunked compression, just changes the API to allow such an update in the future. Helper macros are added to easily adapt the ABD functions to their buffer-based implementations. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- cmd/zstream/zstream_decompress.c | 6 ++-- include/sys/zio_compress.h | 58 +++++++++++++++++++++++++------- include/sys/zstd/zstd.h | 6 ++-- module/zfs/ddt_zap.c | 8 +++-- module/zfs/dsl_dataset.c | 7 +++- module/zfs/gzip.c | 11 +++--- module/zfs/lz4_zfs.c | 11 +++--- module/zfs/lzjb.c | 11 +++--- module/zfs/zio_compress.c | 19 ++++++----- module/zfs/zle.c | 11 +++--- module/zstd/zfs_zstd.c | 26 +++++++++----- 11 files changed, 120 insertions(+), 54 deletions(-) diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c index f5f66080d060..f8f439d4626d 100644 --- a/cmd/zstream/zstream_decompress.c +++ b/cmd/zstream/zstream_decompress.c @@ -288,10 +288,12 @@ zstream_do_decompress(int argc, char *argv[]) char *lzbuf = safe_calloc(payload_size); (void) sfread(lzbuf, payload_size, stdin); - abd_t sabd; + abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, lzbuf, payload_size); - int err = zio_decompress_data(c, &sabd, buf, + abd_get_from_buf_struct(&dabd, buf, payload_size); + int err = zio_decompress_data(c, &sabd, &dabd, payload_size, payload_size, NULL); + abd_free(&dabd); abd_free(&sabd); if (err != 0) { diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 56376fdd10a0..d41b5dfd447f 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2019, Allan Jude - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Use is subject to license terms. * Copyright (c) 2015, 2016 by Delphix. All rights reserved. */ @@ -122,13 +122,13 @@ enum zio_zstd_levels { struct zio_prop; /* Common signature for all zio compress functions. 
*/ -typedef size_t zio_compress_func_t(void *src, void *dst, +typedef size_t zio_compress_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int); /* Common signature for all zio decompress functions. */ -typedef int zio_decompress_func_t(void *src, void *dst, +typedef int zio_decompress_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int); /* Common signature for all zio decompress and get level functions. */ -typedef int zio_decompresslevel_func_t(void *src, void *dst, +typedef int zio_decompresslevel_func_t(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level); /* @@ -153,21 +153,21 @@ extern void lz4_fini(void); /* * Compression routines. */ -extern size_t zfs_lzjb_compress(void *src, void *dst, size_t s_len, +extern size_t zfs_lzjb_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern int zfs_lzjb_decompress(void *src, void *dst, size_t s_len, +extern int zfs_lzjb_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern size_t zfs_gzip_compress(void *src, void *dst, size_t s_len, +extern size_t zfs_gzip_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern int zfs_gzip_decompress(void *src, void *dst, size_t s_len, +extern int zfs_gzip_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern size_t zfs_zle_compress(void *src, void *dst, size_t s_len, +extern size_t zfs_zle_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern int zfs_zle_decompress(void *src, void *dst, size_t s_len, +extern int zfs_zle_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern size_t zfs_lz4_compress(void *src, void *dst, size_t s_len, +extern size_t zfs_lz4_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); -extern int zfs_lz4_decompress(void *src, void *dst, size_t s_len, +extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); /* @@ -179,6 +179,40 @@ extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len, uint8_t *level); extern int zio_compress_to_feature(enum zio_compress comp); +#define ZFS_COMPRESS_WRAP_DECL(name) \ +size_t \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + size_t c_len = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (c_len); \ +} +#define ZFS_DECOMPRESS_WRAP_DECL(name) \ +int \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (err); \ +} +#define ZFS_DECOMPRESS_LEVEL_WRAP_DECL(name) \ +int \ +name(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *n) \ +{ \ + void *s_buf = abd_borrow_buf_copy(src, s_len); \ + void *d_buf = abd_borrow_buf(dst, d_len); \ + int err = name##_buf(s_buf, d_buf, s_len, d_len, n); \ + abd_return_buf(src, s_buf, s_len); \ + abd_return_buf_copy(dst, d_buf, d_len); \ + return (err); \ +} + #ifdef __cplusplus } #endif diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h index f9e7ac0b32ea..6d212b082f9a 100644 --- a/include/sys/zstd/zstd.h +++ b/include/sys/zstd/zstd.h @@ 
-90,12 +90,12 @@ typedef struct zfs_zstd_meta { int zstd_init(void); void zstd_fini(void); -size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, +size_t zfs_zstd_compress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int level); int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); -int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, +int zfs_zstd_decompress_level(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level); -int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, +int zfs_zstd_decompress(abd_t *src, abd_t *dst, size_t s_len, size_t d_len, int n); void zfs_zstd_cache_reap_now(void); diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 8e78ec3277c7..e96984b86f0c 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -53,8 +53,12 @@ ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ /* Call compress function directly to avoid hole detection. */ - c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, - ci->ci_level); + abd_t sabd, dabd; + abd_get_from_buf_struct(&sabd, (void *)src, s_len); + abd_get_from_buf_struct(&dabd, dst, d_len); + c_len = ci->ci_compress(&sabd, &dabd, s_len, d_len - 1, ci->ci_level); + abd_free(&dabd); + abd_free(&sabd); if (c_len == s_len) { cpfunc = ZIO_COMPRESS_OFF; diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index e62ecdb259f7..042725b235d0 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -2426,8 +2426,13 @@ get_receive_resume_token_impl(dsl_dataset_t *ds) compressed = kmem_alloc(packed_size, KM_SLEEP); /* Call compress function directly to avoid hole detection. */ - compressed_size = zfs_gzip_compress(packed, compressed, + abd_t pabd, cabd; + abd_get_from_buf_struct(&pabd, packed, packed_size); + abd_get_from_buf_struct(&cabd, compressed, packed_size); + compressed_size = zfs_gzip_compress(&pabd, &cabd, packed_size, packed_size, 6); + abd_free(&cabd); + abd_free(&pabd); zio_cksum_t cksum; fletcher_4_native_varsize(compressed, compressed_size, &cksum); diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c index 0ca66c2bd657..e7fd6f63c4be 100644 --- a/module/zfs/gzip.c +++ b/module/zfs/gzip.c @@ -47,8 +47,8 @@ typedef uLongf zlen_t; #endif -size_t -zfs_gzip_compress(void *s_start, void *d_start, size_t s_len, +static size_t +zfs_gzip_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { int ret; @@ -83,8 +83,8 @@ zfs_gzip_compress(void *s_start, void *d_start, size_t s_len, return ((size_t)dstlen); } -int -zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len, +static int +zfs_gzip_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -105,3 +105,6 @@ zfs_gzip_decompress(void *s_start, void *d_start, size_t s_len, return (0); } + +ZFS_COMPRESS_WRAP_DECL(zfs_gzip_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_gzip_decompress) diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c index 698ed69956e3..0033b5e50d1a 100644 --- a/module/zfs/lz4_zfs.c +++ b/module/zfs/lz4_zfs.c @@ -52,8 +52,8 @@ int LZ4_uncompress_unknownOutputSize(const char *source, char *dest, static kmem_cache_t *lz4_cache; -size_t -zfs_lz4_compress(void *s_start, void *d_start, size_t s_len, +static size_t +zfs_lz4_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -80,8 +80,8 @@ zfs_lz4_compress(void *s_start, void *d_start, size_t s_len, 
return (bufsiz + sizeof (bufsiz)); } -int -zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len, +static int +zfs_lz4_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -100,6 +100,9 @@ zfs_lz4_decompress(void *s_start, void *d_start, size_t s_len, d_start, bufsiz, d_len) < 0); } +ZFS_COMPRESS_WRAP_DECL(zfs_lz4_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_lz4_decompress) + /* * LZ4 API Description: * diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c index b246693120ae..2db549b1626f 100644 --- a/module/zfs/lzjb.c +++ b/module/zfs/lzjb.c @@ -45,8 +45,8 @@ #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) #define LEMPEL_SIZE 1024 -size_t -zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len, +static size_t +zfs_lzjb_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) n; @@ -101,8 +101,8 @@ zfs_lzjb_compress(void *s_start, void *d_start, size_t s_len, return (dst - (uchar_t *)d_start); } -int -zfs_lzjb_decompress(void *s_start, void *d_start, +static int +zfs_lzjb_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { (void) s_len, (void) n; @@ -132,3 +132,6 @@ zfs_lzjb_decompress(void *s_start, void *d_start, } return (0); } + +ZFS_COMPRESS_WRAP_DECL(zfs_lzjb_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_lzjb_decompress) diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 9553a9377c3d..118003bd295a 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2013, 2018 by Delphix. All rights reserved. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2024, Klara, Inc. * Copyright (c) 2019, Allan Jude */ @@ -160,10 +160,10 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, if (*dst == NULL) *dst = zio_buf_alloc(s_len); - /* No compression algorithms can read from ABDs directly */ - void *tmp = abd_borrow_buf_copy(src, s_len); - c_len = ci->ci_compress(tmp, *dst, s_len, d_len, complevel); - abd_return_buf(src, tmp, s_len); + abd_t dabd; + abd_get_from_buf_struct(&dabd, dst, d_len); + c_len = ci->ci_compress(src, &dabd, s_len, d_len, complevel); + abd_free(&dabd); if (c_len > d_len) return (s_len); @@ -180,15 +180,16 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); - void *sbuf = abd_borrow_buf_copy(src, s_len); + abd_t dabd; + abd_get_from_buf_struct(&dabd, dst, d_len); int err; if (ci->ci_decompress_level != NULL && level != NULL) - err = ci->ci_decompress_level(sbuf, dst, s_len, d_len, level); + err = ci->ci_decompress_level(src, &dabd, s_len, d_len, level); else - err = ci->ci_decompress(sbuf, dst, s_len, d_len, ci->ci_level); + err = ci->ci_decompress(src, &dabd, s_len, d_len, ci->ci_level); - abd_return_buf(src, sbuf, s_len); + abd_free(&dabd); /* * Decompression shouldn't fail, because we've already verified diff --git a/module/zfs/zle.c b/module/zfs/zle.c index 32b5fe18cec6..7810161966dc 100644 --- a/module/zfs/zle.c +++ b/module/zfs/zle.c @@ -34,8 +34,8 @@ #include #include -size_t -zfs_zle_compress(void *s_start, void *d_start, size_t s_len, +static size_t +zfs_zle_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { uchar_t *src = s_start; @@ -65,8 +65,8 @@ zfs_zle_compress(void *s_start, void *d_start, size_t s_len, return (src == s_end ? 
dst - (uchar_t *)d_start : s_len); } -int -zfs_zle_decompress(void *s_start, void *d_start, size_t s_len, +static int +zfs_zle_decompress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) { uchar_t *src = s_start; @@ -91,3 +91,6 @@ zfs_zle_decompress(void *s_start, void *d_start, size_t s_len, } return (dst == d_end ? 0 : -1); } + +ZFS_COMPRESS_WRAP_DECL(zfs_zle_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_zle_decompress) diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 34ab8fd8a424..8d1d53d234b4 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -536,8 +536,8 @@ zfs_zstd_compress_impl(void *s_start, void *d_start, size_t s_len, size_t d_len, } -size_t -zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, +static size_t +zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, int level) { int16_t zstd_level; @@ -569,7 +569,10 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && s_len >= actual_abort_size) { int pass_len = 1; - pass_len = zfs_lz4_compress(s_start, d_start, s_len, d_len, 0); + abd_t sabd; + abd_get_from_buf_struct(&sabd, s_start, s_len); + pass_len = zfs_lz4_compress(&sabd, d_start, s_len, d_len, 0); + abd_free(&sabd); if (pass_len < d_len) { ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); goto keep_trying; @@ -595,8 +598,8 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, } /* Decompress block using zstd and return its stored level */ -int -zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, +static int +zfs_zstd_decompress_level_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, uint8_t *level) { ZSTD_DCtx *dctx; @@ -671,15 +674,20 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, } /* Decompress datablock using zstd */ -int -zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, - int level __maybe_unused) +static int +zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level __maybe_unused) { - return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, + return (zfs_zstd_decompress_level_buf(s_start, d_start, s_len, d_len, NULL)); } +ZFS_COMPRESS_WRAP_DECL(zfs_zstd_compress) +ZFS_DECOMPRESS_WRAP_DECL(zfs_zstd_decompress) +ZFS_DECOMPRESS_LEVEL_WRAP_DECL(zfs_zstd_decompress_level) + + /* Allocator for zstd compression context using mempool_allocator */ static void * zstd_alloc(void *opaque __maybe_unused, size_t size) From f62e6e1f985b5cc197940dcd2dc839aab0708ca2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 4 Jul 2024 14:48:38 +1000 Subject: [PATCH 54/65] compress: change zio_compress API to use ABDs This commit changes the frontend zio_compress_data and zio_decompress_data APIs to take ABD points instead of buffer pointers. All callers are updated to match. Any that already have an appropriate ABD nearby now use it directly, while at the rest we create an one. Internally, the ABDs are passed through to the provider directly. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
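As an illustrative sketch only (not part of the diff), this is the caller
pattern the change converges on when only raw buffers are at hand; `src_buf',
`dst_buf' and `comp' are placeholders for the caller's own buffers and
compression type.

    abd_t sabd, dabd;
    abd_get_from_buf_struct(&sabd, src_buf, s_len);
    abd_get_from_buf_struct(&dabd, dst_buf, d_len);
    int err = zio_decompress_data(comp, &sabd, &dabd, s_len, d_len, NULL);
    abd_free(&dabd);
    abd_free(&sabd);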
Signed-off-by: Rob Norris --- cmd/zdb/zdb.c | 37 +++++++++----- cmd/zstream/zstream_recompress.c | 17 ++++--- include/sys/zio_compress.h | 4 +- module/zfs/arc.c | 84 ++++++++++++++------------------ module/zfs/blkptr.c | 12 +++-- module/zfs/ddt_zap.c | 6 ++- module/zfs/dmu_recv.c | 7 ++- module/zfs/zio.c | 35 +++++++------ module/zfs/zio_compress.c | 20 +++----- module/zstd/zfs_zstd.c | 6 ++- 10 files changed, 116 insertions(+), 112 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c72df3909356..41c2b6765585 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4657,7 +4657,6 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; - abd_t *abd; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; @@ -4711,20 +4710,25 @@ dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; - default: - abd = abd_alloc_for_io(asize, B_TRUE); + default: { + abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); - if (zio_decompress_data(L2BLK_GET_COMPRESS( - (&lbps[0])->lbp_prop), abd, &this_lb, - asize, sizeof (this_lb), NULL) != 0) { + abd_t dabd; + abd_get_from_buf_struct(&dabd, &this_lb, + sizeof (this_lb)); + int err = zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &dabd, + asize, sizeof (this_lb), NULL); + abd_free(&dabd); + abd_free(abd); + if (err != 0) { (void) printf("L2ARC block decompression " "failed\n"); - abd_free(abd); goto out; } - abd_free(abd); break; } + } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); @@ -8618,13 +8622,22 @@ try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, memset(lbuf, 0x00, lsize); memset(lbuf2, 0xff, lsize); + abd_t labd, labd2; + abd_get_from_buf_struct(&labd, lbuf, lsize); + abd_get_from_buf_struct(&labd2, lbuf2, lsize); + + boolean_t ret = B_FALSE; if (zio_decompress_data(cfunc, pabd, - lbuf, psize, lsize, NULL) == 0 && + &labd, psize, lsize, NULL) == 0 && zio_decompress_data(cfunc, pabd, - lbuf2, psize, lsize, NULL) == 0 && + &labd2, psize, lsize, NULL) == 0 && memcmp(lbuf, lbuf2, lsize) == 0) - return (B_TRUE); - return (B_FALSE); + ret = B_TRUE; + + abd_free(&labd2); + abd_free(&labd); + + return (ret); } static uint64_t diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index 0e5cc9cd8158..32ef6fa54433 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -259,12 +259,13 @@ zstream_do_recompress(int argc, char *argv[]) /* Read and decompress the payload */ (void) sfread(cbuf, payload_size, stdin); if (dtype != ZIO_COMPRESS_OFF) { - abd_t cabd; + abd_t cabd, dabd; abd_get_from_buf_struct(&cabd, cbuf, payload_size); - if (zio_decompress_data(dtype, &cabd, dbuf, - payload_size, - MIN(bufsz, drrw->drr_logical_size), + abd_get_from_buf_struct(&dabd, dbuf, + MIN(bufsz, drrw->drr_logical_size)); + if (zio_decompress_data(dtype, &cabd, &dabd, + payload_size, abd_get_size(&dabd), NULL) != 0) { warnx("decompression type %d failed " "for ino %llu offset %llu", @@ -274,17 +275,20 @@ zstream_do_recompress(int argc, char *argv[]) exit(4); } payload_size = drrw->drr_logical_size; + abd_free(&dabd); abd_free(&cabd); free(cbuf); } /* Recompress the payload */ if (ctype != ZIO_COMPRESS_OFF) { - abd_t dabd; + abd_t dabd, abd; abd_get_from_buf_struct(&dabd, dbuf, drrw->drr_logical_size); + abd_t 
*pabd = + abd_get_from_buf_struct(&abd, buf, bufsz); payload_size = P2ROUNDUP(zio_compress_data( - ctype, &dabd, (void **)&buf, + ctype, &dabd, &pabd, drrw->drr_logical_size, level), SPA_MINBLOCKSIZE); if (payload_size != drrw->drr_logical_size) { @@ -296,6 +300,7 @@ zstream_do_recompress(int argc, char *argv[]) drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; } + abd_free(&abd); abd_free(&dabd); free(dbuf); } else { diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index d41b5dfd447f..31602039a150 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -173,9 +173,9 @@ extern int zfs_lz4_decompress(abd_t *src, abd_t *dst, size_t s_len, /* * Compress and decompress data if necessary. */ -extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void **dst, +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, uint8_t level); -extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, +extern int zio_decompress_data(enum zio_compress c, abd_t *src, abd_t *abd, size_t s_len, size_t d_len, uint8_t *level); extern int zio_compress_to_feature(enum zio_compress comp); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3c657c979cdc..714a30e863a7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1767,12 +1767,12 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); - void *tmpbuf = NULL; abd_t *abd = hdr->b_l1hdr.b_pabd; + boolean_t free_abd = B_FALSE; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT3P(abd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. @@ -1784,14 +1784,13 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - + abd = NULL; csize = zio_compress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, &tmpbuf, lsize, hdr->b_complevel); - ASSERT3P(tmpbuf, !=, NULL); + hdr->b_l1hdr.b_pabd, &abd, lsize, hdr->b_complevel); + ASSERT3P(abd, !=, NULL); ASSERT3U(csize, <=, psize); - abd = abd_get_from_buf(tmpbuf, lsize); - abd_take_ownership_of_buf(abd, B_TRUE); abd_zero_off(abd, csize, psize - csize); + free_abd = B_TRUE; } /* @@ -1810,16 +1809,10 @@ arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); - else if (ret != ENOENT) - goto error; + else if (ret == ENOENT) + ret = 0; - if (tmpbuf != NULL) - abd_free(abd); - - return (0); - -error: - if (tmpbuf != NULL) + if (free_abd) abd_free(abd); return (ret); @@ -1836,7 +1829,6 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; - void *tmp = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); @@ -1871,17 +1863,14 @@ arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) * linear buffer and wrapping it in an abd later. 
*/ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); - tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { - abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); goto error; } - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; @@ -2123,10 +2112,14 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, /* Skip byteswapping and checksumming (already done) */ return (0); } else { + abd_t dabd; + abd_get_from_buf_struct(&dabd, buf->b_data, + HDR_GET_LSIZE(hdr)); error = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, buf->b_data, + hdr->b_l1hdr.b_pabd, &dabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); + abd_free(&dabd); /* * Absent hardware errors or software bugs, this should @@ -8531,18 +8524,15 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); - void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), - hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } - abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; @@ -9037,9 +9027,8 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { - size_t bufsize = MAX(size, asize); - void *buf = zio_buf_alloc(bufsize); - uint64_t csize = zio_compress_data(compress, to_write, &buf, + cabd = abd_alloc_for_io(MAX(size, asize), ismd); + uint64_t csize = zio_compress_data(compress, to_write, &cabd, size, hdr->b_complevel); if (csize > psize) { /* @@ -9047,13 +9036,12 @@ l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, * psize. Even if it fits into asize, it does not * matter, since checksum will never match on read. 
*/ - zio_buf_free(buf, bufsize); + abd_free(cabd); return (SET_ERROR(EIO)); } if (asize > csize) - memset((char *)buf + csize, 0, asize - csize); - to_write = cabd = abd_get_from_buf(buf, bufsize); - abd_take_ownership_of_buf(cabd, B_TRUE); + abd_zero_off(cabd, csize, asize - csize); + to_write = cabd; } if (HDR_ENCRYPTED(hdr)) { @@ -10184,7 +10172,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev, { int err = 0; zio_cksum_t cksum; - abd_t *abd = NULL; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); @@ -10246,16 +10233,22 @@ l2arc_log_blk_read(l2arc_dev_t *dev, switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; - case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(asize, B_TRUE); + case ZIO_COMPRESS_LZ4: { + abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); - if ((err = zio_decompress_data( + abd_t dabd; + abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb)); + err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { + abd, &dabd, asize, sizeof (*this_lb), NULL); + abd_free(&dabd); + abd_free(abd); + if (err != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; + } default: err = SET_ERROR(EINVAL); goto cleanup; @@ -10272,8 +10265,6 @@ l2arc_log_blk_read(l2arc_dev_t *dev, l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } - if (abd != NULL) - abd_free(abd); return (err); } @@ -10509,7 +10500,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; - uint8_t *tmpbuf = NULL; + abd_t *abd = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); @@ -10532,7 +10523,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, - abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0); + abd_buf->abd, &abd, sizeof (*lb), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); @@ -10558,27 +10549,26 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ - memset(tmpbuf + psize, 0, asize - psize); + abd_zero_off(abd, psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ - memcpy(tmpbuf, lb, sizeof (*lb)); + abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ - fletcher_4_native(tmpbuf, asize, NULL, + abd_fletcher_4_native(abd, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ - abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); - abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + abd_buf->abd = abd; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c index 6a6f06c73578..ac801c2bcf3f 100644 --- a/module/zfs/blkptr.c +++ b/module/zfs/blkptr.c @@ -142,11 +142,13 @@ decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { uint8_t dstbuf[BPE_PAYLOAD_SIZE]; decode_embedded_bp_compressed(bp, dstbuf); - abd_t dstabd; - 
abd_get_from_buf_struct(&dstabd, dstbuf, psize); - VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &dstabd, - buf, psize, buflen, NULL)); - abd_free(&dstabd); + abd_t cabd, dabd; + abd_get_from_buf_struct(&cabd, dstbuf, psize); + abd_get_from_buf_struct(&dabd, buf, buflen); + VERIFY0(zio_decompress_data(BP_GET_COMPRESS(bp), &cabd, + &dabd, psize, buflen, NULL)); + abd_free(&dabd); + abd_free(&cabd); } else { ASSERT3U(lsize, ==, psize); decode_embedded_bp_compressed(bp, buf); diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index e96984b86f0c..d96dc505cdea 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -83,9 +83,11 @@ ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) return; } - abd_t sabd; + abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, src, s_len); - VERIFY0(zio_decompress_data(cpfunc, &sabd, dst, s_len, d_len, NULL)); + abd_get_from_buf_struct(&dabd, dst, d_len); + VERIFY0(zio_decompress_data(cpfunc, &sabd, &dabd, s_len, d_len, NULL)); + abd_free(&dabd); abd_free(&sabd); if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 0119191d7920..a1752650f3ba 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1391,7 +1391,7 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, abd_t *dabd = abd_alloc_linear( drrw->drr_logical_size, B_FALSE); err = zio_decompress_data(drrw->drr_compressiontype, - abd, abd_to_buf(dabd), abd_get_size(abd), + abd, dabd, abd_get_size(abd), abd_get_size(dabd), NULL); if (err != 0) { @@ -1407,9 +1407,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, /* Recompress the data */ abd_t *cabd = abd_alloc_linear(BP_GET_PSIZE(bp), B_FALSE); - void *buf = abd_to_buf(cabd); uint64_t csize = zio_compress_data(BP_GET_COMPRESS(bp), - abd, &buf, abd_get_size(abd), + abd, &cabd, abd_get_size(abd), rwa->os->os_complevel); abd_zero_off(cabd, csize, BP_GET_PSIZE(bp) - csize); /* Swap in newly compressed data into the abd */ @@ -2221,7 +2220,7 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) err = zio_decompress_data( drrw->drr_compressiontype, - abd, abd_to_buf(decomp_abd), + abd, decomp_abd, abd_get_size(abd), abd_get_size(decomp_abd), NULL); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 73252c2da970..a841e0a79107 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -487,11 +487,9 @@ static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { - void *tmp = abd_borrow_buf(data, size); int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size, + zio->io_abd, data, zio->io_size, size, &zio->io_prop.zp_complevel); - abd_return_buf_copy(data, tmp, size); if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); @@ -538,17 +536,18 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) * from the indirect block. We decompress it now and * throw away the result after we are finished. 
*/ - tmp = zio_buf_alloc(lsize); + abd_t *abd = abd_alloc_linear(lsize, B_TRUE); ret = zio_decompress_data(BP_GET_COMPRESS(bp), - zio->io_abd, tmp, zio->io_size, lsize, + zio->io_abd, abd, zio->io_size, lsize, &zio->io_prop.zp_complevel); if (ret != 0) { + abd_free(abd); ret = SET_ERROR(EIO); goto error; } - ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, - tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); - zio_buf_free(tmp, lsize); + ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, + abd, lsize, BP_SHOULD_BYTESWAP(bp), mac); + abd_free(abd); } else { ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); @@ -1866,30 +1865,32 @@ zio_write_compress(zio_t *zio) /* If it's a compressed write that is not raw, compress the buffer. */ if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { - void *cbuf = NULL; + abd_t *cabd = NULL; if (abd_cmp_zero(zio->io_abd, lsize) == 0) psize = 0; else if (compress == ZIO_COMPRESS_EMPTY) psize = lsize; else - psize = zio_compress_data(compress, zio->io_abd, &cbuf, + psize = zio_compress_data(compress, zio->io_abd, &cabd, lsize, zp->zp_complevel); if (psize == 0) { compress = ZIO_COMPRESS_OFF; } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; - if (cbuf != NULL) - zio_buf_free(cbuf, lsize); + if (cabd != NULL) + abd_free(cabd); } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + void *cbuf = abd_borrow_buf_copy(cabd, lsize); encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); + abd_return_buf(cabd, cbuf, lsize); + abd_free(cabd); BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, @@ -1908,14 +1909,12 @@ zio_write_compress(zio_t *zio) psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_free(cabd); psize = lsize; } else { - abd_t *cdata = abd_get_from_buf(cbuf, lsize); - abd_take_ownership_of_buf(cdata, B_TRUE); - abd_zero_off(cdata, psize, rounded - psize); + abd_zero_off(cabd, psize, rounded - psize); psize = rounded; - zio_push_transform(zio, cdata, + zio_push_transform(zio, cabd, psize, lsize, NULL); } } diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 118003bd295a..faf430972078 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -128,7 +128,7 @@ zio_compress_select(spa_t *spa, enum zio_compress child, } size_t -zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, +zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, uint8_t level) { size_t c_len, d_len; @@ -158,12 +158,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, } if (*dst == NULL) - *dst = zio_buf_alloc(s_len); + *dst = abd_alloc_sametype(src, s_len); - abd_t dabd; - abd_get_from_buf_struct(&dabd, dst, d_len); - c_len = ci->ci_compress(src, &dabd, s_len, d_len, complevel); - abd_free(&dabd); + c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel); if (c_len > d_len) return (s_len); @@ -173,23 +170,18 @@ zio_compress_data(enum zio_compress c, abd_t *src, void **dst, size_t s_len, } int -zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, +zio_decompress_data(enum zio_compress 
c, abd_t *src, abd_t *dst, size_t s_len, size_t d_len, uint8_t *level) { zio_compress_info_t *ci = &zio_compress_table[c]; if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) return (SET_ERROR(EINVAL)); - abd_t dabd; - abd_get_from_buf_struct(&dabd, dst, d_len); - int err; if (ci->ci_decompress_level != NULL && level != NULL) - err = ci->ci_decompress_level(src, &dabd, s_len, d_len, level); + err = ci->ci_decompress_level(src, dst, s_len, d_len, level); else - err = ci->ci_decompress(src, &dabd, s_len, d_len, ci->ci_level); - - abd_free(&dabd); + err = ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); /* * Decompression shouldn't fail, because we've already verified diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index 8d1d53d234b4..e113962f65b6 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -569,9 +569,11 @@ zfs_zstd_compress_buf(void *s_start, void *d_start, size_t s_len, size_t d_len, if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level && s_len >= actual_abort_size) { int pass_len = 1; - abd_t sabd; + abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, s_start, s_len); - pass_len = zfs_lz4_compress(&sabd, d_start, s_len, d_len, 0); + abd_get_from_buf_struct(&dabd, d_start, d_len); + pass_len = zfs_lz4_compress(&sabd, &dabd, s_len, d_len, 0); + abd_free(&dabd); abd_free(&sabd); if (pass_len < d_len) { ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed); From a9c94bea9fb3bef7704d71cd9486fbcebbe6e9c8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 5 Jul 2024 13:39:33 +1000 Subject: [PATCH 55/65] zio_compress_data: limit dest length to ABD size Some callers (eg `do_corrective_recv()`) pass in a dest buffer much smaller than the wanted 87.5% of the source buffer, because the incoming abd is larger than the source data and they "know" what the decompressed size with be. However, `abd_borrow_buf()` rightly asserts if we try to borrow more than is available, so these callers fail. Previously when all we had was a dest buffer, we didn't know how big it was, so we couldn't do anything. Now we have a dest abd, with a size, so we can clamp dest size to the abd size. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- module/zfs/zio_compress.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index faf430972078..9182917f75eb 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -135,13 +135,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, uint8_t complevel; zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT3U(c, <, ZIO_COMPRESS_FUNCTIONS); ASSERT3U(ci->ci_compress, !=, NULL); ASSERT3U(s_len, >, 0); - /* Compress at least 12.5% */ - d_len = s_len - (s_len >> 3); - complevel = ci->ci_level; if (c == ZIO_COMPRESS_ZSTD) { @@ -160,6 +156,9 @@ zio_compress_data(enum zio_compress c, abd_t *src, abd_t **dst, size_t s_len, if (*dst == NULL) *dst = abd_alloc_sametype(src, s_len); + /* Compress at least 12.5%, but limit to the size of the dest abd. 
*/ + d_len = MIN(s_len - (s_len >> 3), abd_get_size(*dst)); + c_len = ci->ci_compress(src, *dst, s_len, d_len, complevel); if (c_len > d_len) From a537d90734a16d63c79080cfd2d710745d7c02fd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jul 2024 11:43:18 +1000 Subject: [PATCH 56/65] zstream decompress: fix decompress size and output This was incorrectly using the compressed length for the size of the decompress buffer, and quietly doing nothing if the decompressor refused to decompress the block because there wasn't enough space. After that, it wasn't correctly rewriting the record to indicate "not compressed". So that's fixed now. Sigh. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- cmd/zstream/zstream_decompress.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/cmd/zstream/zstream_decompress.c b/cmd/zstream/zstream_decompress.c index f8f439d4626d..c64011e3822a 100644 --- a/cmd/zstream/zstream_decompress.c +++ b/cmd/zstream/zstream_decompress.c @@ -275,7 +275,8 @@ zstream_do_decompress(int argc, char *argv[]) if (c == ZIO_COMPRESS_OFF) { (void) sfread(buf, payload_size, stdin); - drrw->drr_compressiontype = ZIO_COMPRESS_OFF; + drrw->drr_compressiontype = 0; + drrw->drr_compressed_size = 0; if (verbose) fprintf(stderr, "Resetting compression type to " @@ -285,18 +286,32 @@ zstream_do_decompress(int argc, char *argv[]) break; } + uint64_t lsize = drrw->drr_logical_size; + ASSERT3U(payload_size, <=, lsize); + char *lzbuf = safe_calloc(payload_size); (void) sfread(lzbuf, payload_size, stdin); abd_t sabd, dabd; abd_get_from_buf_struct(&sabd, lzbuf, payload_size); - abd_get_from_buf_struct(&dabd, buf, payload_size); + abd_get_from_buf_struct(&dabd, buf, lsize); int err = zio_decompress_data(c, &sabd, &dabd, - payload_size, payload_size, NULL); + payload_size, lsize, NULL); abd_free(&dabd); abd_free(&sabd); - if (err != 0) { + if (err == 0) { + drrw->drr_compressiontype = 0; + drrw->drr_compressed_size = 0; + payload_size = lsize; + if (verbose) { + fprintf(stderr, + "successfully decompressed " + "ino %llu offset %llu\n", + (u_longlong_t)drrw->drr_object, + (u_longlong_t)drrw->drr_offset); + } + } else { /* * The block must not be compressed, at least * not with this compression type, possibly @@ -308,14 +323,6 @@ zstream_do_decompress(int argc, char *argv[]) (u_longlong_t)drrw->drr_object, (u_longlong_t)drrw->drr_offset); memcpy(buf, lzbuf, payload_size); - } else if (verbose) { - drrw->drr_compressiontype = ZIO_COMPRESS_OFF; - fprintf(stderr, "successfully decompressed " - "ino %llu offset %llu\n", - (u_longlong_t)drrw->drr_object, - (u_longlong_t)drrw->drr_offset); - } else { - drrw->drr_compressiontype = ZIO_COMPRESS_OFF; } free(lzbuf); From cb36f4f3529473d977189010f41b9a98c644d2d3 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jul 2024 11:43:18 +1000 Subject: [PATCH 57/65] zstream recompress: fix zero recompressed buffer and output If compression happend, any garbage past the compress size was not zeroed out. If compression didn't happen, then the payload size was still set to the rounded-up return from zio_compress_data(), which is dependent on the input, which is not necessarily the logical size. So that's all fixed too, mostly from stealing the math from zio.c. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Signed-off-by: Rob Norris --- cmd/zstream/zstream_recompress.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/cmd/zstream/zstream_recompress.c b/cmd/zstream/zstream_recompress.c index 32ef6fa54433..ae2c56320b2a 100644 --- a/cmd/zstream/zstream_recompress.c +++ b/cmd/zstream/zstream_recompress.c @@ -287,24 +287,26 @@ zstream_do_recompress(int argc, char *argv[]) dbuf, drrw->drr_logical_size); abd_t *pabd = abd_get_from_buf_struct(&abd, buf, bufsz); - payload_size = P2ROUNDUP(zio_compress_data( - ctype, &dabd, &pabd, - drrw->drr_logical_size, level), - SPA_MINBLOCKSIZE); - if (payload_size != drrw->drr_logical_size) { - drrw->drr_compressiontype = ctype; - drrw->drr_compressed_size = - payload_size; - } else { + size_t csize = zio_compress_data(ctype, &dabd, + &pabd, drrw->drr_logical_size, level); + size_t rounded = + P2ROUNDUP(csize, SPA_MINBLOCKSIZE); + if (rounded >= drrw->drr_logical_size) { memcpy(buf, dbuf, payload_size); drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; + } else { + abd_zero_off(pabd, csize, + rounded - csize); + drrw->drr_compressiontype = ctype; + drrw->drr_compressed_size = + payload_size = rounded; } abd_free(&abd); abd_free(&dabd); free(dbuf); } else { - drrw->drr_compressiontype = ctype; + drrw->drr_compressiontype = 0; drrw->drr_compressed_size = 0; } break; From 34118eac06fba834f0c934419aec1b386c98665a Mon Sep 17 00:00:00 2001 From: Low-power Date: Sat, 24 Aug 2024 01:39:09 +0800 Subject: [PATCH 58/65] Make mount.zfs(8) calling zfs_mount_at for legacy mounts as well Commit 329e2ffa4bca456e65c3db7f5c5c04931c551b61 has made mount.zfs(8) to call libzfs function 'zfs_mount_at', in order to propagate dataset properties into mount options. This fix however, is limited to a special use case where mount.zfs(8) is used in initrd with option '-o zfsutil'. If either initrd or the user need to use mount.zfs(8) to mount a file system with 'mountpoint' set to 'legacy', '-o zfsutil' can't be used and the original issue #7947 will still happen. Since the existing code already excluded the possibility of calling 'zfs_mount_at' when it was invoked as a helper program from zfs(8), by checking 'ZFS_MOUNT_HELPER' environment variable, it makes no sense to avoid calling 'zfs_mount_at' without '-o zfsutil'. An exception however, is when mount.zfs(8) was invoked with '-o remount' to update the mount options for an existing mount point. In this case call mount(2) directly without modifying the mount options passed from command line. Furthermore, don't run mount.zfs(8) helper for automounting snapshot. The above change to make mount.zfs(8) to call 'zfs_mount_at' apparently caused it to trigger an automount for the snapshot directory. When the helper was invoked as a result of a snapshot automount, an infinite recursion will occur. Since the need of invoking user mode mount(8) for automounting was to overcome that the 'vfs_kern_mount' being GPL-only, just run mount(8) without the mount.zfs(8) helper by adding option '-i'. 
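For illustration, this is roughly the argv that the kernel-side automount now hands to call_usermodehelper() (a sketch mirroring the change below; the two NULL slots are filled in at run time with the snapshot name and the mount path):

	static char *argv[] = {
		"/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
		NULL,	/* snapshot's full dataset name */
		NULL,	/* path to mount it on */
		NULL
	};

With '-i', mount(8) performs the mount(2) call itself instead of re-executing the mount.zfs(8) helper, so the helper can no longer re-enter the automount path.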
Reviewed-by: Umer Saleem Reviewed-by: Brian Behlendorf Signed-off-by: WHR Closes #16393 --- cmd/mount_zfs.c | 5 ++--- module/os/linux/zfs/zfs_ctldir.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index fc9220950647..283074daf717 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,8 +269,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (!zfsutil || sloppy || - libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -337,7 +336,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (zfsutil && !sloppy && + if (!remount && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 54ed70d0394f..e042116333fb 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", + NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; + argv[6] = full_name; + argv[7] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { From 2420ee6e12cb4bc4918fc88d44d59b486b86e58b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 24 Aug 2024 03:40:45 +1000 Subject: [PATCH 59/65] spl-taskq: fix task counts for delayed and cancelled tasks Dispatched delayed tasks were not added to tasks_total, and cancelled tasks were not removed. This notably could make tasks_total go to UINT64_MAX, but just generally meant the count could be wrong. So let's not! Sponsored-by: Klara, Inc. 
Sponsored-by: Syneto Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16473 --- module/os/linux/spl/spl-taskq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 29b8f5426502..c16bc9bc6409 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -620,6 +620,7 @@ taskq_cancel_id(taskq_t *tq, taskqid_t id) if (t && t != ERR_PTR(-EBUSY)) { list_del_init(&t->tqent_list); TQSTAT_DEC_LIST(tq, t); + TQSTAT_DEC(tq, tasks_total); t->tqent_flags |= TQENT_FLAG_CANCEL; TQSTAT_INC(tq, tasks_cancelled); @@ -760,6 +761,7 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, list_add_tail(&t->tqent_list, &tq->tq_delay_list); TQENT_SET_LIST(t, TQENT_LIST_DELAY); TQSTAT_INC_LIST(tq, t); + TQSTAT_INC(tq, tasks_total); t->tqent_id = rc = tq->tq_next_id; tq->tq_next_id++; From 6be8bf5552b16475629a15ab62759eb7a6d73e3b Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Mon, 26 Aug 2024 18:27:24 +0200 Subject: [PATCH 60/65] zpool: Provide GUID to zpool-reguid(8) with -g (#16239) This commit extends the zpool-reguid(8) command with a -g flag, which allows the user to specify the GUID to set. This change also adds some general tests for zpool-reguid(8). Sponsored-by: Wasabi Technology, Inc. Sponsored-by: Klara, Inc. Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter --- cmd/zpool/zpool_main.c | 23 ++++-- cmd/ztest.c | 2 +- include/libzfs.h | 1 + include/sys/fs/zfs.h | 5 ++ include/sys/spa.h | 2 +- lib/libzfs/libzfs.abi | 6 ++ lib/libzfs/libzfs_pool.c | 41 ++++++++++- man/man8/zpool-reguid.8 | 14 +++- module/zfs/spa.c | 25 ++++++- module/zfs/zfs_ioctl.c | 30 +++++++- tests/runfiles/common.run | 4 + .../cli_root/zpool_reguid/Makefile.am | 6 ++ .../cli_root/zpool_reguid/cleanup.ksh | 32 ++++++++ .../cli_root/zpool_reguid/setup.ksh | 34 +++++++++ .../zpool_reguid/zpool_reguid_001_pos.ksh | 73 +++++++++++++++++++ .../zpool_reguid/zpool_reguid_002_neg.ksh | 60 +++++++++++++++ 16 files changed, 342 insertions(+), 16 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 620746f8e7bb..9cd26a8650ad 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -537,7 +537,7 @@ get_usage(zpool_help_t idx) "\t [-o property=value] " "[ ...]\n")); case HELP_REGUID: - return (gettext("\treguid \n")); + return (gettext("\treguid [-g guid] \n")); case HELP_SYNC: return (gettext("\tsync [pool] ...\n")); case HELP_VERSION: @@ -2025,7 +2025,7 @@ zpool_do_create(int argc, char **argv) char *end; u_longlong_t ver; - ver = strtoull(propval, &end, 10); + ver = strtoull(propval, &end, 0); if (*end == '\0' && ver < SPA_VERSION_FEATURES) { enable_pool_features = B_FALSE; @@ -8232,19 +8232,32 @@ zpool_do_clear(int argc, char **argv) } /* - * zpool reguid + * zpool reguid [-g ] */ int zpool_do_reguid(int argc, char **argv) { + uint64_t guid; + uint64_t *guidp = NULL; int c; + char *endptr; char *poolname; zpool_handle_t *zhp; int ret = 0; 
/* check options */ - while ((c = getopt(argc, argv, "")) != -1) { + while ((c = getopt(argc, argv, "g:")) != -1) { switch (c) { + case 'g': + errno = 0; + guid = strtoull(optarg, &endptr, 10); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid GUID: %s\n"), optarg); + usage(B_FALSE); + } + guidp = &guid; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -8270,7 +8283,7 @@ zpool_do_reguid(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - ret = zpool_reguid(zhp); + ret = zpool_set_guid(zhp, guidp); zpool_close(zhp); return (ret); diff --git a/cmd/ztest.c b/cmd/ztest.c index 6a9264ddcc4c..7c9db84d4ea4 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -6746,7 +6746,7 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) load = spa_load_guid(spa); (void) pthread_rwlock_wrlock(&ztest_name_lock); - error = spa_change_guid(spa); + error = spa_change_guid(spa, NULL); zs->zs_guid = spa_guid(spa); (void) pthread_rwlock_unlock(&ztest_name_lock); diff --git a/include/libzfs.h b/include/libzfs.h index bf5579f38fda..2412797541de 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -300,6 +300,7 @@ _LIBZFS_H int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, _LIBZFS_H int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); _LIBZFS_H int zpool_reguid(zpool_handle_t *); +_LIBZFS_H int zpool_set_guid(zpool_handle_t *, const uint64_t *); _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c7e48d1edc0e..73d686a002ee 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1710,6 +1710,11 @@ typedef enum { #define ZPOOL_INITIALIZE_COMMAND "initialize_command" #define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" +/* + * The following are names used when invoking ZFS_IOC_POOL_REGUID. + */ +#define ZPOOL_REGUID_GUID "guid" + /* * The following are names used when invoking ZFS_IOC_POOL_TRIM. */ diff --git a/include/sys/spa.h b/include/sys/spa.h index a70912335b16..93f381affd95 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1092,7 +1092,7 @@ extern void spa_strfree(char *); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); -extern int spa_change_guid(spa_t *spa); +extern int spa_change_guid(spa_t *spa, const uint64_t *guidp); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 88baa4168c31..87c5c4380be3 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -556,6 +556,7 @@ + @@ -6639,6 +6640,11 @@ + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index e493e8562a7d..dfa7c4db6881 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -4310,22 +4310,55 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) /* * Change the GUID for a pool. + * + * Similar to zpool_reguid(), but may take a GUID. + * + * If the guid argument is NULL, then no GUID is passed in the nvlist to the + * ioctl(). 
*/ int -zpool_reguid(zpool_handle_t *zhp) +zpool_set_guid(zpool_handle_t *zhp, const uint64_t *guid) { char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nvl = NULL; zfs_cmd_t zc = {"\0"}; + int error = -1; + + if (guid != NULL) { + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (no_memory(hdl)); + + if (nvlist_add_uint64(nvl, ZPOOL_REGUID_GUID, *guid) != 0) { + nvlist_free(nvl); + return (no_memory(hdl)); + } + + zcmd_write_src_nvlist(hdl, &zc, nvl); + } (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) - return (0); + error = zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc); + if (error) { + return (zpool_standard_error(hdl, errno, errbuf)); + } + if (guid != NULL) { + zcmd_free_nvlists(&zc); + nvlist_free(nvl); + } + return (0); +} - return (zpool_standard_error(hdl, errno, errbuf)); +/* + * Change the GUID for a pool. + */ +int +zpool_reguid(zpool_handle_t *zhp) +{ + return (zpool_set_guid(zhp, NULL)); } /* diff --git a/man/man8/zpool-reguid.8 b/man/man8/zpool-reguid.8 index 1fd4ddd9a77d..4fda3f316e3b 100644 --- a/man/man8/zpool-reguid.8 +++ b/man/man8/zpool-reguid.8 @@ -25,8 +25,10 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara Inc. +.\" Copyright (c) 2024, Mateusz Piotrowski .\" -.Dd May 31, 2021 +.Dd June 21, 2023 .Dt ZPOOL-REGUID 8 .Os . @@ -36,6 +38,7 @@ .Sh SYNOPSIS .Nm zpool .Cm reguid +.Op Fl g Ar guid .Ar pool . .Sh DESCRIPTION @@ -43,6 +46,15 @@ Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. . +.Bl -tag -width Ds +.It Fl g Ar guid +Set the pool GUID to the provided value. +The GUID can be any 64-bit value accepted by +.Xr strtoull 3 +in base 10. +.Nm +will return an error if the provided GUID is already in use. +.El .Sh SEE ALSO .Xr zpool-export 8 , .Xr zpool-import 8 diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 99a8d107ecab..d51cc4fcd09a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1040,16 +1040,34 @@ spa_change_guid_sync(void *arg, dmu_tx_t *tx) * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. + * + * The GUID of the pool will be changed to the value pointed to by guidp. + * The GUID may not be set to the reserverd value of 0. + * The new GUID will be generated if guidp is NULL. 
*/ int -spa_change_guid(spa_t *spa) +spa_change_guid(spa_t *spa, const uint64_t *guidp) { - int error; uint64_t guid; + int error; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); - guid = spa_generate_guid(NULL); + + if (guidp != NULL) { + guid = *guidp; + if (guid == 0) { + error = SET_ERROR(EINVAL); + goto out; + } + + if (spa_guid_exists(guid, 0)) { + error = SET_ERROR(EEXIST); + goto out; + } + } else { + guid = spa_generate_guid(NULL); + } error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); @@ -1068,6 +1086,7 @@ spa_change_guid(spa_t *spa) spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } +out: mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 897335dd4e4f..7ce2d919610f 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1794,17 +1794,45 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * zc_nvlist_src nvlist optionally containing ZPOOL_REGUID_GUID + * zc_nvlist_src_size size of the nvlist + */ static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { + uint64_t *guidp = NULL; + nvlist_t *props = NULL; spa_t *spa; + uint64_t guid; int error; + if (zc->zc_nvlist_src_size != 0) { + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props); + if (error != 0) + return (error); + + error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid); + if (error == 0) + guidp = &guid; + else if (error == ENOENT) + guidp = NULL; + else + goto out; + } + error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { - error = spa_change_guid(spa); + error = spa_change_guid(spa, guidp); spa_close(spa, FTAG); } + +out: + if (props != NULL) + nvlist_free(props); + return (error); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ad131664698b..088e46ce578c 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -514,6 +514,10 @@ tags = ['functional', 'cli_root', 'zpool_offline'] tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] tags = ['functional', 'cli_root', 'zpool_online'] +[tests/functional/cli_root/zpool_reguid] +tests = ['zpool_reguid_001_pos', 'zpool_reguid_002_neg'] +tags = ['functional', 'cli_root', 'zpool_reguid'] + [tests/functional/cli_root/zpool_remove] tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', 'zpool_remove_003_pos'] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am new file mode 100644 index 000000000000..87d46b394015 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/Makefile.am @@ -0,0 +1,6 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/cli_root/zpool_reguid +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_reguid_001_pos.ksh \ + zpool_reguid_002_neg.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh new file mode 100755 index 000000000000..3167a5097b5a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/cleanup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh new file mode 100755 index 000000000000..3d866cfd9f20 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/setup.ksh @@ -0,0 +1,34 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh new file mode 100755 index 000000000000..4e18abd988cd --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh @@ -0,0 +1,73 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright 2023 Mateusz Piotrowski +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool reguid' can change pool's GUID. +# +# STRATEGY: +# 1. Use zpool get to obtain the initial GUID of a pool. +# 2. Change pool's GUID with zpool reguid. +# 3. Verify the GUID has changed to a random GUID. +# +# 4. Change pool's GUID with zpool reguid -g. +# 5. Verify the GUID has changed to the specified GUID. 
+# + +# set_guid guid [expected_guid] +set_guid() { + gflag_guid="$1" + expected_guid="${2:-"$gflag_guid"}" + + initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" + log_assert "Verify 'zpool reguid -g \"$gflag_guid\"' sets GUID as expected." + log_must zpool reguid -g "$gflag_guid" "$TESTPOOL" + retrieved_guid="$(zpool get -H -o value guid "$TESTPOOL")" + if [[ "$retrieved_guid" == "" ]]; then + log_fail "Unable to obtain the new GUID of pool $TESTPOOL" + fi + if [[ "$expected_guid" != "$retrieved_guid" ]]; then + log_fail "GUID set to '$retrieved_guid' instead of '$expected_guid'" + fi +} + +log_assert "Verify 'zpool reguid' picks a new random GUID for the pool." +initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" +if [[ $initial_guid == "" ]]; then + log_fail "Unable to obtain the initial GUID of pool $TESTPOOL" +fi +log_must zpool reguid "$TESTPOOL" +new_guid="$(zpool get -H -o value guid "$TESTPOOL")" +if [[ "$new_guid" == "" ]]; then + log_fail "Unable to obtain the new GUID of pool $TESTPOOL" +fi +if [[ "$initial_guid" == "$new_guid" ]]; then + log_fail "GUID change failed; GUID has not changed: $initial_guid" +fi + +for g in "$(bc -e '2^64 - 1')" 0; do + set_guid "$g" +done +# zpool-reguid(8) will strip the leading 0. +set_guid 0123 "123" +# GUID "-1" is effectively 2^64 - 1 in value. +set_guid -1 "$(bc -e '2^64 - 1')" + +log_pass "'zpool reguid' changes GUID as expected." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh new file mode 100755 index 000000000000..599041e284e2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh @@ -0,0 +1,60 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# +# Copyright 2023 Mateusz Piotrowski +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool reguid' does not accept invalid GUIDs. +# +# STRATEGY: +# 1. Call zpool reguid with an invalid GUID. +# 2. Verify that the call fails. +# 3. Verify that the pool GUID did not change. +# +# 4. Call zpool reguid with a GUID that is already in use. +# 5. Verify that the call fails. 
+# + +check_guid() { + invalid_guid="$1" + initial_guid="$(zpool get -H -o value guid "$TESTPOOL")" + log_assert "'zpool reguid' will not accept invalid GUID '$invalid_guid'" + if zpool reguid -g "$invalid_guid" "$TESTPOOL"; then + log_fail "'zpool reguid' accepted invalid GUID: $invalid_guid" + fi + final_guid="$(zpool get -H -o value guid "$TESTPOOL")" + if [[ "$initial_guid" != "$final_guid" ]]; then + log_fail "Invalid GUID change from '$initial_guid' to '$final_guid'" + fi +} + +log_assert "Verify 'zpool reguid' does not accept invalid GUIDs" + +for ig in "$(bc -e '2^64')" 0xA 0xa; do + check_guid "$ig" +done + +guid="42" +log_assert "Verify 'zpool reguid -g' does not accept GUID which are already in use" +log_must zpool reguid -g "$guid" "$TESTPOOL" +if zpool reguid -g "$guid" "$TESTPOOL"; then + log_fail "'zpool reguid' accepted GUID that was already in use: $invalid_guid" +fi + +log_pass "'zpool reguid' does not accept invalid GUIDs." From 73866cf3468f59e89baba31b93d8fdf503b10b19 Mon Sep 17 00:00:00 2001 From: Jitendra Patidar Date: Tue, 27 Aug 2024 06:06:49 +0530 Subject: [PATCH 61/65] Fix issig() to check signal_pending after dequeue SIGSTOP/SIGTSTP When process got SIGSTOP/SIGTSTP, issig() dequeue them and return 0. But process could still have another signal pending after dequeue. So, after dequeue, check and return 1, if signal_pending. Reviewed-by: Brian Behlendorf Signed-off-by: Jitendra Patidar Closes #16464 --- module/os/linux/spl/spl-thread.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index dbb8eefa7ec4..2af766ac2049 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -186,6 +186,13 @@ issig(void) schedule(); #endif + /* + * Dequeued SIGSTOP/SIGTSTP. + * Check if process has other singal pending. + */ + if (signal_pending(current)) + return (1); + return (0); } From 50b32cb925f20ececeff1b500811fa349fb419ba Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 27 Aug 2024 10:39:13 +1000 Subject: [PATCH 62/65] fm: pass io_flags through events & zed as uint64_t In 4938d01db (#14086) zio_flag_t was converted from an enum (generally signed 32-bit) to a uint64_t. The corresponding change wasn't made to the error reporting subsystem, limiting the error flags being delivered to zed to 32 bits. This bumps the whole pipeline to use uint64s. A tiny bit of compatibility is added for newer zed working agsinst an older kernel module, because its easy to do and misdetecting scrub/resilver errors and taking action is potentially dangerous. Making it work for new kernel modules against older zed seems to be far more invasive for far less benefit, so I have not. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16469 --- cmd/zed/agents/zfs_diagnosis.c | 15 ++++++++++++--- module/zfs/zfs_fm.c | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index e0ad00800add..e35cd0756c60 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -844,7 +844,6 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) const char *failmode = NULL; boolean_t checkremove = B_FALSE; uint32_t pri = 0; - int32_t flags = 0; /* * If this is a checksum or I/O error, then toss it into the @@ -922,18 +921,28 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + uint64_t flags = 0; + int32_t flags32 = 0; /* * We ignore ereports for checksum errors generated by * scrub/resilver I/O to avoid potentially further * degrading the pool while it's being repaired. + * + * Note that FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS used to + * be int32. To allow newer zed to work on older + * kernels, if we don't find the flags, we look for + * the older ones too. */ if (((nvlist_lookup_uint32(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY, &pri) == 0) && (pri == ZIO_PRIORITY_SCRUB || pri == ZIO_PRIORITY_REBUILD)) || - ((nvlist_lookup_int32(nvl, + ((nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags) == 0) && - (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { + (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) || + ((nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, &flags32) == 0) && + (flags32 & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)))) { fmd_hdl_debug(hdl, "ignoring '%s' for " "scrub/resilver I/O", class); return; diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 2f43c4aa41b8..f7cecc9af8a4 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -645,7 +645,7 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, DATA_TYPE_INT32, zio->io_error, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, - DATA_TYPE_INT32, zio->io_flags, NULL); + DATA_TYPE_UINT64, zio->io_flags, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, DATA_TYPE_UINT32, zio->io_stage, NULL); fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, From 92fca1c2d0ea743c4c92e54df028f1639634b776 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 26 Aug 2024 16:24:59 +1000 Subject: [PATCH 63/65] zstream: build with debug to fix stack overruns abd_t differs in size depending on whether or not ZFS_DEBUG is set. It turns out that libzpool is built with FORCEDEBUG_CPPFLAGS, which sets -DZFS_DEBUG, and so it always has a larger abd_t with extra debug fields, regardless of whether or not --enable-debug is set. zdb, ztest and zhack are also all built with FORCEDEBUG_CPPFLAGS, so had the same idea of the size of abd_t, but zstream was not, and used the "smaller" abd_t. In practice this didn't matter because it never used abd_t directly. This changed in b4d81b1a6, when zstream was switched to use stack ABDs for compression. When built with --enable-debug, zstream implicitly gets ZFS_DEBUG, and everything was fine. Production builds without that flag end up with the smaller abd_t, which is now mismatched with libzpool, and causes stack overruns in zstream recompress. 
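As a rough illustration, here is a hypothetical, self-contained sketch of the mismatch (the field names are invented, not the real abd.h definition; only the macro-dependent size matters):

	#include <stdio.h>

	/* A struct whose size depends on a debug macro, as abd_t's does. */
	typedef struct abd_sketch {
		unsigned int	abd_flags;
		unsigned int	abd_size;
	#ifdef ZFS_DEBUG
		void		*abd_debug_parent;	/* debug-only bookkeeping */
		long		abd_debug_refcount;
	#endif
	} abd_sketch_t;

	int
	main(void)
	{
		/*
		 * A binary compiled without ZFS_DEBUG reserves this many bytes
		 * on its stack; a library compiled with ZFS_DEBUG writes its
		 * larger layout into the same object and overruns the frame.
		 */
		printf("sizeof (abd_sketch_t) = %zu\n", sizeof (abd_sketch_t));
		return (0);
	}

Compile that once with -DZFS_DEBUG and once without and the two sizes disagree, which is exactly the disagreement between libzpool and a non-debug zstream.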
The simplest fix for now is to compile zstream with FORCEDEBUG_CPPFLAGS like the other binaries. This commit does that. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Allan Jude Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Issue #16476 Closes #16477 --- cmd/zstream/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am index 8506b351165e..f9d0b0cfd2b7 100644 --- a/cmd/zstream/Makefile.am +++ b/cmd/zstream/Makefile.am @@ -1,3 +1,5 @@ +zstream_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) + sbin_PROGRAMS += zstream CPPCHECKTARGETS += zstream From b3b7491615308d80e363854e977387f633ad9327 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 27 Aug 2024 09:44:53 +1000 Subject: [PATCH 64/65] build: rename FORCEDEBUG_CPPFLAGS to LIBZPOOL_CPPFLAGS This is just a very small attempt to make it more obvious that these flags aren't optional for libzpool-using programs, by not making it seem like there's an option to say "well, I don't _want_ to force debugging". Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Allan Jude Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Issue #16476 Closes #16477 --- cmd/Makefile.am | 4 ++-- cmd/raidz_test/Makefile.am | 2 +- cmd/zdb/Makefile.am | 2 +- cmd/zstream/Makefile.am | 2 +- config/Rules.am | 5 ++++- lib/libzpool/Makefile.am | 2 +- tests/zfs-tests/cmd/Makefile.am | 2 +- 7 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 2bd9d039f20e..96040976e53e 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -24,7 +24,7 @@ zfs_ids_to_path_LDADD = \ libzfs.la -zhack_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += zhack CPPCHECKTARGETS += zhack @@ -39,7 +39,7 @@ zhack_LDADD = \ ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) -ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += ztest CPPCHECKTARGETS += ztest diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am index 3b8b60568323..635216d65d73 100644 --- a/cmd/raidz_test/Makefile.am +++ b/cmd/raidz_test/Makefile.am @@ -1,5 +1,5 @@ raidz_test_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) -raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +raidz_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) bin_PROGRAMS += raidz_test CPPCHECKTARGETS += raidz_test diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am index ebdc19128e1a..8a4388bd1884 100644 --- a/cmd/zdb/Makefile.am +++ b/cmd/zdb/Makefile.am @@ -1,4 +1,4 @@ -zdb_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +zdb_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) zdb_CFLAGS = $(AM_CFLAGS) $(LIBCRYPTO_CFLAGS) sbin_PROGRAMS += zdb diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am index f9d0b0cfd2b7..be3539fe905d 100644 --- a/cmd/zstream/Makefile.am +++ b/cmd/zstream/Makefile.am @@ -1,4 +1,4 @@ -zstream_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += zstream CPPCHECKTARGETS += zstream diff --git a/config/Rules.am b/config/Rules.am index b462826e2c89..9c0714c82513 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -71,4 +71,7 @@ KERNEL_CFLAGS = $(FRAME_LARGER_THAN) LIBRARY_CFLAGS = -no-suppress # Forcibly enable asserts/debugging for libzpool &al. 
-FORCEDEBUG_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG +# Since ZFS_DEBUG can change shared data structures, all libzpool users must +# be compiled with the same flags. +# See https://github.com/openzfs/zfs/issues/16476 +LIBZPOOL_CPPFLAGS = -DDEBUG -UNDEBUG -DZFS_DEBUG diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 6989fefc6662..81949bf9e5b8 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -3,7 +3,7 @@ include $(srcdir)/%D%/include/Makefile.am libzpool_la_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) $(LIBRARY_CFLAGS) libzpool_la_CFLAGS += $(ZLIB_CFLAGS) -libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +libzpool_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) libzpool_la_CPPFLAGS += -I$(srcdir)/include/os/@ac_system_l@/zfs libzpool_la_CPPFLAGS += -DLIB_ZPOOL_BUILD diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 23848a82ffbd..a8df06c2e990 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -24,7 +24,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/badsend scripts_zfs_tests_bin_PROGRAMS += %D%/btree_test -%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) +%C%_btree_test_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) %C%_btree_test_LDADD = \ libzpool.la \ libzfs_core.la From b65aeb44eef0e854bed204a6b8d7db93b276d98b Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 4 Sep 2024 00:40:44 +0500 Subject: [PATCH 65/65] Revert "Make mount.zfs(8) calling zfs_mount_at for legacy mounts" This reverts commit 34118eac06fba834f0c934419aec1b386c98665a. Signed-off-by: Umer Saleem --- cmd/mount_zfs.c | 5 +++-- module/os/linux/zfs/zfs_ctldir.c | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index 283074daf717..fc9220950647 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,7 +269,8 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (!zfsutil || sloppy || + libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -336,7 +337,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (!remount && !sloppy && + if (zfsutil && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index e042116333fb..54ed70d0394f 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t *snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", - NULL, NULL, NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, + NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[6] = full_name; - argv[7] = full_path; + argv[5] = full_name; + argv[6] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) {