From 666903610d08611aefdde043b731abfa5dbb21a1 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 12 Oct 2024 03:37:57 +1100
Subject: [PATCH 01/47] zpool/zfs: allow --json wherever -j is allowed

Mostly so that with the JSON formatting options are also used, they all
look the same. To my eye, `-j --json-flat-vdevs` suggests that they are
different or unrelated, while `--json --json-flat-vdevs` invites no
further questions.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Umer Saleem <usaleem@ixsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16632
---
 cmd/zfs/zfs_main.c                            | 28 ++++++++--
 cmd/zpool/zpool_main.c                        | 10 +++-
 man/man8/zfs-list.8                           |  2 +-
 man/man8/zfs-mount.8                          |  2 +-
 man/man8/zfs-program.8                        |  2 +-
 man/man8/zfs-set.8                            |  2 +-
 man/man8/zpool-get.8                          |  4 +-
 man/man8/zpool-list.8                         |  2 +-
 man/man8/zpool-status.8                       |  2 +-
 .../functional/cli_root/json/json_sanity.ksh  | 51 +++++++++++--------
 10 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index f979fd189cc9..97397726c709 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -2162,6 +2162,7 @@ zfs_do_get(int argc, char **argv)
 	cb.cb_type = ZFS_TYPE_DATASET;
 
 	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
 		{"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT},
 		{0, 0, 0, 0}
 	};
@@ -3852,6 +3853,7 @@ zfs_do_list(int argc, char **argv)
 	nvlist_t *data = NULL;
 
 	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
 		{"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT},
 		{0, 0, 0, 0}
 	};
@@ -7436,9 +7438,15 @@ share_mount(int op, int argc, char **argv)
 	uint_t nthr;
 	jsobj = data = item = NULL;
 
+	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
+		{0, 0, 0, 0}
+	};
+
 	/* check options */
-	while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ajRlvo:Of" : "al"))
-	    != -1) {
+	while ((c = getopt_long(argc, argv,
+	    op == OP_MOUNT ? ":ajRlvo:Of" : "al",
+	    op == OP_MOUNT ? long_options : NULL, NULL)) != -1) {
 		switch (c) {
 		case 'a':
 			do_all = 1;
@@ -8374,8 +8382,14 @@ zfs_do_channel_program(int argc, char **argv)
 	boolean_t sync_flag = B_TRUE, json_output = B_FALSE;
 	zpool_handle_t *zhp;
 
+	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
+		{0, 0, 0, 0}
+	};
+
 	/* check options */
-	while ((c = getopt(argc, argv, "nt:m:j")) != -1) {
+	while ((c = getopt_long(argc, argv, "nt:m:j", long_options,
+	    NULL)) != -1) {
 		switch (c) {
 		case 't':
 		case 'm': {
@@ -9083,7 +9097,13 @@ zfs_do_version(int argc, char **argv)
 	int c;
 	nvlist_t *jsobj = NULL, *zfs_ver = NULL;
 	boolean_t json = B_FALSE;
-	while ((c = getopt(argc, argv, "j")) != -1) {
+
+	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
+		{0, 0, 0, 0}
+	};
+
+	while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) {
 		switch (c) {
 		case 'j':
 			json = B_TRUE;
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 09a6c6043280..ea180f6b705e 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7340,6 +7340,7 @@ zpool_do_list(int argc, char **argv)
 	current_prop_type = ZFS_TYPE_POOL;
 
 	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
 		{"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT},
 		{"json-pool-key-guid", no_argument, NULL,
 		    ZPOOL_OPTION_POOL_KEY_GUID},
@@ -10981,6 +10982,7 @@ zpool_do_status(int argc, char **argv)
 
 	struct option long_options[] = {
 		{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
+		{"json", no_argument, NULL, 'j'},
 		{"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT},
 		{"json-flat-vdevs", no_argument, NULL,
 		    ZPOOL_OPTION_JSON_FLAT_VDEVS},
@@ -12589,6 +12591,7 @@ zpool_do_get(int argc, char **argv)
 	current_prop_type = cb.cb_type;
 
 	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
 		{"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT},
 		{"json-pool-key-guid", no_argument, NULL,
 		    ZPOOL_OPTION_POOL_KEY_GUID},
@@ -13503,7 +13506,12 @@ zpool_do_version(int argc, char **argv)
 	int c;
 	nvlist_t *jsobj = NULL, *zfs_ver = NULL;
 	boolean_t json = B_FALSE;
-	while ((c = getopt(argc, argv, "j")) != -1) {
+
+	struct option long_options[] = {
+		{"json", no_argument, NULL, 'j'},
+	};
+
+	while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) {
 		switch (c) {
 		case 'j':
 			json = B_TRUE;
diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8
index b49def08b72b..0fa7e9cc2613 100644
--- a/man/man8/zfs-list.8
+++ b/man/man8/zfs-list.8
@@ -71,7 +71,7 @@ The following fields are displayed:
 Used for scripting mode.
 Do not print headers and separate fields by a single tab instead of arbitrary
 white space.
-.It Fl j Op Ar --json-int
+.It Fl j , -json Op Ar --json-int
 Print the output in JSON format.
 Specify
 .Sy --json-int
diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8
index 6116fbaab77f..921e2499e31b 100644
--- a/man/man8/zfs-mount.8
+++ b/man/man8/zfs-mount.8
@@ -59,7 +59,7 @@
 .Xc
 Displays all ZFS file systems currently mounted.
 .Bl -tag -width "-j"
-.It Fl j
+.It Fl j , -json
 Displays all mounted file systems in JSON format.
 .El
 .It Xo
diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8
index 928620362be7..460cd2e11cf3 100644
--- a/man/man8/zfs-program.8
+++ b/man/man8/zfs-program.8
@@ -50,7 +50,7 @@ and any attempts to access or modify other pools will cause an error.
 .
 .Sh OPTIONS
 .Bl -tag -width "-t"
-.It Fl j
+.It Fl j , -json
 Display channel program output in JSON format.
 When this flag is specified and standard output is empty -
 channel program encountered an error.
diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8
index 204450d72ec9..f1718608b44b 100644
--- a/man/man8/zfs-set.8
+++ b/man/man8/zfs-set.8
@@ -130,7 +130,7 @@ The value
 can be used to display all properties that apply to the given dataset's type
 .Pq Sy filesystem , volume , snapshot , No or Sy bookmark .
 .Bl -tag -width "-s source"
-.It Fl j Op Ar --json-int
+.It Fl j , -json Op Ar --json-int
 Display the output in JSON format.
 Specify
 .Sy --json-int
diff --git a/man/man8/zpool-get.8 b/man/man8/zpool-get.8
index 5384906f17f2..1be83526d22d 100644
--- a/man/man8/zpool-get.8
+++ b/man/man8/zpool-get.8
@@ -98,7 +98,7 @@ See the
 .Xr zpoolprops 7
 manual page for more information on the available pool properties.
 .Bl -tag -compact -offset Ds -width "-o field"
-.It Fl j Op Ar --json-int, --json-pool-key-guid
+.It Fl j , -json Op Ar --json-int, --json-pool-key-guid
 Display the list of properties in JSON format.
 Specify
 .Sy --json-int
@@ -157,7 +157,7 @@ See the
 .Xr vdevprops 7
 manual page for more information on the available pool properties.
 .Bl -tag -compact -offset Ds -width "-o field"
-.It Fl j Op Ar --json-int
+.It Fl j , -json Op Ar --json-int
 Display the list of properties in JSON format.
 Specify
 .Sy --json-int
diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8
index b0ee659701d4..6d3478a68711 100644
--- a/man/man8/zpool-list.8
+++ b/man/man8/zpool-list.8
@@ -59,7 +59,7 @@ is specified, the command exits after
 .Ar count
 reports are printed.
 .Bl -tag -width Ds
-.It Fl j Op Ar --json-int, --json-pool-key-guid
+.It Fl j , -json Op Ar --json-int, --json-pool-key-guid
 Display the list of pools in JSON format.
 Specify
 .Sy --json-int
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8
index f44f3f5f838f..b9b54185d050 100644
--- a/man/man8/zpool-status.8
+++ b/man/man8/zpool-status.8
@@ -70,7 +70,7 @@ See the
 option of
 .Nm zpool Cm iostat
 for complete details.
-.It Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid
+.It Fl j , -json Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid
 Display the status for ZFS pools in JSON format.
 Specify
 .Sy --json-int
diff --git a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh
index e598dd57181e..d092a3b0e828 100755
--- a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh
@@ -30,28 +30,39 @@
 # STRATEGY:
 # 1. Run different zfs/zpool -j commands and check for valid JSON
 
+#
+# -j and --json mean the same thing. Each command will be run twice, replacing
+# JSONFLAG with the flag under test.
 list=(
-    "zpool status -j -g --json-int --json-flat-vdevs --json-pool-key-guid"
-    "zpool status -p -j -g --json-int --json-flat-vdevs --json-pool-key-guid"
-    "zpool status -j -c upath"
-    "zpool status -j"
-    "zpool status -j testpool1"
-    "zpool list -j"
-    "zpool list -j -g"
-    "zpool list -j -o fragmentation"
-    "zpool get -j size"
-    "zpool get -j all"
-    "zpool version -j"
-    "zfs list -j"
-    "zfs list -j testpool1"
-    "zfs get -j all"
-    "zfs get -j available"
-    "zfs mount -j"
-    "zfs version -j"
+    "zpool status JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid"
+    "zpool status -p JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid"
+    "zpool status JSONFLAG -c upath"
+    "zpool status JSONFLAG"
+    "zpool status JSONFLAG testpool1"
+    "zpool list JSONFLAG"
+    "zpool list JSONFLAG -g"
+    "zpool list JSONFLAG -o fragmentation"
+    "zpool get JSONFLAG size"
+    "zpool get JSONFLAG all"
+    "zpool version JSONFLAG"
+    "zfs list JSONFLAG"
+    "zfs list JSONFLAG testpool1"
+    "zfs get JSONFLAG all"
+    "zfs get JSONFLAG available"
+    "zfs mount JSONFLAG"
+    "zfs version JSONFLAG"
 )
 
-for cmd in "${list[@]}" ; do
-    log_must eval "$cmd | jq > /dev/null" 
-done
+function run_json_tests
+{
+	typeset flag=$1
+	for cmd in "${list[@]}" ; do
+	    cmd=${cmd//JSONFLAG/$flag}
+	    log_must eval "$cmd | jq > /dev/null"
+	done
+}
+
+log_must run_json_tests -j
+log_must run_json_tests --json
 
 log_pass "zpool and zfs commands outputted valid JSON"

From 58162960a1dbcd35c72e42a4bbb821ccc5a0c505 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= <mm@FreeBSD.org>
Date: Fri, 11 Oct 2024 18:55:17 +0200
Subject: [PATCH 02/47] zdb: fix printf format in dump_zap()

When compiling zdb.c on 32-bit platforms, a format conversion error
is reported for a printf() in dump_zap().  Change %l to macro
%" PRIu64 " to match the platform size of a 64-bit unsigned integer.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Martin Matuska <mm@FreeBSD.org>
Closes #16635
---
 cmd/zdb/zdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 2c136bacb1a8..16c7025802f3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1131,7 +1131,7 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 		    !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY);
 
 		if (key64)
-			(void) printf("\t\t0x%010lx = ",
+			(void) printf("\t\t0x%010" PRIu64 "x = ",
 			    *(uint64_t *)attrp->za_name);
 		else
 			(void) printf("\t\t%s = ", attrp->za_name);

From 7f830d783b1c0cbfc504cea8b2f7052ad58242d7 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 11 Oct 2024 14:16:00 -0700
Subject: [PATCH 03/47] CI: Stick with ubuntu-22.04 for CodeQL analysis

The ubuntu-latest alias now refers to ubuntu-24.04 instead of
ubuntu-22.04 which causes CodeQL's autobuild to fail with:

    cpp/autobuilder: deptrace not supported in ubuntu 24.04

Until deptrace is supported by ubuntu-24.04 hosted runners request
ubuntu-22.04 which is supported.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Closes #16639
---
 .github/workflows/codeql.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 2656a20fea0d..e975d7dd00b9 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -11,7 +11,7 @@ concurrency:
 jobs:
   analyze:
     name: Analyze
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     permissions:
       actions: read
       contents: read

From 5bc27acf512d6f778b4f1d838fc8df0b53f5567b Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 11 Oct 2024 14:22:24 -0700
Subject: [PATCH 04/47] ZTS: Slightly increase dedup_quota limit

As described in the comment above this check the space used by
logged entries is not accounted for and some margin needs to be
added in.  While uncommon we have slightly exceeded the 600,000
threshold on some CI run so we increase the limit a bit more.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #16637
---
 tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
index 326152b510a9..b1657648b5a1 100755
--- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
+++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh
@@ -221,7 +221,7 @@ function ddt_dedup_vdev_limit
 	# For here, we just set the entry count a little higher than what we
 	# expect to allow for some instability.
 	#
-	log_must test $(ddt_entries) -le 600000
+	log_must test $(ddt_entries) -le 650000
 
 	do_clean
 }

From c645b07eaa0e1def37ffe00d63e5835e8b13742d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 11 Oct 2024 16:12:16 -0700
Subject: [PATCH 05/47] ZTS: Increase zpool_import_parallel_pos import margin

Increase the pool import time allowed by assuming a minimum reduction
to 1/2 instead of 1/3 when comparing sequential to parallel import
times.  This is sufficient to verify parallel imports are working as
intended and should address the occasional false positive failure
when the time is slightly exceeded.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #16638
---
 .../cli_root/zpool_import/zpool_import_parallel_pos.ksh       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
index 71b2437a37ec..57a7412e37cc 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh
@@ -113,7 +113,7 @@ wait
 parallel_time=$SECONDS
 log_note "asyncronously imported 4 pools in $parallel_time seconds"
 
-log_must test $parallel_time -lt $(($sequential_time / 3))
+log_must test $parallel_time -lt $(($sequential_time / 2))
 
 #
 # export pools with import delay injectors
@@ -132,6 +132,6 @@ log_must zpool import -a -d $DEVICE_DIR -f
 parallel_time=$SECONDS
 log_note "asyncronously imported 4 pools in $parallel_time seconds"
 
-log_must test $parallel_time -lt $(($sequential_time / 3))
+log_must test $parallel_time -lt $(($sequential_time / 2))
 
 log_pass "Pool imports occur in parallel"

From 56871e465a9aee71b532d20c58853c75190257ef Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Sat, 12 Oct 2024 13:48:56 -0700
Subject: [PATCH 06/47] Fallback to strerror() when strerror_l() isn't
 available

Some C libraries, such as uClibc, do not provide strerror_l() in
which case we fallback to strerror().

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #16636
Closes #16640
---
 config/user.m4     | 2 +-
 include/libzutil.h | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/config/user.m4 b/config/user.m4
index badd920d2b8a..4e31745a2abc 100644
--- a/config/user.m4
+++ b/config/user.m4
@@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [
 	ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV
 	ZFS_AC_CONFIG_USER_ZFSEXEC
 
-	AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid])
+	AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid])
 
 	AC_SUBST(RM)
 ])
diff --git a/include/libzutil.h b/include/libzutil.h
index e2108ceeaa44..f8712340cc5e 100644
--- a/include/libzutil.h
+++ b/include/libzutil.h
@@ -276,7 +276,11 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv,
  * Thread-safe strerror() for use in ZFS libraries
  */
 static inline char *zfs_strerror(int errnum) {
+#ifdef HAVE_STRERROR_L
 	return (strerror_l(errnum, uselocale(0)));
+#else
+	return (strerror(errnum));
+#endif
 }
 
 #ifdef	__cplusplus

From 77df762a1bd9180286645d4ebd195c16579225ea Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Sat, 12 Oct 2024 09:53:32 +0200
Subject: [PATCH 07/47] ZTS: Optimize Kernel Same-page Merging (KSM)

Kernel same-page Merging (KSM) allows KVM guests to share identical
memory pages. These shared pages are usually common libraries or other
identical, high-use data.

The current configuration was a bit to lazy - so KSM didn't work very
well. With the new configuration I could run 3 Linux VMs in parralel.

FreeBSD can't benefit from it. But FreeBSD is not so memory hungry in
general, so there is no need for it ;)

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #16641
---
 .github/workflows/scripts/qemu-1-setup.sh | 18 ++++++++++--------
 .github/workflows/scripts/qemu-5-setup.sh | 12 +++++++-----
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh
index ebd80a2f98c1..f838da34efff 100755
--- a/.github/workflows/scripts/qemu-1-setup.sh
+++ b/.github/workflows/scripts/qemu-1-setup.sh
@@ -18,19 +18,21 @@ ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -q -N ""
 
 # we expect RAM shortage
 cat << EOF | sudo tee /etc/ksmtuned.conf > /dev/null
+# /etc/ksmtuned.conf - Configuration file for ksmtuned
 # https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/chap-ksm
 KSM_MONITOR_INTERVAL=60
 
 # Millisecond sleep between ksm scans for 16Gb server.
 # Smaller servers sleep more, bigger sleep less.
-KSM_SLEEP_MSEC=10
-KSM_NPAGES_BOOST=300
-KSM_NPAGES_DECAY=-50
-KSM_NPAGES_MIN=64
-KSM_NPAGES_MAX=2048
-
-KSM_THRES_COEF=25
-KSM_THRES_CONST=2048
+KSM_SLEEP_MSEC=30
+
+KSM_NPAGES_BOOST=0
+KSM_NPAGES_DECAY=0
+KSM_NPAGES_MIN=1000
+KSM_NPAGES_MAX=25000
+
+KSM_THRES_COEF=80
+KSM_THRES_CONST=8192
 
 LOGFILE=/var/log/ksmtuned.log
 DEBUG=1
diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh
index 7acb67a27920..5034c1896304 100755
--- a/.github/workflows/scripts/qemu-5-setup.sh
+++ b/.github/workflows/scripts/qemu-5-setup.sh
@@ -14,17 +14,19 @@ PID=$(pidof /usr/bin/qemu-system-x86_64)
 tail --pid=$PID -f /dev/null
 sudo virsh undefine openzfs
 
+# default values per test vm:
+VMs=2
+CPU=2
+
 # definitions of per operating system
 case "$OS" in
+  # FreeBSD can't be optimized via ksmtuned
   freebsd*)
-    VMs=2
-    CPU=3
     RAM=6
     ;;
   *)
-    VMs=2
-    CPU=3
-    RAM=7
+    # Linux can be optimized via ksmtuned
+    RAM=8
     ;;
 esac
 

From b5a3825244583d10730b08dc29070ff81e84eb27 Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Sat, 12 Oct 2024 09:54:50 +0200
Subject: [PATCH 08/47] ZTS: Make use of optimal CPU pinning

With CPU pinning, we should get some speedup because of better
cpu cache re-use.

Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #16641
---
 .github/workflows/scripts/qemu-5-setup.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh
index 5034c1896304..bc40e8894b22 100755
--- a/.github/workflows/scripts/qemu-5-setup.sh
+++ b/.github/workflows/scripts/qemu-5-setup.sh
@@ -18,10 +18,12 @@ sudo virsh undefine openzfs
 VMs=2
 CPU=2
 
-# definitions of per operating system
+# cpu pinning
+CPUSET=("0,1" "2,3")
+
 case "$OS" in
-  # FreeBSD can't be optimized via ksmtuned
   freebsd*)
+    # FreeBSD can't be optimized via ksmtuned
     RAM=6
     ;;
   *)
@@ -75,6 +77,7 @@ EOF
     --cpu host-passthrough \
     --virt-type=kvm --hvm \
     --vcpus=$CPU,sockets=1 \
+    --cpuset=${CPUSET[$((i-1))]} \
     --memory $((1024*RAM)) \
     --memballoon model=virtio \
     --graphics none \

From 0409c47fe01f36c11bfde4aeeece12697b337e6d Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Wed, 9 Oct 2024 13:47:09 -0700
Subject: [PATCH 09/47] Tag 2.3.0-rc2

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index c2eff4ba0bbd..eca34a55d004 100644
--- a/META
+++ b/META
@@ -2,7 +2,7 @@ Meta:          1
 Name:          zfs
 Branch:        1.0
 Version:       2.3.0
-Release:       rc1
+Release:       rc2
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS

From 3d9129a7b6edf277ceb35acf2938285e3d410fdf Mon Sep 17 00:00:00 2001
From: Warner Losh <imp@bsdimp.com>
Date: Wed, 16 Oct 2024 11:00:40 -0600
Subject: [PATCH 10/47] freebsd: Use compiler.h from FreeBSD's base's linuxkpi

The FreeBSD linux/compiler.h in OpenZFS was copied from a very old
version of FreeBSD's linuxkpi's linux/compiler.h. There's no need for
this duplication. Use FreeBSD's linuxkpi version instead, and provide
zfs_fallthrough to augment it (it's all that's needed). Use #pragma once
to avoid naming issues for guard variables. Since this is a complete
rewrite, use my copyright here (the original code in FreeBSD still
credits everybody). This works back at least to FreeBSD 12.4, which
is not out of support, and all newer releases.

Remove extra copies of macros that were defined elsewhere, but are now
properly defined in LinuxKPI so are redundant.

Sponsored-by: Netflix
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Warner Losh <imp@bsdimp.com>
Closes #16650
---
 include/os/freebsd/linux/compiler.h  | 83 +++-------------------------
 include/os/freebsd/spl/sys/ccompat.h |  9 ---
 include/os/freebsd/spl/sys/debug.h   |  4 --
 3 files changed, 8 insertions(+), 88 deletions(-)

diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h
index b408b77c746d..24f09c722158 100644
--- a/include/os/freebsd/linux/compiler.h
+++ b/include/os/freebsd/linux/compiler.h
@@ -1,10 +1,5 @@
 /*
- * Copyright (c) 2010 Isilon Systems, Inc.
- * Copyright (c) 2010 iXsystems, Inc.
- * Copyright (c) 2010 Panasas, Inc.
- * Copyright (c) 2013-2016 Mellanox Technologies, Ltd.
- * Copyright (c) 2015 François Tigeot
- * All rights reserved.
+ * Copyright (c) 2024 Warner Losh.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -26,76 +21,14 @@
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * $FreeBSD$
  */
-#ifndef	_LINUX_COMPILER_H_
-#define	_LINUX_COMPILER_H_
-
-#include <sys/cdefs.h>
-
-#define	__user
-#define	__kernel
-#define	__safe
-#define	__force
-#define	__nocast
-#define	__iomem
-#define	__chk_user_ptr(x)		((void)0)
-#define	__chk_io_ptr(x)			((void)0)
-#define	__builtin_warning(x, y...)	(1)
-#define	__acquires(x)
-#define	__releases(x)
-#define	__acquire(x)			do { } while (0)
-#define	__release(x)			do { } while (0)
-#define	__cond_lock(x, c)		(c)
-#define	__bitwise
-#define	__devinitdata
-#define	__deprecated
-#define	__init
-#define	__initconst
-#define	__devinit
-#define	__devexit
-#define	__exit
-#define	__rcu
-#define	__percpu
-#define	__weak __weak_symbol
-#define	__malloc
-#define	___stringify(...)		#__VA_ARGS__
-#define	__stringify(...)		___stringify(__VA_ARGS__)
-#define	__attribute_const__		__attribute__((__const__))
-#undef __always_inline
-#define	__always_inline			inline
-#define	noinline			__noinline
-#define	____cacheline_aligned		__aligned(CACHE_LINE_SIZE)
-#define	zfs_fallthrough			__attribute__((__fallthrough__))
-
-#if !defined(_KERNEL) && !defined(_STANDALONE)
-#define	likely(x)			__builtin_expect(!!(x), 1)
-#define	unlikely(x)			__builtin_expect(!!(x), 0)
-#endif
-#define	typeof(x)			__typeof(x)
-
-#define	uninitialized_var(x)		x = x
-#define	__maybe_unused			__unused
-#define	__always_unused			__unused
-#define	__must_check			__result_use_check
-
-#define	__printf(a, b)			__printflike(a, b)
 
-#define	barrier()			__asm__ __volatile__("": : :"memory")
-#define	___PASTE(a, b) a##b
-#define	__PASTE(a, b) ___PASTE(a, b)
-
-#define	ACCESS_ONCE(x)			(*(volatile __typeof(x) *)&(x))
-
-#define	WRITE_ONCE(x, v) do {		\
-	barrier();			\
-	ACCESS_ONCE(x) = (v);		\
-	barrier();			\
-} while (0)
-
-#define	lockless_dereference(p) READ_ONCE(p)
+/*
+ * FreeBSD's LinuxKPI compiler.h as far back as FreeBSD 12 has what we need,
+ * except zfs_fallthrough.
+ */
+#pragma once
 
-#define	_AT(T, X)	((T)(X))
+#include <compat/linuxkpi/common/include/linux/compiler.h>
 
-#endif	/* _LINUX_COMPILER_H_ */
+#define	zfs_fallthrough			__attribute__((__fallthrough__))
diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h
index 48749fb8eea2..07b3515ad964 100644
--- a/include/os/freebsd/spl/sys/ccompat.h
+++ b/include/os/freebsd/spl/sys/ccompat.h
@@ -70,15 +70,6 @@ hlist_del(struct hlist_node *n)
 		n->next->pprev = n->pprev;
 }
 	/* BEGIN CSTYLED */
-#define	READ_ONCE(x) ({			\
-	__typeof(x) __var = ({		\
-		barrier();		\
-		ACCESS_ONCE(x);		\
-	});				\
-	barrier();			\
-	__var;				\
-})
-
 #define	HLIST_HEAD_INIT { }
 #define	HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT
 #define	INIT_HLIST_HEAD(head) (head)->first = NULL
diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h
index f041dde34fc8..9eb424dd0373 100644
--- a/include/os/freebsd/spl/sys/debug.h
+++ b/include/os/freebsd/spl/sys/debug.h
@@ -95,10 +95,6 @@ spl_assert(const char *buf, const char *file, const char *func, int line)
 #ifndef expect
 #define	expect(expr, value) (__builtin_expect((expr), (value)))
 #endif
-#ifndef __linux__
-#define	likely(expr)   expect((expr) != 0, 1)
-#define	unlikely(expr) expect((expr) != 0, 0)
-#endif
 
 #define	PANIC(fmt, a...)						\
 	spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)

From 36a67b50a230aea6ddcef277ff3715fb9b877c07 Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Thu, 17 Oct 2024 18:09:39 +0500
Subject: [PATCH 11/47] Fix inconsistent mount options for ZFS root

While mounting ZFS root during boot on Linux distributions from initrd,
mount from busybox is effectively used which executes mount system call
directly. This skips the ZFS helper mount.zfs, which checks and enables
the mount options as specified in dataset properties. As a result,
datasets mounted during boot from initrd do not have correct mount
options as specified in ZFS dataset properties.

There has been an attempt to use mount.zfs in zfs initrd script,
responsible for mounting the ZFS root filesystem (PR#13305). This was
later reverted (PR#14908) after discovering that using mount.zfs breaks
mounting of snapshots on root (/) and other child datasets of root have
the same issue (Issue#9461).

This happens because switching from busybox mount to mount.zfs correctly
parses the mount options but also adds 'mntpoint=/root' to the mount
options, which is then prepended to the snapshot mountpoint in
'.zfs/snapshot'. '/root' is the directory on Debian with initramfs-tools
where root filesystem is mounted before pivot_root. When Linux runtime
is reached, trying to access the snapshots on root results in
automounting the snapshot on '/root/.zfs/*', which fails.

This commit attempts to fix the automounting of snapshots on root, while
using mount.zfs in initrd script. Since the mountpoint of dataset is
stored in vfs_mntpoint field, we can check if current mountpoint of
dataset and vfs_mntpoint are same or not. If they are not same, reset
the vfs_mntpoint field with current mountpoint. This fixes the
mountpoints of root dataset and children in respective vfs_mntpoint
fields when we try to access the snapshots of root dataset or its
children. With correct mountpoint for root dataset and children stored
in vfs_mntpoint, all snapshots of root dataset are mounted correctly
and become accessible.

This fix will come into play only if current process, that is trying to
access the snapshots is not in chroot context. The Linux kernel API
that is used to convert struct path into char format (d_path), returns
the complete path for given struct path. It works in chroot environment
as well and returns the correct path from original filesystem root.

However d_path fails to return the complete path if any directory from
original root filesystem is mounted using --bind flag or --rbind flag
in chroot environment. In this case, if we try to access the snapshot
from outside the chroot environment, d_path returns the path correctly,
i.e. it returns the correct path to the directory that is mounted with
--bind flag. However inside the chroot environment, it only returns the
path inside chroot.

For now, there is not a better way in my understanding that gives the
complete path in char format and handles the case where directories from
root filesystem are mounted with --bind or --rbind on another path which
user will later chroot into. So this fix gets enabled if current
process trying to access the snapshot is not in chroot context.

With the snapshots issue fixed for root filesystem, using mount.zfs in
ZFS initrd script, mounts the datasets with correct mount options.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #16646
---
 contrib/initramfs/scripts/zfs            |   5 +-
 include/os/linux/zfs/sys/zfs_vfsops_os.h |   1 +
 module/os/linux/zfs/zfs_ctldir.c         | 109 +++++++++++++++++++++--
 module/os/linux/zfs/zfs_vfsops.c         |   6 +-
 4 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs
index 0a2bd2efda7a..c569b2528368 100644
--- a/contrib/initramfs/scripts/zfs
+++ b/contrib/initramfs/scripts/zfs
@@ -344,7 +344,7 @@ mount_fs()
 
 	# Need the _original_ datasets mountpoint!
 	mountpoint=$(get_fs_value "$fs" mountpoint)
-	ZFS_CMD="mount -o zfsutil -t zfs"
+	ZFS_CMD="mount.zfs -o zfsutil"
 	if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then
 		# Can't use the mountpoint property. Might be one of our
 		# clones. Check the 'org.zol:mountpoint' property set in
@@ -359,9 +359,8 @@ mount_fs()
 				# isn't the root fs.
 				return 0
 			fi
-			# Don't use mount.zfs -o zfsutils for legacy mountpoint
 			if [ "$mountpoint" = "legacy" ]; then
-				ZFS_CMD="mount -t zfs"
+				ZFS_CMD="mount.zfs"
 			fi
 			# Last hail-mary: Hope 'rootmnt' is set!
 			mountpoint=""
diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h
index 7067eb17900d..30aa3a103d33 100644
--- a/include/os/linux/zfs/sys/zfs_vfsops_os.h
+++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h
@@ -69,6 +69,7 @@ typedef struct vfs {
 	boolean_t	vfs_do_relatime;
 	boolean_t	vfs_nbmand;
 	boolean_t	vfs_do_nbmand;
+	kmutex_t	vfs_mntpt_lock;
 } vfs_t;
 
 typedef struct zfs_mnt {
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
index 8a42a075cd25..f60d6ae91e0b 100644
--- a/module/os/linux/zfs/zfs_ctldir.c
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -767,9 +767,6 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
 	uint64_t id, pos = 0;
 	int error = 0;
 
-	if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
-		return (SET_ERROR(ENOENT));
-
 	cookie = spl_fstrans_mark();
 	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 
@@ -786,8 +783,14 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
 			break;
 	}
 
-	snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
-	    zfsvfs->z_vfs->vfs_mntpoint, snapname);
+	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
+	if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
+		snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+		    zfsvfs->z_vfs->vfs_mntpoint, snapname);
+	} else
+		error = SET_ERROR(ENOENT);
+	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
+
 out:
 	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
 	spl_fstrans_unmark(cookie);
@@ -1049,6 +1052,66 @@ exportfs_flush(void)
 	(void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 }
 
+/*
+ * Returns the path in char format for given struct path. Uses
+ * d_path exported by kernel to convert struct path to char
+ * format. Returns the correct path for mountpoints and chroot
+ * environments.
+ *
+ * If chroot environment has directories that are mounted with
+ * --bind or --rbind flag, d_path returns the complete path inside
+ * chroot environment but does not return the absolute path, i.e.
+ * the path to chroot environment is missing.
+ */
+static int
+get_root_path(struct path *path, char *buff, int len)
+{
+	char *path_buffer, *path_ptr;
+	int error = 0;
+
+	path_get(path);
+	path_buffer = kmem_zalloc(len, KM_SLEEP);
+	path_ptr = d_path(path, path_buffer, len);
+	if (IS_ERR(path_ptr))
+		error = SET_ERROR(-PTR_ERR(path_ptr));
+	else
+		strcpy(buff, path_ptr);
+
+	kmem_free(path_buffer, len);
+	path_put(path);
+	return (error);
+}
+
+/*
+ * Returns if the current process root is chrooted or not. Linux
+ * kernel exposes the task_struct for current process and init.
+ * Since init process root points to actual root filesystem when
+ * Linux runtime is reached, we can compare the current process
+ * root with init process root to determine if root of the current
+ * process is different from init, which can reliably determine if
+ * current process is in chroot context or not.
+ */
+static int
+is_current_chrooted(void)
+{
+	struct task_struct *curr = current, *global = &init_task;
+	struct path cr_root, gl_root;
+
+	task_lock(curr);
+	get_fs_root(curr->fs, &cr_root);
+	task_unlock(curr);
+
+	task_lock(global);
+	get_fs_root(global->fs, &gl_root);
+	task_unlock(global);
+
+	int chrooted = !path_equal(&cr_root, &gl_root);
+	path_put(&gl_root);
+	path_put(&cr_root);
+
+	return (chrooted);
+}
+
 /*
  * Attempt to unmount a snapshot by making a call to user space.
  * There is no assurance that this can or will succeed, is just a
@@ -1123,14 +1186,50 @@ zfsctl_snapshot_mount(struct path *path, int flags)
 	if (error)
 		goto error;
 
+	if (is_current_chrooted() == 0) {
+		/*
+		 * Current process is not in chroot context
+		 */
+
+		char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+		struct path mnt_path;
+		mnt_path.mnt = path->mnt;
+		mnt_path.dentry = path->mnt->mnt_root;
+
+		/*
+		 * Get path to current mountpoint
+		 */
+		error = get_root_path(&mnt_path, m, MAXPATHLEN);
+		if (error != 0) {
+			kmem_free(m, MAXPATHLEN);
+			goto error;
+		}
+		mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
+		if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
+			/*
+			 * If current mnountpoint and vfs_mntpoint are not same,
+			 * store current mountpoint in vfs_mntpoint.
+			 */
+			if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) {
+				kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint);
+				zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
+			}
+		} else
+			zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
+		mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
+		kmem_free(m, MAXPATHLEN);
+	}
+
 	/*
 	 * Construct a mount point path from sb of the ctldir inode and dirent
 	 * name, instead of from d_path(), so that chroot'd process doesn't fail
 	 * on mount.zfs(8).
 	 */
+	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
 	snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
 	    zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
 	    dname(dentry));
+	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
 
 	snprintf(options, 7, "%s",
 	    zfs_snapshot_no_setuid ? "nosuid" : "suid");
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
index de3e8c89cfdd..3c53a8a315c3 100644
--- a/module/os/linux/zfs/zfs_vfsops.c
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -115,7 +115,7 @@ zfsvfs_vfs_free(vfs_t *vfsp)
 	if (vfsp != NULL) {
 		if (vfsp->vfs_mntpoint != NULL)
 			kmem_strfree(vfsp->vfs_mntpoint);
-
+		mutex_destroy(&vfsp->vfs_mntpt_lock);
 		kmem_free(vfsp, sizeof (vfs_t));
 	}
 }
@@ -197,10 +197,11 @@ zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
 		vfsp->vfs_do_nbmand = B_TRUE;
 		break;
 	case TOKEN_MNTPOINT:
+		if (vfsp->vfs_mntpoint != NULL)
+			kmem_strfree(vfsp->vfs_mntpoint);
 		vfsp->vfs_mntpoint = match_strdup(&args[0]);
 		if (vfsp->vfs_mntpoint == NULL)
 			return (SET_ERROR(ENOMEM));
-
 		break;
 	default:
 		break;
@@ -219,6 +220,7 @@ zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
 	int error;
 
 	tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+	mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	if (mntopts != NULL) {
 		substring_t args[MAX_OPT_ARGS];

From bcd61d95790d3b005a7124775def9a2bc3ecb128 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 15 Oct 2024 22:18:19 +1100
Subject: [PATCH 12/47] libspl/backtrace: dump registers in libunwind
 backtraces

More useful stuff, especially when trying to follow a disassembly.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16653
---
 lib/libspl/backtrace.c | 44 +++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c
index d26d742106e2..e969823c2e5d 100644
--- a/lib/libspl/backtrace.c
+++ b/lib/libspl/backtrace.c
@@ -65,32 +65,58 @@ libspl_backtrace(int fd)
 	ssize_t ret __attribute__((unused));
 	unw_context_t uc;
 	unw_cursor_t cp;
-	unw_word_t loc;
+	unw_word_t v;
 	char buf[128];
-	size_t n;
+	size_t n, c;
 
-	ret = write(fd, "Call trace:\n", 12);
 	unw_getcontext(&uc);
+
 	unw_init_local(&cp, &uc);
+	ret = write(fd, "Registers:\n", 11);
+	c = 0;
+	for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) {
+		if (unw_get_reg(&cp, regnum, &v) < 0)
+			continue;
+		const char *name = unw_regname(regnum);
+		for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {}
+		if (n == 0) {
+			buf[0] = '?';
+			n = libspl_u64_to_hex_str(regnum, 2,
+			    &buf[1], sizeof (buf)-1) + 1;
+			name = buf;
+		}
+		ret = write(fd, "      ", 5-MIN(n, 3));
+		ret = write(fd, name, n);
+		ret = write(fd, ": 0x", 4);
+		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
+		ret = write(fd, buf, n);
+		if (!(++c % 3))
+			ret = write(fd, "\n", 1);
+	}
+	if (c % 3)
+		ret = write(fd, "\n", 1);
+
+	unw_init_local(&cp, &uc);
+	ret = write(fd, "Call trace:\n", 12);
 	while (unw_step(&cp) > 0) {
-		unw_get_reg(&cp, UNW_REG_IP, &loc);
+		unw_get_reg(&cp, UNW_REG_IP, &v);
 		ret = write(fd, "  [0x", 5);
-		n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf));
+		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
 		ret = write(fd, buf, n);
 		ret = write(fd, "] ", 2);
-		unw_get_proc_name(&cp, buf, sizeof (buf), &loc);
+		unw_get_proc_name(&cp, buf, sizeof (buf), &v);
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
 		ret = write(fd, buf, n);
 		ret = write(fd, "+0x", 3);
-		n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf));
+		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
 		ret = write(fd, buf, n);
 #ifdef HAVE_LIBUNWIND_ELF
 		ret = write(fd, " (in ", 5);
-		unw_get_elf_filename(&cp, buf, sizeof (buf), &loc);
+		unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
 		ret = write(fd, buf, n);
 		ret = write(fd, " +0x", 4);
-		n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf));
+		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
 		ret = write(fd, buf, n);
 		ret = write(fd, ")", 1);
 #endif

From d5db840260a1e39af11766227132e9b4d871b21b Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 18 Oct 2024 13:49:41 +1100
Subject: [PATCH 13/47] libspl/backtrace: helper macros for output

My eyes are going blurry looking at all those write calls. This is much
nicer.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Close #16653
---
 lib/libspl/backtrace.c | 58 ++++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c
index e969823c2e5d..b4568d124dde 100644
--- a/lib/libspl/backtrace.c
+++ b/lib/libspl/backtrace.c
@@ -25,12 +25,20 @@
 
 #include <sys/backtrace.h>
 #include <sys/types.h>
+#include <sys/debug.h>
 #include <unistd.h>
 
 /*
- * libspl_backtrace() must be safe to call from inside a signal hander. This
- * mostly means it must not allocate, and so we can't use things like printf.
+ * Output helpers. libspl_backtrace() must not block, must be thread-safe and
+ * must be safe to call from a signal handler. At least, that means not having
+ * printf, so we end up having to call write() directly on the fd. That's
+ * awkward, as we always have to pass through a length, and some systems will
+ * complain if we don't consume the return. So we have some macros to make
+ * things a little more palatable.
  */
+#define	spl_bt_write_n(fd, s, n) \
+	do { ssize_t r __maybe_unused = write(fd, s, n); } while (0)
+#define	spl_bt_write(fd, s)		spl_bt_write_n(fd, s, sizeof (s))
 
 #if defined(HAVE_LIBUNWIND)
 #define	UNW_LOCAL_ONLY
@@ -62,7 +70,6 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen)
 void
 libspl_backtrace(int fd)
 {
-	ssize_t ret __attribute__((unused));
 	unw_context_t uc;
 	unw_cursor_t cp;
 	unw_word_t v;
@@ -72,7 +79,7 @@ libspl_backtrace(int fd)
 	unw_getcontext(&uc);
 
 	unw_init_local(&cp, &uc);
-	ret = write(fd, "Registers:\n", 11);
+	spl_bt_write(fd, "Registers:\n");
 	c = 0;
 	for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) {
 		if (unw_get_reg(&cp, regnum, &v) < 0)
@@ -85,42 +92,42 @@ libspl_backtrace(int fd)
 			    &buf[1], sizeof (buf)-1) + 1;
 			name = buf;
 		}
-		ret = write(fd, "      ", 5-MIN(n, 3));
-		ret = write(fd, name, n);
-		ret = write(fd, ": 0x", 4);
+		spl_bt_write_n(fd, "      ", 5-MIN(n, 3));
+		spl_bt_write_n(fd, name, n);
+		spl_bt_write(fd, ": 0x");
 		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
-		ret = write(fd, buf, n);
+		spl_bt_write_n(fd, buf, n);
 		if (!(++c % 3))
-			ret = write(fd, "\n", 1);
+			spl_bt_write(fd, "\n");
 	}
 	if (c % 3)
-		ret = write(fd, "\n", 1);
+		spl_bt_write(fd, "\n");
 
 	unw_init_local(&cp, &uc);
-	ret = write(fd, "Call trace:\n", 12);
+	spl_bt_write(fd, "Call trace:\n");
 	while (unw_step(&cp) > 0) {
 		unw_get_reg(&cp, UNW_REG_IP, &v);
-		ret = write(fd, "  [0x", 5);
+		spl_bt_write(fd, "  [0x");
 		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
-		ret = write(fd, buf, n);
-		ret = write(fd, "] ", 2);
+		spl_bt_write_n(fd, buf, n);
+		spl_bt_write(fd, "] ");
 		unw_get_proc_name(&cp, buf, sizeof (buf), &v);
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
-		ret = write(fd, buf, n);
-		ret = write(fd, "+0x", 3);
+		spl_bt_write_n(fd, buf, n);
+		spl_bt_write(fd, "+0x");
 		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
-		ret = write(fd, buf, n);
+		spl_bt_write_n(fd, buf, n);
 #ifdef HAVE_LIBUNWIND_ELF
-		ret = write(fd, " (in ", 5);
+		spl_bt_write(fd, " (in ");
 		unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
-		ret = write(fd, buf, n);
-		ret = write(fd, " +0x", 4);
+		spl_bt_write_n(fd, buf, n);
+		spl_bt_write(fd, " +0x");
 		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
-		ret = write(fd, buf, n);
-		ret = write(fd, ")", 1);
+		spl_bt_write_n(fd, buf, n);
+		spl_bt_write(fd, ")");
 #endif
-		ret = write(fd, "\n", 1);
+		spl_bt_write(fd, "\n");
 	}
 }
 #elif defined(HAVE_BACKTRACE)
@@ -129,15 +136,12 @@ libspl_backtrace(int fd)
 void
 libspl_backtrace(int fd)
 {
-	ssize_t ret __attribute__((unused));
 	void *btptrs[64];
 	size_t nptrs = backtrace(btptrs, 64);
-	ret = write(fd, "Call trace:\n", 12);
+	spl_bt_write(fd, "Call trace:\n");
 	backtrace_symbols_fd(btptrs, nptrs, fd);
 }
 #else
-#include <sys/debug.h>
-
 void
 libspl_backtrace(int fd __maybe_unused)
 {

From f52d7aaaac2472a5d2c9675dec017910881d5ecb Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 18 Oct 2024 14:30:05 +1100
Subject: [PATCH 14/47] libspl/backtrace: rename and document hex conversion
 function

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16653
---
 lib/libspl/backtrace.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c
index b4568d124dde..b541858cbc56 100644
--- a/lib/libspl/backtrace.c
+++ b/lib/libspl/backtrace.c
@@ -44,8 +44,13 @@
 #define	UNW_LOCAL_ONLY
 #include <libunwind.h>
 
+/*
+ * Convert `v` to ASCII hex characters. The bottom `n` nybbles (4-bits ie one
+ * hex digit) will be written, up to `buflen`. The buffer will not be
+ * null-terminated. Returns the number of digits written.
+ */
 static size_t
-libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen)
+spl_bt_u64_to_hex_str(uint64_t v, size_t n, char *buf, size_t buflen)
 {
 	static const char hexdigits[] = {
 	    '0', '1', '2', '3', '4', '5', '6', '7',
@@ -53,10 +58,10 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen)
 	};
 
 	size_t pos = 0;
-	boolean_t want = (digits == 0);
+	boolean_t want = (n == 0);
 	for (int i = 15; i >= 0; i--) {
 		const uint64_t d = v >> (i * 4) & 0xf;
-		if (!want && (d != 0 || digits > i))
+		if (!want && (d != 0 || n > i))
 			want = B_TRUE;
 		if (want) {
 			buf[pos++] = hexdigits[d];
@@ -88,14 +93,14 @@ libspl_backtrace(int fd)
 		for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {}
 		if (n == 0) {
 			buf[0] = '?';
-			n = libspl_u64_to_hex_str(regnum, 2,
+			n = spl_bt_u64_to_hex_str(regnum, 2,
 			    &buf[1], sizeof (buf)-1) + 1;
 			name = buf;
 		}
 		spl_bt_write_n(fd, "      ", 5-MIN(n, 3));
 		spl_bt_write_n(fd, name, n);
 		spl_bt_write(fd, ": 0x");
-		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
+		n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
 		if (!(++c % 3))
 			spl_bt_write(fd, "\n");
@@ -108,14 +113,14 @@ libspl_backtrace(int fd)
 	while (unw_step(&cp) > 0) {
 		unw_get_reg(&cp, UNW_REG_IP, &v);
 		spl_bt_write(fd, "  [0x");
-		n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf));
+		n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
 		spl_bt_write(fd, "] ");
 		unw_get_proc_name(&cp, buf, sizeof (buf), &v);
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
 		spl_bt_write_n(fd, buf, n);
 		spl_bt_write(fd, "+0x");
-		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
+		n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
 #ifdef HAVE_LIBUNWIND_ELF
 		spl_bt_write(fd, " (in ");
@@ -123,7 +128,7 @@ libspl_backtrace(int fd)
 		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
 		spl_bt_write_n(fd, buf, n);
 		spl_bt_write(fd, " +0x");
-		n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf));
+		n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
 		spl_bt_write(fd, ")");
 #endif

From b4cd10ce5b581865546d6cbd5d35c44ba8d031f4 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Fri, 18 Oct 2024 15:10:33 +1100
Subject: [PATCH 15/47] libspl/backtrace: comment and harden libunwind
 backtracer

This is the sort of code that we get right once and never look at again.
Anyone reading this code is already likely in the middle of a debugging
nightmare, and then they have a wall of manual string construction and
an unfamiliar and idiosyncratic library to deal with. So, comment the
whole thing to try to make it clear what's going on.

In pursuit of the above, I've added return checks to some of the
libunwind calls, fixed the frame loop to not skip the "top" frame
(however unseful it may be), and fix a couple of calls to
spl_bt_u64_to_hex_str() which requested 18 digits instead of 16.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16653
---
 lib/libspl/backtrace.c | 166 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 141 insertions(+), 25 deletions(-)

diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c
index b541858cbc56..6e8b3b12122d 100644
--- a/lib/libspl/backtrace.c
+++ b/lib/libspl/backtrace.c
@@ -79,61 +79,177 @@ libspl_backtrace(int fd)
 	unw_cursor_t cp;
 	unw_word_t v;
 	char buf[128];
-	size_t n, c;
+	size_t n;
+	int err;
 
+	/* Snapshot the current frame and state. */
 	unw_getcontext(&uc);
 
-	unw_init_local(&cp, &uc);
+	/*
+	 * TODO: walk back to the frame that tripped the assertion / the place
+	 *       where the signal was recieved.
+	 */
+
+	/*
+	 * Register dump. We're going to loop over all the registers in the
+	 * top frame, and show them, with names, in a nice three-column
+	 * layout, which keeps us within 80 columns.
+	 */
 	spl_bt_write(fd, "Registers:\n");
-	c = 0;
+
+	/* Initialise a frame cursor, starting at the current frame */
+	unw_init_local(&cp, &uc);
+
+	/*
+	 * libunwind's list of possible registers for this architecture is an
+	 * enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered
+	 * register in that list, however, not all register numbers in this
+	 * range are defined by the architecture, and not all defined registers
+	 * will be present on every implementation of that architecture.
+	 * Moreover, libunwind provides nice names for most, but not all
+	 * registers, but these are hardcoded; a name being available does not
+	 * mean that register is available.
+	 *
+	 * So, we have to pull this all together here. We try to get the value
+	 * of every possible register. If we get a value for it, then the
+	 * register must exist, and so we get its name. If libunwind has no
+	 * name for it, we synthesize something. These cases should be rare,
+	 * and they're usually for uninteresting or niche registers, so it
+	 * shouldn't really matter. We can see the value, and that's the main
+	 * thing.
+	 */
+	uint_t cols = 0;
 	for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) {
+		/*
+		 * Get the value. Any error probably means the register
+		 * doesn't exist, and we skip it.
+		 */
 		if (unw_get_reg(&cp, regnum, &v) < 0)
 			continue;
+
+		/*
+		 * Register name. If libunwind doesn't have a name for it,
+		 * it will return "???". As a shortcut, we just treat '?'
+		 * is an alternate end-of-string character.
+		 */
 		const char *name = unw_regname(regnum);
 		for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {}
 		if (n == 0) {
+			/*
+			 * No valid name, so make one of the form "?xx", where
+			 * "xx" is the two-char hex of libunwind's register
+			 * number.
+			 */
 			buf[0] = '?';
 			n = spl_bt_u64_to_hex_str(regnum, 2,
 			    &buf[1], sizeof (buf)-1) + 1;
 			name = buf;
 		}
+
+		/*
+		 * Two spaces of padding before each column, plus extra
+		 * spaces to align register names shorter than three chars.
+		 */
 		spl_bt_write_n(fd, "      ", 5-MIN(n, 3));
+
+		/* Register name and column punctuation */
 		spl_bt_write_n(fd, name, n);
 		spl_bt_write(fd, ": 0x");
-		n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
+
+		/*
+		 * Convert register value (from unw_get_reg()) to hex. We're
+		 * assuming that all registers are 64-bits wide, which is
+		 * probably fine for any general-purpose registers on any
+		 * machine currently in use. A more generic way would be to
+		 * look at the width of unw_word_t, but that would also
+		 * complicate the column code a bit. This is fine.
+		 */
+		n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
-		if (!(++c % 3))
+
+		/* Every third column, emit a newline */
+		if (!(++cols % 3))
 			spl_bt_write(fd, "\n");
 	}
-	if (c % 3)
+
+	/* If we finished before the third column, emit a newline. */
+	if (cols % 3)
 		spl_bt_write(fd, "\n");
 
-	unw_init_local(&cp, &uc);
+	/* Now the main event, the backtrace. */
 	spl_bt_write(fd, "Call trace:\n");
-	while (unw_step(&cp) > 0) {
-		unw_get_reg(&cp, UNW_REG_IP, &v);
+
+	/* Reset the cursor to the top again. */
+	unw_init_local(&cp, &uc);
+
+	do {
+		/*
+		 * Getting the IP should never fail; libunwind handles it
+		 * specially, because its used a lot internally. Still, no
+		 * point being silly about it, as the last thing we want is
+		 * our crash handler to crash. So if it ever does fail, we'll
+		 * show an error line, but keep going to the next frame.
+		 */
+		if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) {
+			spl_bt_write(fd, "  [couldn't get IP register; "
+			    "corrupt frame?]");
+			continue;
+		}
+
+		/* IP & punctuation */
+		n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf));
 		spl_bt_write(fd, "  [0x");
-		n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf));
 		spl_bt_write_n(fd, buf, n);
 		spl_bt_write(fd, "] ");
-		unw_get_proc_name(&cp, buf, sizeof (buf), &v);
-		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
-		spl_bt_write_n(fd, buf, n);
-		spl_bt_write(fd, "+0x");
-		n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
-		spl_bt_write_n(fd, buf, n);
+
+		/*
+		 * Function ("procedure") name for the current frame. `v`
+		 * receives the offset from the named function to the IP, which
+		 * we show as a "+offset" suffix.
+		 *
+		 * If libunwind can't determine the name, we just show "???"
+		 * instead. We've already displayed the IP above; that will
+		 * have to do.
+		 *
+		 * unw_get_proc_name() will return ENOMEM if the buffer is too
+		 * small, instead truncating the name. So we treat that as a
+		 * success and use whatever is in the buffer.
+		 */
+		err = unw_get_proc_name(&cp, buf, sizeof (buf), &v);
+		if (err == 0 || err == -UNW_ENOMEM) {
+			for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
+			spl_bt_write_n(fd, buf, n);
+
+			/* Offset from proc name */
+			spl_bt_write(fd, "+0x");
+			n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
+			spl_bt_write_n(fd, buf, n);
+		} else
+			spl_bt_write(fd, "???");
+
 #ifdef HAVE_LIBUNWIND_ELF
-		spl_bt_write(fd, " (in ");
-		unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
-		for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
-		spl_bt_write_n(fd, buf, n);
-		spl_bt_write(fd, " +0x");
-		n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
-		spl_bt_write_n(fd, buf, n);
-		spl_bt_write(fd, ")");
+		/*
+		 * Newer libunwind has unw_get_elf_filename(), which gets
+		 * the name of the ELF object that the frame was executing in.
+		 * Like `unw_get_proc_name()`, `v` recieves the offset within
+		 * the file, and UNW_ENOMEM indicates that a truncate filename
+		 * was left in the buffer.
+		 */
+		err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v);
+		if (err == 0 || err == -UNW_ENOMEM) {
+			for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {}
+			spl_bt_write(fd, " (in ");
+			spl_bt_write_n(fd, buf, n);
+
+			/* Offset within file */
+			spl_bt_write(fd, " +0x");
+			n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf));
+			spl_bt_write_n(fd, buf, n);
+			spl_bt_write(fd, ")");
+		}
 #endif
 		spl_bt_write(fd, "\n");
-	}
+	} while (unw_step(&cp) > 0);
 }
 #elif defined(HAVE_BACKTRACE)
 #include <execinfo.h>

From ace2e17a9b6db4581ce70405a8a38e164f16fe30 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Sun, 20 Oct 2024 12:39:05 -0400
Subject: [PATCH 16/47] zfs_debug: Restore log size limit for userspace

For some reason it was dropped when split from kernel, that makes
raidz_test to accumulate in RAM up to 100GB of logs we don't need.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Igor Kozhukhov <igor@dilos.org>
Reviewed-by:  Rob Norris <robn@despairlabs.com>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16492
Closes #16566
Closes #16664
---
 lib/libzpool/zfs_debug.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/lib/libzpool/zfs_debug.c b/lib/libzpool/zfs_debug.c
index df49a9a33fe8..82c7229932f0 100644
--- a/lib/libzpool/zfs_debug.c
+++ b/lib/libzpool/zfs_debug.c
@@ -35,9 +35,25 @@ typedef struct zfs_dbgmsg {
 
 static list_t zfs_dbgmsgs;
 static kmutex_t zfs_dbgmsgs_lock;
+static uint_t zfs_dbgmsg_size = 0;
+static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
 
 int zfs_dbgmsg_enable = B_TRUE;
 
+static void
+zfs_dbgmsg_purge(uint_t max_size)
+{
+	while (zfs_dbgmsg_size > max_size) {
+		zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs);
+		if (zdm == NULL)
+			return;
+
+		uint_t size = zdm->zdm_size;
+		kmem_free(zdm, size);
+		zfs_dbgmsg_size -= size;
+	}
+}
+
 void
 zfs_dbgmsg_init(void)
 {
@@ -74,6 +90,8 @@ __zfs_dbgmsg(char *buf)
 
 	mutex_enter(&zfs_dbgmsgs_lock);
 	list_insert_tail(&zfs_dbgmsgs, zdm);
+	zfs_dbgmsg_size += size;
+	zfs_dbgmsg_purge(zfs_dbgmsg_maxsize);
 	mutex_exit(&zfs_dbgmsgs_lock);
 }
 

From 2b359c7824ea538128a084836510c37ec8e99e72 Mon Sep 17 00:00:00 2001
From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Date: Mon, 21 Oct 2024 01:43:16 +0900
Subject: [PATCH 17/47] Fix compile-time warnings caused by duplicate struct
 typedefs

Some compiler/versions warn these typedefs according to #16660.

The platform specific header sys/abd_os.h shouldn't define or use abd_t,
as it's defined in its non-platform specific consumer sys/abd.h.
Do the same as what FreeBSD header does.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
Closes #16660
Closes #16665
---
 include/os/linux/spl/sys/taskq.h  |  3 +--
 include/os/linux/zfs/sys/abd_os.h | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h
index f63a397f293d..4cb3a055c3c4 100644
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -38,8 +38,7 @@
 #include <sys/rwlock.h>
 #include <sys/wait.h>
 #include <sys/wmsum.h>
-
-typedef struct kstat_s kstat_t;
+#include <sys/kstat.h>
 
 #define	TASKQ_NAMELEN		31
 
diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h
index 606e8bf682e8..3eed968e90c0 100644
--- a/include/os/linux/zfs/sys/abd_os.h
+++ b/include/os/linux/zfs/sys/abd_os.h
@@ -30,6 +30,8 @@
 extern "C" {
 #endif
 
+struct abd;
+
 struct abd_scatter {
 	uint_t		abd_offset;
 	uint_t		abd_nents;
@@ -41,10 +43,8 @@ struct abd_linear {
 	struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
 };
 
-typedef struct abd abd_t;
-
 typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
-int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
+int abd_iterate_page_func(struct abd *, size_t, size_t, abd_iter_page_func_t *,
     void *);
 
 /*
@@ -52,11 +52,11 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
  * Note: these are only needed to support vdev_classic. See comment in
  * vdev_disk.c.
  */
-unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
-unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t);
+unsigned int abd_bio_map_off(struct bio *, struct abd *, unsigned int, size_t);
+unsigned long abd_nr_pages_off(struct abd *, unsigned int, size_t);
 
 __attribute__((malloc))
-abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t);
+struct abd *abd_alloc_from_pages(struct page **, unsigned long, uint64_t);
 
 #ifdef __cplusplus
 }

From 78d39d91fa7256dab53ca08ab8be6cd11412c31d Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Mon, 21 Oct 2024 04:01:49 +1100
Subject: [PATCH 18/47] zdb: show bp in uberblock dump

Just another useful nugget of info in times of strife.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16667
---
 cmd/zdb/zdb.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 16c7025802f3..42b82c08249f 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -4266,6 +4266,10 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 	(void) printf("\ttimestamp = %llu UTC = %s",
 	    (u_longlong_t)ub->ub_timestamp, ctime(&timestamp));
 
+	char blkbuf[BP_SPRINTF_LEN];
+	snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp);
+	(void) printf("\tbp = %s\n", blkbuf);
+
 	(void) printf("\tmmp_magic = %016llx\n",
 	    (u_longlong_t)ub->ub_mmp_magic);
 	if (MMP_VALID(ub)) {

From e30c69365d42c077a501e674b1784be702d9458e Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Mon, 21 Oct 2024 13:50:13 +1100
Subject: [PATCH 19/47] config: fix dequeue_signal check for kernels <4.20

Before 4.20, kernel_siginfo_t was just called siginfo_t. This was
causing the kthread_dequeue_signal_3arg_task check, which uses
kernel_siginfo_t, to fail on older kernels.

In d6b8c17f1, we started checking for the "new" three-arg
dequeue_signal() by testing for the "old" version. Because that test is
explicitly using kernel_siginfo_t, it would fail, leading to the build
trying to use the new three-arg version, which would then not compile.

This commit fixes that by avoiding checking for the old 3-arg
dequeue_signal entirely. Instead, we check for the new one, as well as
the 4-arg form, and we use the old form as a fallback. This way, we
never have to test for it explicitly, and once we're building
HAVE_SIGINFO will make sure we get the right kernel_siginfo_t for it, so
everything works out nice.

Original-patch-by: Finix <yancw@info2soft.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16666
---
 config/kernel-kthread.m4         | 37 +++++++++++++++++++-------------
 module/os/linux/spl/spl-thread.c |  6 +++---
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/config/kernel-kthread.m4 b/config/kernel-kthread.m4
index 4d580efead6b..607953146323 100644
--- a/config/kernel-kthread.m4
+++ b/config/kernel-kthread.m4
@@ -17,14 +17,21 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_COMPLETE_AND_EXIT], [
 
 AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [
 	dnl #
-	dnl # 5.17 API: enum pid_type * as new 4th dequeue_signal() argument,
-	dnl # 5768d8906bc23d512b1a736c1e198aa833a6daa4 ("signal: Requeue signals in the appropriate queue")
+	dnl # prehistory:
+	dnl #     int dequeue_signal(struct task_struct *task, sigset_t *mask,
+	dnl #         siginfo_t *info)
 	dnl #
-	dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info);
-	dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);
+	dnl # 4.20: kernel_siginfo_t introduced, replaces siginfo_t
+	dnl #     int dequeue_signal(struct task_struct *task, sigset_t *mask,
+	dnl           kernel_siginfo_t *info)
 	dnl #
-	dnl # 6.12 API: first arg struct_task* removed
-	dnl # int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);
+	dnl # 5.17: enum pid_type introduced as 4th arg
+	dnl #     int dequeue_signal(struct task_struct *task, sigset_t *mask,
+	dnl #         kernel_siginfo_t *info, enum pid_type *type)
+	dnl #
+	dnl # 6.12: first arg struct_task* removed
+	dnl #     int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info,
+	dnl #         enum pid_type *type)
 	dnl #
 	AC_MSG_CHECKING([whether dequeue_signal() takes 4 arguments])
 	ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_4arg], [
@@ -33,11 +40,11 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [
 		    [dequeue_signal() takes 4 arguments])
 	], [
 		AC_MSG_RESULT(no)
-		AC_MSG_CHECKING([whether dequeue_signal() a task argument])
-		ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_task], [
+		AC_MSG_CHECKING([whether 3-arg dequeue_signal() takes a type argument])
+		ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_type], [
 			AC_MSG_RESULT(yes)
-			AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TASK, 1,
-			    [dequeue_signal() takes a task argument])
+			AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE, 1,
+			    [3-arg dequeue_signal() takes a type argument])
 		], [
 			AC_MSG_RESULT(no)
 		])
@@ -56,27 +63,27 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_COMPLETE_AND_EXIT], [
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_DEQUEUE_SIGNAL], [
-	ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_task], [
+	ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [
 		#include <linux/sched/signal.h>
 	], [
 		struct task_struct *task = NULL;
 		sigset_t *mask = NULL;
 		kernel_siginfo_t *info = NULL;
+		enum pid_type *type = NULL;
 		int error __attribute__ ((unused));
 
-		error = dequeue_signal(task, mask, info);
+		error = dequeue_signal(task, mask, info, type);
 	])
 
-	ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [
+	ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_type], [
 		#include <linux/sched/signal.h>
 	], [
-		struct task_struct *task = NULL;
 		sigset_t *mask = NULL;
 		kernel_siginfo_t *info = NULL;
 		enum pid_type *type = NULL;
 		int error __attribute__ ((unused));
 
-		error = dequeue_signal(task, mask, info, type);
+		error = dequeue_signal(mask, info, type);
 	])
 ])
 
diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c
index 7f74d44f91ff..7b0ce30c7884 100644
--- a/module/os/linux/spl/spl-thread.c
+++ b/module/os/linux/spl/spl-thread.c
@@ -171,11 +171,11 @@ issig(void)
 #if defined(HAVE_DEQUEUE_SIGNAL_4ARG)
 	enum pid_type __type;
 	if (dequeue_signal(current, &set, &__info, &__type) != 0) {
-#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TASK)
-	if (dequeue_signal(current, &set, &__info) != 0) {
-#else
+#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE)
 	enum pid_type __type;
 	if (dequeue_signal(&set, &__info, &__type) != 0) {
+#else
+	if (dequeue_signal(current, &set, &__info) != 0) {
 #endif
 		spin_unlock_irq(&current->sighand->siglock);
 		kernel_signal_stop();

From ede715d1e426ea11b410bc64bc8ceca8f4b1e1f1 Mon Sep 17 00:00:00 2001
From: Rob Norris <robn@despairlabs.com>
Date: Tue, 22 Oct 2024 05:38:30 +1100
Subject: [PATCH 20/47] spl/thread: explicitly define thread_func_t as noreturn

All of our thread entry functions have this signature:

    void (*)(void*) __attribute__((noreturn))

The low-level `__thread_create()` function accepts a `thread_func_t` as
the entry point, which is defined more simply as:

    void (*)(void *)

And then the `thread_create()` and `thread_create_named()` macros cast
the passed-in function point down to `thread_func_t`, that is, casting
away the `noreturn` attribute.

Clang considers casting between these two types to be invalid because
both the caller and the callee may have elided parts of the stack frame
save and restore, knowing that they won't be needed.

Recent Linux appears to be setting `-Wcast-function-type-strict`, which
causes this invalid cast to emit a warning, which with `-Werror` is
converted to an error, breaking the build.

This commit fixes this in the simplest possible way: adding `noreturn`
to the `thread_func_t` attribute. Since all our thread entry functions
already have this attribute, it's arguably a just a consistency fix
anyway.

I considered removing the casts in the macros, which silences the
warnings, but it turns out that Clang has a bug that won't emit this
error for implicit conversions, only explicit casts. So leaving them
there seems like a reasonable belt-and-suspenders approach. Also,
frankly, this whole mechanism seems a little undercooked inside LLVM, so
I'm content go with my intuition about the smallest, least invaisve
change.

**NOTE**: `__thread_create` is exported by `spl.ko` and has a
`thread_func_t` arg, so this is an ABI break. Whether that matters in
practice, I have no idea.

Further reading:
- https://github.com/llvm/llvm-project/commit/1aad641c793090b4d036c03e737df2ebe2c32c57
- https://github.com/llvm/llvm-project/issues/7325
- https://github.com/llvm/llvm-project/issues/41465

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16672
Closes #16673
---
 include/os/linux/spl/sys/thread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h
index 4f7f659e528d..c7ef7efa0a25 100644
--- a/include/os/linux/spl/sys/thread.h
+++ b/include/os/linux/spl/sys/thread.h
@@ -42,7 +42,7 @@
 #define	TS_ZOMB				EXIT_ZOMBIE
 #define	TS_STOPPED			TASK_STOPPED
 
-typedef void (*thread_func_t)(void *);
+typedef void (*thread_func_t)(void *) __attribute__((noreturn));
 
 #define	thread_create_named(name, stk, stksize, func, arg, len,	\
     pp, state, pri)	\

From 5237760b17cae246b8275a6ee388560dadf89f3b Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 21 Oct 2024 14:46:20 -0400
Subject: [PATCH 21/47] ZTS: Add additional exceptions

The following tests have been observed to occasionally fail when
running under the CI.  Updated our exceptions list to track them.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #16670
---
 tests/test-runner/bin/zts-report.py.in | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 2562836213af..07ec2c4b601b 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -213,6 +213,7 @@ maybe = {
     'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason],
     'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason],
     'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145],
+    'cli_root/zpool_import/import_devices_missing': ['FAIL', 16669],
     'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839],
     'cli_root/zpool_initialize/zpool_initialize_import_export':
         ['FAIL', 11948],
@@ -275,7 +276,8 @@ if sys.platform.startswith('freebsd'):
         'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622],
         'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
         'resilver/resilver_restart_001': ['FAIL', known_reason],
-        'snapshot/snapshot_002_pos': ['FAIL', '14831'],
+        'snapshot/snapshot_002_pos': ['FAIL', 14831],
+        'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', 16668],
         'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_corner_cases_limited':
             ['SKIP', cfr_cross_reason],

From 77d81974b608b7fdcc5185f34dc74e9c7a3df24d Mon Sep 17 00:00:00 2001
From: Tino Reichardt <milky-zfs@mcmilk.de>
Date: Thu, 24 Oct 2024 23:05:45 +0200
Subject: [PATCH 22/47] Fix dependency install on Debian 11 (#16683)

Adding cryptsetup breaks some dialog things on Debian 11.
Apply some workaround for it.

Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
---
 .github/workflows/scripts/qemu-3-deps.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh
index a2fb5e38249a..4c6227b88ed0 100755
--- a/.github/workflows/scripts/qemu-3-deps.sh
+++ b/.github/workflows/scripts/qemu-3-deps.sh
@@ -111,6 +111,7 @@ case "$1" in
     archlinux
     ;;
   debian*)
+    echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
     debian
     echo "##[group]Install Debian specific"
     sudo apt-get install -yq linux-perf dh-sequence-dkms

From 7e3ce4efaa15903d2aa6e6b014a935444dbb3f11 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Fri, 25 Oct 2024 12:03:37 -0400
Subject: [PATCH 23/47] Pack dmu_buf_impl_t by 16 bytes

On 64bit FreeBSD this reduces one from 296 to 280 bytes.  On small
block workloads dbufs may consume gigabytes of ARC, and this saves
5% of it.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16684
---
 include/sys/dbuf.h | 53 +++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 29 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 56741cd2a58b..c9bfb9a8026c 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -264,6 +264,27 @@ typedef struct dmu_buf_impl {
 	 */
 	uint8_t db_level;
 
+	/* This block was freed while a read or write was active. */
+	uint8_t db_freed_in_flight;
+
+	/*
+	 * Evict user data as soon as the dirty and reference counts are equal.
+	 */
+	uint8_t db_user_immediate_evict;
+
+	/*
+	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to evict this dbuf,
+	 * but couldn't due to outstanding references.  Evict once the refcount
+	 * drops to 0.
+	 */
+	uint8_t db_pending_evict;
+
+	/* Number of TXGs in which this buffer is dirty. */
+	uint8_t db_dirtycnt;
+
+	/* The buffer was partially read.  More reads may follow. */
+	uint8_t db_partial_read;
+
 	/*
 	 * Protects db_buf's contents if they contain an indirect block or data
 	 * block of the meta-dnode. We use this lock to protect the structure of
@@ -288,6 +309,9 @@ typedef struct dmu_buf_impl {
 	 */
 	dbuf_states_t db_state;
 
+	/* In which dbuf cache this dbuf is, if any. */
+	dbuf_cached_state_t db_caching_status;
+
 	/*
 	 * Refcount accessed by dmu_buf_{hold,rele}.
 	 * If nonzero, the buffer can't be destroyed.
@@ -304,39 +328,10 @@ typedef struct dmu_buf_impl {
 	/* Link in dbuf_cache or dbuf_metadata_cache */
 	multilist_node_t db_cache_link;
 
-	/* Tells us which dbuf cache this dbuf is in, if any */
-	dbuf_cached_state_t db_caching_status;
-
 	uint64_t db_hash;
 
-	/* Data which is unique to data (leaf) blocks: */
-
 	/* User callback information. */
 	dmu_buf_user_t *db_user;
-
-	/*
-	 * Evict user data as soon as the dirty and reference
-	 * counts are equal.
-	 */
-	uint8_t db_user_immediate_evict;
-
-	/*
-	 * This block was freed while a read or write was
-	 * active.
-	 */
-	uint8_t db_freed_in_flight;
-
-	/*
-	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
-	 * evict this dbuf, but couldn't due to outstanding
-	 * references.  Evict once the refcount drops to 0.
-	 */
-	uint8_t db_pending_evict;
-
-	uint8_t db_dirtycnt;
-
-	/* The buffer was partially read.  More reads may follow. */
-	uint8_t db_partial_read;
 } dmu_buf_impl_t;
 
 #define	DBUF_HASH_MUTEX(h, idx) \

From f7b4bca66a1e8382bef314ec8ab14c99e3f54f84 Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 25 Oct 2024 09:07:44 -0700
Subject: [PATCH 24/47] ZTS: Add LUKS sanity test

Add a LUKS sanity test to trigger: #16631

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16681
---
 .github/workflows/scripts/qemu-3-deps.sh      | 35 ++++----
 tests/runfiles/linux.run                      |  6 ++
 tests/zfs-tests/include/commands.cfg          |  1 +
 tests/zfs-tests/tests/Makefile.am             |  3 +-
 .../tests/functional/luks/luks_sanity.ksh     | 90 +++++++++++++++++++
 5 files changed, 117 insertions(+), 18 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/luks/luks_sanity.ksh

diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh
index 4c6227b88ed0..bf06c1892ceb 100755
--- a/.github/workflows/scripts/qemu-3-deps.sh
+++ b/.github/workflows/scripts/qemu-3-deps.sh
@@ -13,10 +13,10 @@ function archlinux() {
   echo "##[endgroup]"
 
   echo "##[group]Install Development Tools"
-  sudo pacman -Sy --noconfirm base-devel bc cpio dhclient dkms fakeroot \
-    fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils parted \
-    pax perf python-packaging python-setuptools qemu-guest-agent ksh samba \
-    sysstat rng-tools rsync wget xxhash
+  sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \
+    fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \
+    parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \
+    samba sysstat rng-tools rsync wget xxhash
   echo "##[endgroup]"
 }
 
@@ -30,11 +30,11 @@ function debian() {
 
   echo "##[group]Install Development Tools"
   sudo apt-get install -y \
-    acl alien attr autoconf bc cpio curl dbench dh-python dkms fakeroot \
-    fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev libaio-dev \
-    libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev libelf-dev \
-    libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev libtool \
-    libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \
+    acl alien attr autoconf bc cpio cryptsetup curl dbench dh-python dkms \
+    fakeroot fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev \
+    libaio-dev libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev \
+    libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \
+    libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \
     lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \
     python3-cffi python3-dev python3-distlib python3-packaging \
     python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \
@@ -68,14 +68,15 @@ function rhel() {
   echo "##[group]Install Development Tools"
   sudo dnf group install -y "Development Tools"
   sudo dnf install -y \
-    acl attr bc bzip2 curl dbench dkms elfutils-libelf-devel fio gdb git \
-    jq kernel-rpm-macros ksh libacl-devel libaio-devel libargon2-devel \
-    libattr-devel libblkid-devel libcurl-devel libffi-devel ncompress \
-    libselinux-devel libtirpc-devel libtool libudev-devel libuuid-devel \
-    lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester parted perf \
-    python3 python3-cffi python3-devel python3-packaging kernel-devel \
-    python3-setuptools qemu-guest-agent rng-tools rpcgen rpm-build rsync \
-    samba sysstat systemd watchdog wget xfsprogs-devel xxhash zlib-devel
+    acl attr bc bzip2 cryptsetup curl dbench dkms elfutils-libelf-devel fio \
+    gdb git jq kernel-rpm-macros ksh libacl-devel libaio-devel \
+    libargon2-devel libattr-devel libblkid-devel libcurl-devel libffi-devel \
+    ncompress libselinux-devel libtirpc-devel libtool libudev-devel \
+    libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \
+    parted perf python3 python3-cffi python3-devel python3-packaging \
+    kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \
+    rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \
+    zlib-devel
   echo "##[endgroup]"
 }
 
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 5534cd27f637..76d07a6cc9c1 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -147,6 +147,12 @@ tags = ['functional', 'largest_pool']
 tests = ['longname_001_pos', 'longname_002_pos', 'longname_003_pos']
 tags = ['functional', 'longname']
 
+[tests/functional/luks:Linux]
+pre =
+post =
+tests = ['luks_sanity']
+tags = ['functional', 'luks']
+
 [tests/functional/mmap:Linux]
 tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos']
 tags = ['functional', 'mmap']
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index be41ce5210e8..5985b5fe1526 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -129,6 +129,7 @@ export SYSTEM_FILES_LINUX='attr
     blkdiscard
     blockdev
     chattr
+    cryptsetup
     exportfs
     fallocate
     flock
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index bc767b9f624f..7d1551a63f0d 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -80,7 +80,8 @@ if BUILD_LINUX
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/simd/simd_supported.ksh \
 	functional/tmpfile/cleanup.ksh \
-	functional/tmpfile/setup.ksh
+	functional/tmpfile/setup.ksh \
+	functional/luks/luks_sanity.ksh
 endif
 
 nobase_dist_datadir_zfs_tests_tests_DATA += \
diff --git a/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh
new file mode 100755
index 000000000000..9cee26503de7
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh
@@ -0,0 +1,90 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+# DESCRIPTION:
+#	Verify ZFS works on a LUKS-backed pool
+#
+# STRATEGY:
+#	1. Create a LUKS device
+#	2. Make a pool with it
+#	3. Write files to the pool
+#	4. Verify no errors
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+VDEV=$(mktemp --suffix=luks_sanity)
+TESTPOOL=testpool
+
+function cleanup
+{
+	log_must zpool destroy $TESTPOOL
+
+	log_must cryptsetup luksClose /dev/mapper/luksdev
+	log_must rm -f $VDEV
+}
+
+log_assert "Verify ZFS on LUKS works"
+log_onexit cleanup
+
+PASS="fdsjfosdijfsdkjsldfjdlk"
+
+# Make a small LUKS device since LUKS formatting takes time and we want to
+# make this test run as quickly as possible.
+truncate -s 100M $VDEV
+
+log_must cryptsetup luksFormat --type luks2 $VDEV <<< $PASS
+log_must cryptsetup luksOpen $VDEV luksdev <<< $PASS
+
+log_must zpool create $TESTPOOL /dev/mapper/luksdev
+
+CPUS=$(get_num_cpus)
+
+# Use these specific size and offset ranges as they often cause errors with
+# https://github.com/openzfs/zfs/issues/16631
+# and we want to try to test for that.
+for SIZE in {70..100} ; do
+	for OFF in {70..100} ; do
+		for i in {1..$CPUS} ; do
+			dd if=/dev/urandom of=/$TESTPOOL/file$i-bs$SIZE-off$OFF \
+			    seek=$OFF bs=$SIZE count=1 &>/dev/null &
+		done
+		wait
+	done
+	sync_pool $TESTPOOL
+	rm -f /$TESTPOOL/file*
+done
+
+# Verify no read/write/checksum errors.  Don't use JSON here so that we could
+# could potentially backport this test case to the 2.2.x branch.
+if zpool status -e | grep -q "luksdev" ; then
+	log_note "$(zpool status -v)"
+	log_fail "Saw errors writing to LUKS device"
+fi
+
+log_pass "Verified ZFS on LUKS works"

From fd2cae969ff83841ae35a5bf5e7b82b8a3a5f80b Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Tue, 29 Oct 2024 19:49:54 +0100
Subject: [PATCH 25/47] Fix gcc unused value warning in FreeBSD simd.h

The macros `simd_stat_init()` and `simd_stat_fini()` in FreeBSD's
`simd.h` are defined as zero, but they are actually only used as
statements. Replace the definitions with `do {} while (0)` instead, to
avoid gcc `-Wunused-value` warnings.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Toomas Soome <tsoome@me.com>
Signed-off-by: Dimitry Andric <dimitry@andric.com>
Closes #16693
---
 include/os/freebsd/spl/sys/simd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h
index 6bc46755c4e3..d16e1db5e826 100644
--- a/include/os/freebsd/spl/sys/simd.h
+++ b/include/os/freebsd/spl/sys/simd.h
@@ -50,7 +50,7 @@
 #define	kfpu_fini()		do {} while (0)
 #endif
 
-#define	simd_stat_init()	0
-#define	simd_stat_fini()	0
+#define	simd_stat_init()	do {} while (0)
+#define	simd_stat_fini()	do {} while (0)
 
 #endif

From f3823a9ab2d1d8dfbf5f0b8df64a47317bbe7fda Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dimitry@andric.com>
Date: Tue, 29 Oct 2024 20:05:02 +0100
Subject: [PATCH 26/47] Fix gcc uninitialized warning in FreeBSD zio_crypt.c

In FreeBSD's `zio_do_crypt_data()`, ensure that two `struct uio`
variables are cleared before copying data out of them. This avoids
accessing garbage data, and fixes gcc `-Wuninitialized` warnings.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Toomas Soome <tsoome@me.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Dimitry Andric <dimitry@andric.com>
Closes #16688
---
 module/os/freebsd/zfs/zio_crypt.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c
index 2b62abcccb78..feaca93fb933 100644
--- a/module/os/freebsd/zfs/zio_crypt.c
+++ b/module/os/freebsd/zfs/zio_crypt.c
@@ -1686,11 +1686,10 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
 	freebsd_crypt_session_t *tmpl = NULL;
 	uint8_t *authbuf = NULL;
 
-
+	memset(&puio_s, 0, sizeof (puio_s));
+	memset(&cuio_s, 0, sizeof (cuio_s));
 	zfs_uio_init(&puio, &puio_s);
 	zfs_uio_init(&cuio, &cuio_s);
-	memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio));
-	memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio));
 
 #ifdef FCRYPTO_DEBUG
 	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",

From bbc0d34bfd2b6714d96c98d3fd252431e03c6148 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Tue, 29 Oct 2024 15:23:24 -0400
Subject: [PATCH 27/47] On the first vdev open ignore impossible ashift hints

If on the first open device's logical ashift is bigger than set
by pool's ashift property, ignore the last as unusable instead of
creating vdev that will fail most of I/Os due to misalignment.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Signed-off-by:  Alexander Motin <mav@FreeBSD.org>
Sponsored by:   iXsystems, Inc.
Closes #16690
---
 module/zfs/vdev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index bcab46c63bfa..983f444d79b0 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -2205,10 +2205,11 @@ vdev_open(vdev_t *vd)
 		vd->vdev_max_asize = max_asize;
 
 		/*
-		 * If the vdev_ashift was not overridden at creation time,
+		 * If the vdev_ashift was not overridden at creation time
+		 * (0) or the override value is impossible for the device,
 		 * then set it the logical ashift and optimize the ashift.
 		 */
-		if (vd->vdev_ashift == 0) {
+		if (vd->vdev_ashift < vd->vdev_logical_ashift) {
 			vd->vdev_ashift = vd->vdev_logical_ashift;
 
 			if (vd->vdev_logical_ashift > ASHIFT_MAX) {

From 8ac70aade7aff78b77d682d863b96376ffb4cc1e Mon Sep 17 00:00:00 2001
From: Serapheim Dimitropoulos <serapheimd@gmail.com>
Date: Wed, 30 Oct 2024 17:11:40 -0700
Subject: [PATCH 28/47] Add warning for external consumers of
 dmu_tx_callback_register

While reading some code @grwilson came across the above function that
seemingly had no consumers besides a ztest callback that ensures that
the tx_callback infrastructure works correctly. It turns out that Lustre
is the main (and potentially the only) consumer of this. Refer to
`osd_trans_commit_cb` of `lustre/osd-zfs/osd_handler.c` in the Lustre
repo for more info. Let's add a comment highlighting this before someone
removes it by mistake.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Serapheim Dimitropoulos <serapheimd@gmail.com>
Closes #16698
---
 module/zfs/dmu_tx.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index 3fdcebdff918..6aee7afb6954 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -1377,6 +1377,13 @@ dmu_tx_pool(dmu_tx_t *tx)
 	return (tx->tx_pool);
 }
 
+/*
+ * Register a callback to be executed at the end of a TXG.
+ *
+ * Note: This currently exists for outside consumers, specifically the ZFS OSD
+ * for Lustre. Please do not remove before checking that project. For examples
+ * on how to use this see `ztest_commit_callback`.
+ */
 void
 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
 {

From 19a8dd48e1ae9759f66e9204f9985d1abe2a68b0 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 25 Oct 2024 16:14:37 +1100
Subject: [PATCH 29/47] vdev_disk: try harder to ensure IO alignment rules

It seems out our notion of "properly" aligned IO was incomplete. In
particular, dm-crypt does its own splitting, and assumes that a logical
block will never cross an order-0 page boundary (ie, the physical page
size, not compound size). This effectively means that it needs to be
possible to split a BIO at any page or block size boundary and have it
work correctly.

This updates the alignment check function to enforce these rules (to the
extent possible).

Our response to misaligned data is to make some new allocation that is
properly aligned, and copy the data into it. It turns out that
linearising (via abd_borrow_buf()) is not enough, because we allocate eg
4K blocks from a general purpose slab, and so may receive (or already
have) a 4K block that crosses pages.

So instead, we allocate a new ABD, which is guaranteed to be aligned
properly to block sizes, and then copy everything into it, and back out
on the way back.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16687 #16631 #15646 #15533 #14533
---
 module/os/linux/zfs/vdev_disk.c               | 120 ++++----
 .../functional/vdev_disk/page_alignment.c     | 264 +++++++++++-------
 2 files changed, 230 insertions(+), 154 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index a6271d3a7df1..7a4944410906 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -805,14 +805,11 @@ vbio_completion(struct bio *bio)
 	 * to the ADB, with changes if appropriate.
 	 */
 	if (vbio->vbio_abd != NULL) {
-		void *buf = abd_to_buf(vbio->vbio_abd);
+		if (zio->io_type == ZIO_TYPE_READ)
+			abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size);
+
 		abd_free(vbio->vbio_abd);
 		vbio->vbio_abd = NULL;
-
-		if (zio->io_type == ZIO_TYPE_READ)
-			abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
-		else
-			abd_return_buf(zio->io_abd, buf, zio->io_size);
 	}
 
 	/* Final cleanup */
@@ -834,38 +831,61 @@ vbio_completion(struct bio *bio)
  * NOTE: if you change this function, change the copy in
  * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test
  * data there to validate the change you're making.
- *
  */
 typedef struct {
-	uint_t  bmask;
-	uint_t  npages;
-	uint_t  end;
-} vdev_disk_check_pages_t;
+	size_t	blocksize;
+	int	seen_first;
+	int	seen_last;
+} vdev_disk_check_alignment_t;
 
 static int
-vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
+    void *priv)
 {
 	(void) page;
-	vdev_disk_check_pages_t *s = priv;
+	vdev_disk_check_alignment_t *s = priv;
 
 	/*
-	 * If we didn't finish on a block size boundary last time, then there
-	 * would be a gap if we tried to use this ABD as-is, so abort.
+	 * The cardinal rule: a single on-disk block must never cross an
+	 * physical (order-0) page boundary, as the kernel expects to be able
+	 * to split at both LBS and page boundaries.
+	 *
+	 * This implies various alignment rules for the blocks in this
+	 * (possibly compound) page, which we can check for.
 	 */
-	if (s->end != 0)
-		return (1);
 
 	/*
-	 * Note if we're taking less than a full block, so we can check it
-	 * above on the next call.
+	 * If the previous page did not end on a page boundary, then we
+	 * can't proceed without creating a hole.
 	 */
-	s->end = (off+len) & s->bmask;
+	if (s->seen_last)
+		return (1);
 
-	/* All blocks after the first must start on a block size boundary. */
-	if (s->npages != 0 && (off & s->bmask) != 0)
+	/* This page must contain only whole LBS-sized blocks. */
+	if (!IS_P2ALIGNED(len, s->blocksize))
 		return (1);
 
-	s->npages++;
+	/*
+	 * If this is not the first page in the ABD, then the data must start
+	 * on a page-aligned boundary (so the kernel can split on page
+	 * boundaries without having to deal with a hole). If it is, then
+	 * it can start on LBS-alignment.
+	 */
+	if (s->seen_first) {
+		if (!IS_P2ALIGNED(off, PAGESIZE))
+			return (1);
+	} else {
+		if (!IS_P2ALIGNED(off, s->blocksize))
+			return (1);
+		s->seen_first = 1;
+	}
+
+	/*
+	 * If this data does not end on a page-aligned boundary, then this
+	 * must be the last page in the ABD, for the same reason.
+	 */
+	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
+
 	return (0);
 }
 
@@ -874,15 +894,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
  * the number of pages, or 0 if it can't be submitted like this.
  */
 static boolean_t
-vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
+vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev)
 {
-	vdev_disk_check_pages_t s = {
-	    .bmask = bdev_logical_block_size(bdev)-1,
-	    .npages = 0,
-	    .end = 0,
+	vdev_disk_check_alignment_t s = {
+	    .blocksize = bdev_logical_block_size(bdev),
 	};
 
-	if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
+	if (abd_iterate_page_func(abd, 0, size,
+	    vdev_disk_check_alignment_cb, &s))
 		return (B_FALSE);
 
 	return (B_TRUE);
@@ -916,37 +935,32 @@ vdev_disk_io_rw(zio_t *zio)
 
 	/*
 	 * Check alignment of the incoming ABD. If any part of it would require
-	 * submitting a page that is not aligned to the logical block size,
-	 * then we take a copy into a linear buffer and submit that instead.
-	 * This should be impossible on a 512b LBS, and fairly rare on 4K,
-	 * usually requiring abnormally-small data blocks (eg gang blocks)
-	 * mixed into the same ABD as larger ones (eg aggregated).
+	 * submitting a page that is not aligned to both the logical block size
+	 * and the page size, then we take a copy into a new memory region with
+	 * correct alignment.  This should be impossible on a 512b LBS. On
+	 * larger blocks, this can happen at least when a small number of
+	 * blocks (usually 1) are allocated from a shared slab, or when
+	 * abnormally-small data regions (eg gang headers) are mixed into the
+	 * same ABD as larger allocations (eg aggregations).
 	 */
 	abd_t *abd = zio->io_abd;
-	if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
-		void *buf;
-		if (zio->io_type == ZIO_TYPE_READ)
-			buf = abd_borrow_buf(zio->io_abd, zio->io_size);
-		else
-			buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+	if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) {
+		/* Allocate a new memory region with guaranteed alignment */
+		abd = abd_alloc_for_io(zio->io_size,
+		    zio->io_abd->abd_flags & ABD_FLAG_META);
 
-		/*
-		 * Wrap the copy in an abd_t, so we can use the same iterators
-		 * to count and fill the vbio later.
-		 */
-		abd = abd_get_from_buf(buf, zio->io_size);
+		/* If we're writing copy our data into it */
+		if (zio->io_type == ZIO_TYPE_WRITE)
+			abd_copy(abd, zio->io_abd, zio->io_size);
 
 		/*
-		 * False here would mean the borrowed copy has an invalid
-		 * alignment too, which would mean we've somehow been passed a
-		 * linear ABD with an interior page that has a non-zero offset
-		 * or a size not a multiple of PAGE_SIZE. This is not possible.
-		 * It would mean either zio_buf_alloc() or its underlying
-		 * allocators have done something extremely strange, or our
-		 * math in vdev_disk_check_pages() is wrong. In either case,
+		 * False here would mean the new allocation has an invalid
+		 * alignment too, which would mean that abd_alloc() is not
+		 * guaranteeing this, or our logic in
+		 * vdev_disk_check_alignment() is wrong. In either case,
 		 * something in seriously wrong and its not safe to continue.
 		 */
-		VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
+		VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev));
 	}
 
 	/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
index 5c6d28eb2c44..7b926da6c01c 100644
--- a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
+++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c
@@ -30,7 +30,7 @@
 
 /*
  * This tests the vdev_disk page alignment check callback
- * vdev_disk_check_pages_cb(). For now, this test includes a copy of that
+ * vdev_disk_check_alignment_cb(). For now, this test includes a copy of that
  * function from module/os/linux/zfs/vdev_disk.c. If you change it here,
  * remember to change it there too, and add tests data here to validate the
  * change you're making.
@@ -38,36 +38,69 @@
 
 struct page;
 
+/*
+ * This is spl_pagesize() in userspace, which requires linking libspl, but
+ * would also then use the platform page size, which isn't what we want for
+ * a test. To keep the check callback the same as the real one, we just
+ * redefine it.
+ */
+#undef	PAGESIZE
+#define	PAGESIZE	(4096)
+
 typedef struct {
-	uint32_t  bmask;
-	uint32_t  npages;
-	uint32_t  end;
-} vdev_disk_check_pages_t;
+	size_t	blocksize;
+	int	seen_first;
+	int	seen_last;
+} vdev_disk_check_alignment_t;
 
 static int
-vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
+vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len,
+    void *priv)
 {
 	(void) page;
-	vdev_disk_check_pages_t *s = priv;
+	vdev_disk_check_alignment_t *s = priv;
 
 	/*
-	 * If we didn't finish on a block size boundary last time, then there
-	 * would be a gap if we tried to use this ABD as-is, so abort.
+	 * The cardinal rule: a single on-disk block must never cross an
+	 * physical (order-0) page boundary, as the kernel expects to be able
+	 * to split at both LBS and page boundaries.
+	 *
+	 * This implies various alignment rules for the blocks in this
+	 * (possibly compound) page, which we can check for.
 	 */
-	if (s->end != 0)
-		return (1);
 
 	/*
-	 * Note if we're taking less than a full block, so we can check it
-	 * above on the next call.
+	 * If the previous page did not end on a page boundary, then we
+	 * can't proceed without creating a hole.
 	 */
-	s->end = (off+len) & s->bmask;
+	if (s->seen_last)
+		return (1);
 
-	/* All blocks after the first must start on a block size boundary. */
-	if (s->npages != 0 && (off & s->bmask) != 0)
+	/* This page must contain only whole LBS-sized blocks. */
+	if (!IS_P2ALIGNED(len, s->blocksize))
 		return (1);
 
-	s->npages++;
+	/*
+	 * If this is not the first page in the ABD, then the data must start
+	 * on a page-aligned boundary (so the kernel can split on page
+	 * boundaries without having to deal with a hole). If it is, then
+	 * it can start on LBS-alignment.
+	 */
+	if (s->seen_first) {
+		if (!IS_P2ALIGNED(off, PAGESIZE))
+			return (1);
+	} else {
+		if (!IS_P2ALIGNED(off, s->blocksize))
+			return (1);
+		s->seen_first = 1;
+	}
+
+	/*
+	 * If this data does not end on a page-aligned boundary, then this
+	 * must be the last page in the ABD, for the same reason.
+	 */
+	s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE);
+
 	return (0);
 }
 
@@ -75,8 +108,8 @@ typedef struct {
 	/* test name */
 	const char	*name;
 
-	/* blocks size mask */
-	uint32_t	mask;
+	/* stored block size */
+	uint32_t	blocksize;
 
 	/* amount of data to take */
 	size_t		size;
@@ -89,39 +122,39 @@ static const page_test_t valid_tests[] = {
 	/* 512B block tests */
 	{
 		"512B blocks, 4K single page",
-		0x1ff, 0x1000, {
+		512, 0x1000, {
 			{ 0x0, 0x1000 },
 		},
 	}, {
 		"512B blocks, 1K at start of page",
-		0x1ff, 0x400, {
+		512, 0x400, {
 			{ 0x0, 0x1000 },
 		},
 	}, {
 		"512B blocks, 1K at end of page",
-		0x1ff, 0x400, {
+		512, 0x400, {
 			{ 0x0c00, 0x0400 },
 		},
 	}, {
 		"512B blocks, 1K within page, 512B start offset",
-		0x1ff, 0x400, {
+		512, 0x400, {
 			{ 0x0200, 0x0e00 },
 		},
 	}, {
 		"512B blocks, 8K across 2x4K pages",
-		0x1ff, 0x2000, {
+		512, 0x2000, {
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x1000 },
 		},
 	}, {
 		"512B blocks, 4K across two pages, 2K start offset",
-		0x1ff, 0x1000, {
+		512, 0x1000, {
 			{ 0x0800, 0x0800 },
 			{ 0x0,    0x0800 },
 		},
 	}, {
 		"512B blocks, 16K across 5x4K pages, 512B start offset",
-		0x1ff, 0x4000, {
+		512, 0x4000, {
 			{ 0x0200, 0x0e00 },
 			{ 0x0,    0x1000 },
 			{ 0x0,    0x1000 },
@@ -130,7 +163,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"512B blocks, 64K data, 8x8K compound pages",
-		0x1ff, 0x10000, {
+		512, 0x10000, {
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x2000 },
@@ -142,7 +175,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"512B blocks, 64K data, 9x8K compound pages, 512B start offset",
-		0x1ff, 0x10000, {
+		512, 0x10000, {
 			{ 0x0200, 0x1e00 },
 			{ 0x0,    0x2000 },
 			{ 0x0,    0x2000 },
@@ -155,7 +188,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"512B blocks, 64K data, 2x16K compound pages, 8x4K pages",
-		0x1ff, 0x10000, {
+		512, 0x10000, {
 			{ 0x0, 0x8000 },
 			{ 0x0, 0x8000 },
 			{ 0x0, 0x1000 },
@@ -169,7 +202,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"512B blocks, 64K data, mixed 4K/8K/16K pages",
-		0x1ff, 0x10000, {
+		512, 0x10000, {
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x1000 },
@@ -183,7 +216,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
-		0x1ff, 0x10000, {
+		512, 0x10000, {
 			{ 0x0400, 0x0c00 },
 			{ 0x0,    0x1000 },
 			{ 0x0,    0x1000 },
@@ -200,48 +233,18 @@ static const page_test_t valid_tests[] = {
 	/* 4K block tests */
 	{
 		"4K blocks, 4K single page",
-		0xfff, 0x1000, {
-			{ 0x0, 0x1000 },
-		},
-	}, {
-		"4K blocks, 1K at start of page",
-		0xfff, 0x400, {
+		4096, 0x1000, {
 			{ 0x0, 0x1000 },
 		},
-	}, {
-		"4K blocks, 1K at end of page",
-		0xfff, 0x400, {
-			{ 0x0c00, 0x0400 },
-		},
-	}, {
-		"4K blocks, 1K within page, 512B start offset",
-		0xfff, 0x400, {
-			{ 0x0200, 0x0e00 },
-		},
 	}, {
 		"4K blocks, 8K across 2x4K pages",
-		0xfff, 0x2000, {
+		4096, 0x2000, {
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x1000 },
 		},
-	}, {
-		"4K blocks, 4K across two pages, 2K start offset",
-		0xfff, 0x1000, {
-			{ 0x0800, 0x0800 },
-			{ 0x0,    0x0800 },
-		},
-	}, {
-		"4K blocks, 16K across 5x4K pages, 512B start offset",
-		0xfff, 0x4000, {
-			{ 0x0200, 0x0e00 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x0200 },
-		},
 	}, {
 		"4K blocks, 64K data, 8x8K compound pages",
-		0xfff, 0x10000, {
+		4096, 0x10000, {
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x2000 },
@@ -251,22 +254,9 @@ static const page_test_t valid_tests[] = {
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x2000 },
 		},
-	}, {
-		"4K blocks, 64K data, 9x8K compound pages, 512B start offset",
-		0xfff, 0x10000, {
-			{ 0x0200, 0x1e00 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x0200 },
-		},
 	}, {
 		"4K blocks, 64K data, 2x16K compound pages, 8x4K pages",
-		0xfff, 0x10000, {
+		4096, 0x10000, {
 			{ 0x0, 0x8000 },
 			{ 0x0, 0x8000 },
 			{ 0x0, 0x1000 },
@@ -280,7 +270,7 @@ static const page_test_t valid_tests[] = {
 		},
 	}, {
 		"4K blocks, 64K data, mixed 4K/8K/16K pages",
-		0xfff, 0x10000, {
+		4096, 0x10000, {
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x2000 },
 			{ 0x0, 0x1000 },
@@ -292,29 +282,19 @@ static const page_test_t valid_tests[] = {
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x2000 },
 		},
-	}, {
-		"4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
-		0xfff, 0x10000, {
-			{ 0x0400, 0x0c00 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x2000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x8000 },
-			{ 0x0,    0x1000 },
-			{ 0x0,    0x0400 },
-		},
 	},
 
 	{ 0 },
 };
 
 static const page_test_t invalid_tests[] = {
+	/*
+	 * Gang tests. Composed of lots of smaller allocations, rarely properly
+	 * aligned.
+	 */
 	{
 		"512B blocks, 16K data, 512 leader (gang block simulation)",
-		0x1ff, 0x8000, {
+		512, 0x8000, {
 			{ 0x0, 0x0200 },
 			{ 0x0, 0x1000 },
 			{ 0x0, 0x1000 },
@@ -324,7 +304,7 @@ static const page_test_t invalid_tests[] = {
 	}, {
 		"4K blocks, 32K data, 2 incompatible spans "
 		"(gang abd simulation)",
-		0xfff, 0x8000, {
+		4096, 0x8000, {
 			{ 0x0800, 0x0800 },
 			{ 0x0,    0x1000 },
 			{ 0x0,    0x1000 },
@@ -337,6 +317,90 @@ static const page_test_t invalid_tests[] = {
 			{ 0x0,    0x0800 },
 		},
 	},
+
+	/*
+	 * Blocks must not span multiple physical pages. These tests used to
+	 * be considered valid, but were since found to be invalid and were
+	 * moved here.
+	 */
+	{
+		"4K blocks, 4K across two pages, 2K start offset",
+		4096, 0x1000, {
+			{ 0x0800, 0x0800 },
+			{ 0x0,    0x0800 },
+		},
+	}, {
+		"4K blocks, 16K across 5x4K pages, 512B start offset",
+		4096, 0x4000, {
+			{ 0x0200, 0x0e00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, 9x8K compound pages, 512B start offset",
+		4096, 0x10000, {
+			{ 0x0200, 0x1e00 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x0200 },
+		},
+	}, {
+		"4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset",
+		4096, 0x10000, {
+			{ 0x0400, 0x0c00 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x2000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x8000 },
+			{ 0x0,    0x1000 },
+			{ 0x0,    0x0400 },
+		},
+	},
+
+	/*
+	 * This is the very typical case of a 4K block being allocated from
+	 * the middle of a mixed-used slab backed by a higher-order compound
+	 * page.
+	 */
+	{
+		"4K blocks, 4K data from compound slab, 2K-align offset",
+		4096, 0x1000, {
+			{ 0x1800, 0x6800 }
+		}
+	},
+
+	/*
+	 * Blocks smaller than LBS should never be possible, but used to be by
+	 * accident (see GH#16990). We test for and reject them just to be
+	 * sure.
+	 */
+	{
+		"4K blocks, 1K at end of page",
+		4096, 0x400, {
+			{ 0x0c00, 0x0400 },
+		},
+	}, {
+		"4K blocks, 1K at start of page",
+		4096, 0x400, {
+			{ 0x0, 0x1000 },
+		},
+	}, {
+		"4K blocks, 1K within page, 512B start offset",
+		4096, 0x400, {
+			{ 0x0200, 0x0e00 },
+		},
+	},
+
 	{ 0 },
 };
 
@@ -345,10 +409,8 @@ run_test(const page_test_t *test, bool verbose)
 {
 	size_t rem = test->size;
 
-	vdev_disk_check_pages_t s = {
-	    .bmask = 0xfff,
-	    .npages = 0,
-	    .end = 0,
+	vdev_disk_check_alignment_t s = {
+	    .blocksize = test->blocksize,
 	};
 
 	for (int i = 0; test->pages[i][1] > 0; i++) {
@@ -362,7 +424,7 @@ run_test(const page_test_t *test, bool verbose)
 			    "rem %lx, take %lx\n",
 			    i, off, len, rem, take);
 
-		if (vdev_disk_check_pages_cb(NULL, off, take, &s)) {
+		if (vdev_disk_check_alignment_cb(NULL, off, take, &s)) {
 			if (verbose)
 				printf("  ABORT: misalignment detected, "
 				    "rem %lx\n", rem);
@@ -389,7 +451,7 @@ run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed)
 	for (const page_test_t *test = &tests[0]; test->name; test++) {
 		bool pass = (run_test(test, false) == want);
 		if (pass) {
-			printf("%s: PASS\n", test->name);
+			printf("%c %s: PASS\n", want ? '+' : '-', test->name);
 			(*npassed)++;
 		} else {
 			printf("%s: FAIL [expected %s, got %s]\n", test->name,

From 86b5853cfbf66af1afbef63a2dfbea488e8c9a0b Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Fri, 25 Oct 2024 17:28:20 +1100
Subject: [PATCH 30/47] vdev_disk: move abd return and free off the interrupt
 handler

Freeing an ABD can take sleeping locks to update various stats. We
aren't allowed to sleep on an interrupt handler. So, move the free off
to the io_done callback.

We should never have been freeing things in the interrupt handler, but
we got away with it because we were usually freeing a linear ABD, which
at most is returning two objects to a cache and never sleeping. Scatter
ABDs can be used now, and those have more complex locking.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Closes #16687
---
 module/os/linux/zfs/vdev_disk.c | 40 ++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index 7a4944410906..6a66a72b91a9 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -801,21 +801,13 @@ vbio_completion(struct bio *bio)
 	bio_put(bio);
 
 	/*
-	 * If we copied the ABD before issuing it, clean up and return the copy
-	 * to the ADB, with changes if appropriate.
+	 * We're likely in an interrupt context so we can't do ABD/memory work
+	 * here; instead we stash vbio on the zio and take care of it in the
+	 * done callback.
 	 */
-	if (vbio->vbio_abd != NULL) {
-		if (zio->io_type == ZIO_TYPE_READ)
-			abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size);
+	ASSERT3P(zio->io_bio, ==, NULL);
+	zio->io_bio = vbio;
 
-		abd_free(vbio->vbio_abd);
-		vbio->vbio_abd = NULL;
-	}
-
-	/* Final cleanup */
-	kmem_free(vbio, sizeof (vbio_t));
-
-	/* All done, submit for processing */
 	zio_delay_interrupt(zio);
 }
 
@@ -1451,6 +1443,28 @@ vdev_disk_io_start(zio_t *zio)
 static void
 vdev_disk_io_done(zio_t *zio)
 {
+	/* If this was a read or write, we need to clean up the vbio */
+	if (zio->io_bio != NULL) {
+		vbio_t *vbio = zio->io_bio;
+		zio->io_bio = NULL;
+
+		/*
+		 * If we copied the ABD before issuing it, clean up and return
+		 * the copy to the ADB, with changes if appropriate.
+		 */
+		if (vbio->vbio_abd != NULL) {
+			if (zio->io_type == ZIO_TYPE_READ)
+				abd_copy(zio->io_abd, vbio->vbio_abd,
+				    zio->io_size);
+
+			abd_free(vbio->vbio_abd);
+			vbio->vbio_abd = NULL;
+		}
+
+		/* Final cleanup */
+		kmem_free(vbio, sizeof (vbio_t));
+	}
+
 	/*
 	 * If the device returned EIO, we revalidate the media.  If it is
 	 * determined the media has changed this triggers the asynchronous

From 903d3f9187c2f6e3c0b3db620f475111a067267b Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Thu, 31 Oct 2024 22:32:31 -0400
Subject: [PATCH 31/47] Added output to `zpool online` and `offline`

I was surprised to discover today that `zpool online` and
`zpool offline` don't print any information about why they failed in
many cases, they just return 1 with no information about why.

Let's improve that where we can without changing the library function.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Allan Jude <allan@klarasystems.com>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #16244
---
 cmd/zpool/zpool_main.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index ea180f6b705e..6a45a063d91a 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7966,8 +7966,11 @@ zpool_do_online(int argc, char **argv)
 
 	poolname = argv[0];
 
-	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		(void) fprintf(stderr, gettext("failed to open pool "
+		    "\"%s\""), poolname);
 		return (1);
+	}
 
 	for (i = 1; i < argc; i++) {
 		vdev_state_t oldstate;
@@ -7988,12 +7991,15 @@ zpool_do_online(int argc, char **argv)
 		    &l2cache, NULL);
 		if (tgt == NULL) {
 			ret = 1;
+			(void) fprintf(stderr, gettext("couldn't find device "
+			"\"%s\" in pool \"%s\"\n"), argv[i], poolname);
 			continue;
 		}
 		uint_t vsc;
 		oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt,
 		    ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state;
-		if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
+		if ((rc = zpool_vdev_online(zhp, argv[i], flags,
+		    &newstate)) == 0) {
 			if (newstate != VDEV_STATE_HEALTHY) {
 				(void) printf(gettext("warning: device '%s' "
 				    "onlined, but remains in faulted state\n"),
@@ -8019,6 +8025,9 @@ zpool_do_online(int argc, char **argv)
 				}
 			}
 		} else {
+			(void) fprintf(stderr, gettext("Failed to online "
+			    "\"%s\" in pool \"%s\": %d\n"),
+			    argv[i], poolname, rc);
 			ret = 1;
 		}
 	}
@@ -8103,8 +8112,11 @@ zpool_do_offline(int argc, char **argv)
 
 	poolname = argv[0];
 
-	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		(void) fprintf(stderr, gettext("failed to open pool "
+		    "\"%s\""), poolname);
 		return (1);
+	}
 
 	for (i = 1; i < argc; i++) {
 		uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]);

From 7546fbd6e99144f33cb0adf68f2876f61651c8d0 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 2 Nov 2024 03:08:33 +1100
Subject: [PATCH 32/47] zdb: add extra -T flag to show histograms of BRT
 refcounts

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16692
---
 cmd/zdb/zdb.c  | 29 ++++++++++++++++++++++++-----
 man/man8/zdb.8 |  4 +++-
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 42b82c08249f..46587671202a 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2152,14 +2152,21 @@ dump_brt(spa_t *spa)
 	if (dump_opt['T'] < 3)
 		return;
 
+	/* -TTT shows a per-vdev histograms; -TTTT shows all entries */
+	boolean_t do_histo = dump_opt['T'] == 3;
+
 	char dva[64];
-	printf("\n%-16s %-10s\n", "DVA", "REFCNT");
+
+	if (!do_histo)
+		printf("\n%-16s %-10s\n", "DVA", "REFCNT");
 
 	for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
 		brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
 		if (brtvd == NULL || !brtvd->bv_initiated)
 			continue;
 
+		uint64_t counts[64] = {};
+
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
@@ -2172,14 +2179,26 @@ dump_brt(spa_t *spa)
 			    za->za_integer_length, za->za_num_integers,
 			    &refcnt));
 
-			uint64_t offset = *(const uint64_t *)za->za_name;
+			if (do_histo)
+				counts[highbit64(refcnt)]++;
+			else {
+				uint64_t offset =
+				    *(const uint64_t *)za->za_name;
 
-			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
-			    (u_longlong_t)offset);
-			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
+				snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx",
+				    vdevid, (u_longlong_t)offset);
+				printf("%-16s %-10llu\n", dva,
+				    (u_longlong_t)refcnt);
+			}
 		}
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
+
+		if (do_histo) {
+			printf("\nBRT: vdev %" PRIu64
+			    ": DVAs with 2^n refcnts:\n", vdevid);
+			dump_histogram(counts, 64, 0);
+		}
 	}
 }
 
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index 08f5a3f70040..ae35454ad083 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd November 18, 2023
+.Dd October 27, 2024
 .Dt ZDB 8
 .Os
 .
@@ -408,6 +408,8 @@ blocks cloned, the space saving as a result of cloning, and the saving ratio.
 .It Fl TT
 Display the per-vdev BRT statistics, including total references.
 .It Fl TTT
+Display histograms of per-vdev BRT refcounts.
+.It Fl TTTT
 Dump the contents of the block reference tables.
 .It Fl u , -uberblock
 Display the current uberblock.

From d367ef2995f3e04d71b8ae648867b323293c7cdf Mon Sep 17 00:00:00 2001
From: Tony Hutter <hutter2@llnl.gov>
Date: Fri, 1 Nov 2024 10:02:03 -0700
Subject: [PATCH 33/47] ZTS: Add Fedora 41, remove Fedora 39

Fedora 41 was released 10/29/24, and Fedora 39 will be EOL on 11/12/24.
Update Fedora runners in the test suite.  Some minor tweaks also needed
to support ksh 1.0.10.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #16700
---
 .github/workflows/scripts/qemu-2-start.sh            | 12 ++++++------
 .github/workflows/scripts/qemu-3-deps.sh             |  8 +++++++-
 .github/workflows/scripts/qemu-4-build.sh            |  2 +-
 .github/workflows/zfs-qemu.yml                       |  6 +++---
 .../cli_root/zpool_add/zpool_add_dryrun_output.ksh   |  6 +++---
 .../zpool_create/zpool_create_dryrun_output.ksh      |  4 ++--
 .../zpool_split/zpool_split_dryrun_output.ksh        |  6 +++---
 7 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh
index 84e13832d10f..39ac92107b71 100755
--- a/.github/workflows/scripts/qemu-2-start.sh
+++ b/.github/workflows/scripts/qemu-2-start.sh
@@ -52,16 +52,16 @@ case "$OS" in
     OSNAME="Debian 12"
     URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2"
     ;;
-  fedora39)
-    OSNAME="Fedora 39"
-    OSv="fedora39"
-    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/39/Cloud/x86_64/images/Fedora-Cloud-Base-39-1.5.x86_64.qcow2"
-    ;;
   fedora40)
     OSNAME="Fedora 40"
-    OSv="fedora39"
+    OSv="fedora-unknown"
     URL="https://download.fedoraproject.org/pub/fedora/linux/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2"
     ;;
+  fedora41)
+    OSNAME="Fedora 41"
+    OSv="fedora-unknown"
+    URL="https://download.fedoraproject.org/pub/fedora/linux/releases/41/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-41-1.4.x86_64.qcow2"
+    ;;
   freebsd13-3r)
     OSNAME="FreeBSD 13.3-RELEASE"
     OSv="freebsd13.0"
diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh
index bf06c1892ceb..96979cd02e09 100755
--- a/.github/workflows/scripts/qemu-3-deps.sh
+++ b/.github/workflows/scripts/qemu-3-deps.sh
@@ -66,7 +66,13 @@ function rhel() {
   echo "##[endgroup]"
 
   echo "##[group]Install Development Tools"
-  sudo dnf group install -y "Development Tools"
+
+  # Alma wants "Development Tools", Fedora 41 wants "development-tools"
+  if ! sudo dnf group install -y "Development Tools" ; then
+    echo "Trying 'development-tools' instead of 'Development Tools'"
+    sudo dnf group install -y development-tools
+  fi
+
   sudo dnf install -y \
     acl attr bc bzip2 cryptsetup curl dbench dkms elfutils-libelf-devel fio \
     gdb git jq kernel-rpm-macros ksh libacl-devel libaio-devel \
diff --git a/.github/workflows/scripts/qemu-4-build.sh b/.github/workflows/scripts/qemu-4-build.sh
index 1051ee1f4deb..955f605f5bce 100755
--- a/.github/workflows/scripts/qemu-4-build.sh
+++ b/.github/workflows/scripts/qemu-4-build.sh
@@ -83,7 +83,7 @@ function rpm_build_and_install() {
   echo "##[endgroup]"
 
   echo "##[group]Install"
-  run sudo dnf -y --skip-broken localinstall $(ls *.rpm | grep -v src.rpm)
+  run sudo dnf -y --nobest install $(ls *.rpm | grep -v src.rpm)
   echo "##[endgroup]"
 
 }
diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml
index f819e9938e31..e90030f4c02e 100644
--- a/.github/workflows/zfs-qemu.yml
+++ b/.github/workflows/zfs-qemu.yml
@@ -22,8 +22,8 @@ jobs:
       - name: Generate OS config and CI type
         id: os
         run: |
-          FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora39", "fedora40", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]'
-          QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora40", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]'
+          FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]'
+          QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]'
           # determine CI type when running on PR
           ci_type="full"
           if ${{ github.event_name == 'pull_request' }}; then
@@ -46,7 +46,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # rhl:     almalinux8, almalinux9, centos-stream9, fedora39, fedora40
+        # rhl:     almalinux8, almalinux9, centos-stream9, fedora40, fedora41
         # debian:  debian11, debian12, ubuntu20, ubuntu22, ubuntu24
         # misc:    archlinux, tumbleweed
         # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh
index fa1ce9c64d33..3b06e8c35c77 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh
@@ -148,9 +148,9 @@ done
 
 # Foreach test create pool, add -n devices and check output.
 for (( i=0; i < ${#tests[@]}; i+=1 )); do
-	typeset tree="${tests[$i].tree}"
-	typeset add="${tests[$i].add}"
-	typeset want="${tests[$i].want}"
+	tree="${tests[$i].tree}"
+	add="${tests[$i].add}"
+	want="${tests[$i].want}"
 
 	log_must eval zpool create "$TESTPOOL" $tree
 	log_must poolexists "$TESTPOOL"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh
index 485891c945a3..0671ea618e05 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh
@@ -124,8 +124,8 @@ done
 
 # Foreach test create pool, add -n devices and check output.
 for (( i=0; i < ${#tests[@]}; i+=1 )); do
-	typeset tree="${tests[$i].tree}"
-	typeset want="${tests[$i].want}"
+	tree="${tests[$i].tree}"
+	want="${tests[$i].want}"
 
 	typeset out="$(log_must eval "zpool create -n '$TESTPOOL' $tree" | \
 	    sed /^SUCCESS/d)"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh
index 410b1fe7a03e..def1d154387d 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh
@@ -133,9 +133,9 @@ done
 
 # Foreach test create pool, add -n devices and check output.
 for (( i=0; i < ${#tests[@]}; i+=1 )); do
-	typeset tree="${tests[$i].tree}"
-	typeset devs="${tests[$i].devs}"
-	typeset want="${tests[$i].want}"
+	tree="${tests[$i].tree}"
+	devs="${tests[$i].devs}"
+	want="${tests[$i].want}"
 
 	log_must eval zpool create "$TESTPOOL" $tree
 	log_must poolexists "$TESTPOOL"

From 880b73956be6b88a0d8bcd0bb75c066921501f3c Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Sat, 2 Nov 2024 08:43:25 +1100
Subject: [PATCH 34/47] zfs(4): remove "experimental" from zfs_bclone_enabled

I think we've done enough experiments.

Sponsored-by: https://despairlabs.com/sponsor/
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Signed-off-by: Rob Norris <robn@despairlabs.com>
Closes #16189
Closes #16712
---
 man/man4/zfs.4         | 7 ++++---
 module/zfs/zfs_vnops.c | 6 +++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/man/man4/zfs.4 b/man/man4/zfs.4
index c9f6ed0dece3..da027798f962 100644
--- a/man/man4/zfs.4
+++ b/man/man4/zfs.4
@@ -18,7 +18,7 @@
 .\"
 .\" Copyright (c) 2024, Klara, Inc.
 .\"
-.Dd October 2, 2024
+.Dd November 1, 2024
 .Dt ZFS 4
 .Os
 .
@@ -1333,9 +1333,10 @@ results in vector instructions
 from the respective CPU instruction set being used.
 .
 .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
-Enable the experimental block cloning feature.
+Enables access to the block cloning feature.
 If this setting is 0, then even if feature@block_cloning is enabled,
-attempts to clone blocks will act as though the feature is disabled.
+using functions and system calls that attempt to clone blocks will act as
+though the feature is disabled.
 .
 .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
 When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index d5e0d2a2b35a..6c15a5c472ea 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -58,9 +58,9 @@
 #include <sys/zfs_znode.h>
 
 /*
- * Enable the experimental block cloning feature.  If this setting is 0, then
- * even if feature@block_cloning is enabled, attempts to clone blocks will act
- * as though the feature is disabled.
+ * Enables access to the block cloning feature. If this setting is 0, then even
+ * if feature@block_cloning is enabled, using functions and system calls that
+ * attempt to clone blocks will act as though the feature is disabled.
  */
 int zfs_bclone_enabled = 1;
 

From 55cbd1f9bd22a67d25aab5c7ee2af4c9eee6e697 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Mon, 4 Nov 2024 19:42:06 -0500
Subject: [PATCH 35/47] Reduce dirty records memory usage

Small block workloads may use a very large number of dirty records.
During simple block cloning test due to BRT still using 4KB blocks
I can easily see up to 2.5M of those used.  Before this change
dbuf_dirty_record_t structures representing them were allocated via
kmem_zalloc(), that rounded their size up to 512 bytes.

Introduction of specialized kmem cache allows to reduce the size
from 512 to 408 bytes.  Additionally, since override and raw params
in dirty records are mutually exclusive, puting them into a union
allows to reduce structure size down to 368 bytes, increasing the
saving to 28%, that can be a 0.5GB or more of RAM.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16694
---
 include/sys/dbuf.h      | 26 +++++++++++++++++---------
 module/zfs/dbuf.c       | 21 ++++++++++++++++-----
 module/zfs/dmu.c        |  9 ++++-----
 module/zfs/dmu_direct.c |  1 +
 module/zfs/dnode_sync.c |  2 +-
 5 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index c9bfb9a8026c..e69464809a42 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -171,7 +171,6 @@ typedef struct dbuf_dirty_record {
 			 * gets COW'd in a subsequent transaction group.
 			 */
 			arc_buf_t *dr_data;
-			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
 			boolean_t dr_nopwrite;
@@ -179,14 +178,21 @@ typedef struct dbuf_dirty_record {
 			boolean_t dr_diowrite;
 			boolean_t dr_has_raw_params;
 
-			/*
-			 * If dr_has_raw_params is set, the following crypt
-			 * params will be set on the BP that's written.
-			 */
-			boolean_t dr_byteorder;
-			uint8_t	dr_salt[ZIO_DATA_SALT_LEN];
-			uint8_t	dr_iv[ZIO_DATA_IV_LEN];
-			uint8_t	dr_mac[ZIO_DATA_MAC_LEN];
+			/* Override and raw params are mutually exclusive. */
+			union {
+				blkptr_t dr_overridden_by;
+				struct {
+					/*
+					 * If dr_has_raw_params is set, the
+					 * following crypt params will be set
+					 * on the BP that's written.
+					 */
+					boolean_t dr_byteorder;
+					uint8_t	dr_salt[ZIO_DATA_SALT_LEN];
+					uint8_t	dr_iv[ZIO_DATA_IV_LEN];
+					uint8_t	dr_mac[ZIO_DATA_MAC_LEN];
+				};
+			};
 		} dl;
 		struct dirty_lightweight_leaf {
 			/*
@@ -346,6 +352,8 @@ typedef struct dbuf_hash_table {
 
 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
 
+extern kmem_cache_t *dbuf_dirty_kmem_cache;
+
 uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level,
     const uint64_t offset);
 
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index df9368fc8bdb..b1419d96f4ef 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -182,6 +182,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_kmem_cache;
+kmem_cache_t *dbuf_dirty_kmem_cache;
 static taskq_t *dbu_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
@@ -966,6 +967,8 @@ dbuf_init(void)
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+	dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t",
+	    sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	for (int i = 0; i < hmsize; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL);
@@ -1041,6 +1044,7 @@ dbuf_fini(void)
 	    sizeof (kmutex_t));
 
 	kmem_cache_destroy(dbuf_kmem_cache);
+	kmem_cache_destroy(dbuf_dirty_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
 
 	mutex_enter(&dbuf_evict_lock);
@@ -2343,7 +2347,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
-	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+	dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP);
+	memset(dr, 0, sizeof (*dr));
 	list_link_init(&dr->dr_dirty_node);
 	list_link_init(&dr->dr_dbuf_node);
 	dr->dr_dnode = dn;
@@ -2526,7 +2531,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
-	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 	ASSERT3U(db->db_dirtycnt, >, 0);
 	db->db_dirtycnt -= 1;
 }
@@ -2616,7 +2621,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		}
 	}
 
-	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
@@ -2941,7 +2946,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
 	 * (see dbuf_sync_dnode_leaf_crypt()).
 	 */
 	ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
-	ASSERT3U(db->db_level, ==, 0);
+	ASSERT0(db->db_level);
 	ASSERT(db->db_objset->os_raw_receive);
 
 	dmu_buf_will_dirty_impl(db_fake,
@@ -2950,6 +2955,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 
 	ASSERT3P(dr, !=, NULL);
+	ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
 
 	dr->dt.dl.dr_has_raw_params = B_TRUE;
 	dr->dt.dl.dr_byteorder = byteorder;
@@ -2964,10 +2970,14 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 
+	ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
+	ASSERT0(db->db_level);
+
 	dr = list_head(&db->db_dirty_records);
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	dl = &dr->dt.dl;
+	ASSERT0(dl->dr_has_raw_params);
 	dl->dr_overridden_by = *bp;
 	dl->dr_override_state = DR_OVERRIDDEN;
 	BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
@@ -3040,6 +3050,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
 	ASSERT3P(dr, !=, NULL);
 	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 	dl = &dr->dt.dl;
+	ASSERT0(dl->dr_has_raw_params);
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
@@ -5083,7 +5094,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
 	    zio->io_txg);
 
-	kmem_free(dr, sizeof (dbuf_dirty_record_t));
+	kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 }
 
 static void
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3f87cfe6bee9..362415a25895 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1895,6 +1895,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
+		ASSERT0(dr->dt.dl.dr_has_raw_params);
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
@@ -2190,6 +2191,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 		return (SET_ERROR(EALREADY));
 	}
 
+	ASSERT0(dr->dt.dl.dr_has_raw_params);
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
@@ -2657,6 +2659,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
+		ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
@@ -2672,11 +2675,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
-		ASSERT0(db->db_level);
-		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
-		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
-		ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
-
 		dmu_buf_will_clone_or_dio(dbuf, tx);
 
 		mutex_enter(&db->db_mtx);
@@ -2685,6 +2683,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 		VERIFY(dr != NULL);
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
+		ASSERT0(dl->dr_has_raw_params);
 		dl->dr_overridden_by = *bp;
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
index ed96e7515bc7..40b78b519f49 100644
--- a/module/zfs/dmu_direct.c
+++ b/module/zfs/dmu_direct.c
@@ -180,6 +180,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
 	if (list_next(&db->db_dirty_records, dr_head) != NULL)
 		zp.zp_nopwrite = B_FALSE;
 
+	ASSERT0(dr_head->dt.dl.dr_has_raw_params);
 	ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
 	dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c
index f67dad002319..122d7d0d17d8 100644
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -566,7 +566,7 @@ dnode_undirty_dbufs(list_t *list)
 			mutex_destroy(&dr->dt.di.dr_mtx);
 			list_destroy(&dr->dt.di.dr_children);
 		}
-		kmem_free(dr, sizeof (dbuf_dirty_record_t));
+		kmem_cache_free(dbuf_dirty_kmem_cache, dr);
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
 	}
 }

From b96845b63204336a0e05a1b1c27cb02d626d4f5d Mon Sep 17 00:00:00 2001
From: Uglymotha <uglymotha@wizdom.nu>
Date: Tue, 5 Nov 2024 01:44:38 +0100
Subject: [PATCH 36/47] Verify parent_dev before calling
 udev_device_get_sysattr_value

Not all udev devices have parent devices.
Calling udev_device_get_ functions yield an assertion error
if called with a NULL pointer.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Sietse <sietse@wizdom.nu>
Co-authored-by: Sietse <sietse@wizdom.nu>
Closes #16705
Closes #16717
---
 cmd/zed/zed_disk_event.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c
index 32a8789d3001..4a2b7398e922 100644
--- a/cmd/zed/zed_disk_event.c
+++ b/cmd/zed/zed_disk_event.c
@@ -139,7 +139,8 @@ dev_event_nvlist(struct udev_device *dev)
 		 * is /dev/sda.
 		 */
 		struct udev_device *parent_dev = udev_device_get_parent(dev);
-		if ((value = udev_device_get_sysattr_value(parent_dev, "size"))
+		if (parent_dev != NULL &&
+		    (value = udev_device_get_sysattr_value(parent_dev, "size"))
 		    != NULL) {
 			uint64_t numval = DEV_BSIZE;
 

From ae48c2f6a97bd95b84099439bbb1e668fd7c2548 Mon Sep 17 00:00:00 2001
From: tstabrawa <59430211+tstabrawa@users.noreply.github.com>
Date: Sun, 3 Nov 2024 15:30:23 -0600
Subject: [PATCH 37/47] Revert "Avoid BUG in migrate_folio_extra"

This reverts commit b052035990594408899fa32fd4ad6603b75b6c6d.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: tstabrawa <59430211+tstabrawa@users.noreply.github.com>
Closes #16568
Closes #16723
---
 config/kernel-vfs-invalidate_folio.m4 | 33 -------------------
 config/kernel-vfs-release_folio.m4    | 32 -------------------
 config/kernel.m4                      |  4 ---
 module/os/linux/zfs/zfs_vnops_os.c    | 17 ----------
 module/os/linux/zfs/zfs_znode_os.c    |  8 -----
 module/os/linux/zfs/zpl_file.c        | 46 ---------------------------
 6 files changed, 140 deletions(-)
 delete mode 100644 config/kernel-vfs-invalidate_folio.m4
 delete mode 100644 config/kernel-vfs-release_folio.m4

diff --git a/config/kernel-vfs-invalidate_folio.m4 b/config/kernel-vfs-invalidate_folio.m4
deleted file mode 100644
index 61a5c8478af1..000000000000
--- a/config/kernel-vfs-invalidate_folio.m4
+++ /dev/null
@@ -1,33 +0,0 @@
-dnl #
-dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO], [
-	ZFS_LINUX_TEST_SRC([vfs_has_invalidate_folio], [
-		#include <linux/fs.h>
-
-		static void
-		test_invalidate_folio(struct folio *folio, size_t offset,
-		                      size_t len) {
-			(void) folio; (void) offset; (void) len;
-			return;
-		}
-
-		static const struct address_space_operations
-		    aops __attribute__ ((unused)) = {
-			.invalidate_folio	= test_invalidate_folio,
-		};
-	],[])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO], [
-	dnl #
-	dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page
-	dnl #
-	AC_MSG_CHECKING([whether invalidate_folio exists])
-	ZFS_LINUX_TEST_RESULT([vfs_has_invalidate_folio], [
-		AC_MSG_RESULT([yes])
-		AC_DEFINE(HAVE_VFS_INVALIDATE_FOLIO, 1, [invalidate_folio exists])
-	],[
-		AC_MSG_RESULT([no])
-	])
-])
diff --git a/config/kernel-vfs-release_folio.m4 b/config/kernel-vfs-release_folio.m4
deleted file mode 100644
index f31db5677fd3..000000000000
--- a/config/kernel-vfs-release_folio.m4
+++ /dev/null
@@ -1,32 +0,0 @@
-dnl #
-dnl # Linux 5.19 uses release_folio in lieu of releasepage
-dnl #
-AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO], [
-	ZFS_LINUX_TEST_SRC([vfs_has_release_folio], [
-		#include <linux/fs.h>
-
-		static bool
-		test_release_folio(struct folio *folio, gfp_t gfp) {
-			(void) folio; (void) gfp;
-			return (0);
-		}
-
-		static const struct address_space_operations
-		    aops __attribute__ ((unused)) = {
-			.release_folio	= test_release_folio,
-		};
-	],[])
-])
-
-AC_DEFUN([ZFS_AC_KERNEL_VFS_RELEASE_FOLIO], [
-	dnl #
-	dnl # Linux 5.19 uses release_folio in lieu of releasepage
-	dnl #
-	AC_MSG_CHECKING([whether release_folio exists])
-	ZFS_LINUX_TEST_RESULT([vfs_has_release_folio], [
-		AC_MSG_RESULT([yes])
-		AC_DEFINE(HAVE_VFS_RELEASE_FOLIO, 1, [release_folio exists])
-	],[
-		AC_MSG_RESULT([no])
-	])
-])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 556df58082f9..761f9310753a 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -77,8 +77,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_SGET
 	ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO
-	ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO
-	ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_SRC_VFS_READPAGES
@@ -189,8 +187,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_SGET
 	ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_VFS_READ_FOLIO
-	ZFS_AC_KERNEL_VFS_RELEASE_FOLIO
-	ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO
 	ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_VFS_READPAGES
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index 469197220859..dd9fd760b9c2 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -260,15 +260,6 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 			} else {
 				ClearPageError(pp);
 				SetPageUptodate(pp);
-				if (!PagePrivate(pp)) {
-					/*
-					 * Set private bit so page migration
-					 * will wait for us to finish writeback
-					 * before calling migrate_folio().
-					 */
-					SetPagePrivate(pp);
-					get_page(pp);
-				}
 
 				if (mapping_writably_mapped(mp))
 					flush_dcache_page(pp);
@@ -4090,14 +4081,6 @@ zfs_fillpage(struct inode *ip, struct page *pp)
 	} else {
 		ClearPageError(pp);
 		SetPageUptodate(pp);
-		if (!PagePrivate(pp)) {
-			/*
-			 * Set private bit so page migration will wait for us to
-			 * finish writeback before calling migrate_folio().
-			 */
-			SetPagePrivate(pp);
-			get_page(pp);
-		}
 	}
 
 	return (error);
diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c
index bc1e17f086d9..bbaca2f58394 100644
--- a/module/os/linux/zfs/zfs_znode_os.c
+++ b/module/os/linux/zfs/zfs_znode_os.c
@@ -1577,14 +1577,6 @@ zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
 		mark_page_accessed(pp);
 		SetPageUptodate(pp);
 		ClearPageError(pp);
-		if (!PagePrivate(pp)) {
-			/*
-			 * Set private bit so page migration will wait for us to
-			 * finish writeback before calling migrate_folio().
-			 */
-			SetPagePrivate(pp);
-			get_page(pp);
-		}
 		unlock_page(pp);
 		put_page(pp);
 	}
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 50c63695dcc8..4d1bf1d5477f 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -607,42 +607,6 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc)
 	return (zpl_putpage(pp, wbc, &for_sync));
 }
 
-static int
-zpl_releasepage(struct page *pp, gfp_t gfp)
-{
-	if (PagePrivate(pp)) {
-		ClearPagePrivate(pp);
-		put_page(pp);
-	}
-	return (1);
-}
-
-#ifdef HAVE_VFS_RELEASE_FOLIO
-static bool
-zpl_release_folio(struct folio *folio, gfp_t gfp)
-{
-	return (zpl_releasepage(&folio->page, gfp));
-}
-#endif
-
-#ifdef HAVE_VFS_INVALIDATE_FOLIO
-static void
-zpl_invalidate_folio(struct folio *folio, size_t offset, size_t len)
-{
-	if ((offset == 0) && (len == PAGE_SIZE)) {
-		zpl_releasepage(&folio->page, 0);
-	}
-}
-#else
-static void
-zpl_invalidatepage(struct page *pp, unsigned int offset, unsigned int len)
-{
-	if ((offset == 0) && (len == PAGE_SIZE)) {
-		zpl_releasepage(pp, 0);
-	}
-}
-#endif
-
 /*
  * The flag combination which matches the behavior of zfs_space() is
  * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE.  The FALLOC_FL_PUNCH_HOLE
@@ -1126,16 +1090,6 @@ const struct address_space_operations zpl_address_space_operations = {
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 	.dirty_folio	= filemap_dirty_folio,
 #endif
-#ifdef HAVE_VFS_RELEASE_FOLIO
-	.release_folio	= zpl_release_folio,
-#else
-	.releasepage	= zpl_releasepage,
-#endif
-#ifdef HAVE_VFS_INVALIDATE_FOLIO
-	.invalidate_folio = zpl_invalidate_folio,
-#else
-	.invalidatepage = zpl_invalidatepage,
-#endif
 };
 
 const struct file_operations zpl_file_operations = {

From 661bb434e6a9beb47aac2fbc5ce94ba490e683dc Mon Sep 17 00:00:00 2001
From: tstabrawa <59430211+tstabrawa@users.noreply.github.com>
Date: Sun, 3 Nov 2024 16:00:20 -0600
Subject: [PATCH 38/47] Use simple folio migration function

Avoids using fallback_migrate_folio, which starts unnecessary writeback
(leading to BUG in migrate_folio_extra).

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: Brian Atkinson <batkinson@lanl.gov>
Signed-off-by: tstabrawa <59430211+tstabrawa@users.noreply.github.com>
Closes #16568
Closes #16723
---
 config/kernel-vfs-migrate_folio.m4 | 27 +++++++++++++++++++++++++++
 config/kernel.m4                   |  2 ++
 module/os/linux/zfs/zpl_file.c     |  6 ++++++
 3 files changed, 35 insertions(+)
 create mode 100644 config/kernel-vfs-migrate_folio.m4

diff --git a/config/kernel-vfs-migrate_folio.m4 b/config/kernel-vfs-migrate_folio.m4
new file mode 100644
index 000000000000..186cd0581a17
--- /dev/null
+++ b/config/kernel-vfs-migrate_folio.m4
@@ -0,0 +1,27 @@
+dnl #
+dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO], [
+	ZFS_LINUX_TEST_SRC([vfs_has_migrate_folio], [
+		#include <linux/fs.h>
+		#include <linux/migrate.h>
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.migrate_folio	= migrate_folio,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO], [
+	dnl #
+	dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page
+	dnl #
+	AC_MSG_CHECKING([whether migrate_folio exists])
+	ZFS_LINUX_TEST_RESULT([vfs_has_migrate_folio], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_MIGRATE_FOLIO, 1, [migrate_folio exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
index 761f9310753a..78f178ff27ac 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -77,6 +77,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_SGET
 	ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO
+	ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_SRC_VFS_READPAGES
@@ -187,6 +188,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_SGET
 	ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_VFS_READ_FOLIO
+	ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO
 	ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_VFS_READPAGES
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 4d1bf1d5477f..f6e014327717 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -28,6 +28,7 @@
 #include <linux/compat.h>
 #endif
 #include <linux/fs.h>
+#include <linux/migrate.h>
 #include <sys/file.h>
 #include <sys/dmu_objset.h>
 #include <sys/zfs_znode.h>
@@ -1090,6 +1091,11 @@ const struct address_space_operations zpl_address_space_operations = {
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 	.dirty_folio	= filemap_dirty_folio,
 #endif
+#ifdef HAVE_VFS_MIGRATE_FOLIO
+	.migrate_folio	= migrate_folio,
+#else
+	.migratepage	= migrate_page,
+#endif
 };
 
 const struct file_operations zpl_file_operations = {

From c82eb27b22c63f197aacb77528b4efa179f476a4 Mon Sep 17 00:00:00 2001
From: Chunwei Chen <tuxoko@gmail.com>
Date: Wed, 6 Nov 2024 11:52:01 -0800
Subject: [PATCH 39/47] ZFS send should use spill block prefetched from
 send_reader_thread

Currently, even though send_reader_thread prefetches spill block,
do_dump() will not use it and issues its own blocking arc_read. This
causes significant performance degradation when sending datasets with
lots of spill blocks.

For unmodified spill blocks, we also create send_range struct for them
in send_reader_thread and issue prefetches for them. We piggyback them
on the dnode send_range instead of enqueueing them so we don't break
send_range_after check.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Chunwei Chen <david.chen@nutanix.com>
Co-authored-by: david.chen <david.chen@nutanix.com>
Closes #16701
---
 module/zfs/dmu_send.c | 126 +++++++++++++++++++++---------------------
 1 file changed, 64 insertions(+), 62 deletions(-)

diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index a174972e9b57..aeb8ff3b6688 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -180,6 +180,8 @@ struct send_range {
 			 */
 			dnode_phys_t		*dnp;
 			blkptr_t		bp;
+			/* Piggyback unmodified spill block */
+			struct send_range	*spill_range;
 		} object;
 		struct srr {
 			uint32_t		datablksz;
@@ -231,6 +233,8 @@ range_free(struct send_range *range)
 		size_t size = sizeof (dnode_phys_t) *
 		    (range->sru.object.dnp->dn_extra_slots + 1);
 		kmem_free(range->sru.object.dnp, size);
+		if (range->sru.object.spill_range)
+			range_free(range->sru.object.spill_range);
 	} else if (range->type == DATA) {
 		mutex_enter(&range->sru.data.lock);
 		while (range->sru.data.io_outstanding)
@@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 	drrs->drr_length = blksz;
 	drrs->drr_toguid = dscp->dsc_toguid;
 
-	/* See comment in dump_dnode() for full details */
+	/* See comment in piggyback_unmodified_spill() for full details */
 	if (zfs_send_unmodified_spill_blocks &&
 	    (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
@@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
 		return (SET_ERROR(EINTR));
 
-	/*
-	 * Send DRR_SPILL records for unmodified spill blocks.	This is useful
-	 * because changing certain attributes of the object (e.g. blocksize)
-	 * can cause old versions of ZFS to incorrectly remove a spill block.
-	 * Including these records in the stream forces an up to date version
-	 * to always be written ensuring they're never lost.  Current versions
-	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
-	 * ignore these unmodified spill blocks.
-	 */
-	if (zfs_send_unmodified_spill_blocks &&
-	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
-	    (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
-		struct send_range record;
-		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
-
-		memset(&record, 0, sizeof (struct send_range));
-		record.type = DATA;
-		record.object = object;
-		record.eos_marker = B_FALSE;
-		record.start_blkid = DMU_SPILL_BLKID;
-		record.end_blkid = record.start_blkid + 1;
-		record.sru.data.bp = *bp;
-		record.sru.data.obj_type = dnp->dn_type;
-		record.sru.data.datablksz = BP_GET_LSIZE(bp);
-
-		if (do_dump(dscp, &record) != 0)
-			return (SET_ERROR(EINTR));
-	}
-
 	if (dscp->dsc_err != 0)
 		return (SET_ERROR(EINTR));
 
@@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 	case OBJECT:
 		err = dump_dnode(dscp, &range->sru.object.bp, range->object,
 		    range->sru.object.dnp);
+		/* Dump piggybacked unmodified spill block */
+		if (!err && range->sru.object.spill_range)
+			err = do_dump(dscp, range->sru.object.spill_range);
 		return (err);
 	case OBJECT_RANGE: {
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
@@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 
 		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
 		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
-		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
-			arc_flags_t aflags = ARC_FLAG_WAIT;
-			zio_flag_t zioflags = ZIO_FLAG_CANFAIL;
 
-			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
-				ASSERT(BP_IS_PROTECTED(bp));
-				zioflags |= ZIO_FLAG_RAW;
-			}
-
-			zbookmark_phys_t zb;
-			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
-			zb.zb_objset = dmu_objset_id(dscp->dsc_os);
-			zb.zb_object = range->object;
-			zb.zb_level = 0;
-			zb.zb_blkid = range->start_blkid;
-
-			arc_buf_t *abuf = NULL;
-			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
-			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
-			    zioflags, &aflags, &zb) != 0)
-				return (SET_ERROR(EIO));
-
-			err = dump_spill(dscp, bp, zb.zb_object,
-			    (abuf == NULL ? NULL : abuf->b_data));
-			if (abuf != NULL)
-				arc_buf_destroy(abuf, &abuf);
-			return (err);
-		}
 		if (send_do_embed(bp, dscp->dsc_featureflags)) {
 			err = dump_write_embedded(dscp, range->object,
 			    range->start_blkid * srdp->datablksz,
@@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 		}
 		ASSERT(range->object > dscp->dsc_resume_object ||
 		    (range->object == dscp->dsc_resume_object &&
+		    (range->start_blkid == DMU_SPILL_BLKID ||
 		    range->start_blkid * srdp->datablksz >=
-		    dscp->dsc_resume_offset));
+		    dscp->dsc_resume_offset)));
 		/* it's a level-0 block of a regular object */
 
 		mutex_enter(&srdp->lock);
@@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 		ASSERT(dscp->dsc_dso->dso_dryrun ||
 		    srdp->abuf != NULL || srdp->abd != NULL);
 
-		uint64_t offset = range->start_blkid * srdp->datablksz;
-
 		char *data = NULL;
 		if (srdp->abd != NULL) {
 			data = abd_to_buf(srdp->abd);
@@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
 			data = srdp->abuf->b_data;
 		}
 
+		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
+			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
+			err = dump_spill(dscp, bp, range->object, data);
+			return (err);
+		}
+
+		uint64_t offset = range->start_blkid * srdp->datablksz;
+
 		/*
 		 * If we have large blocks stored on disk but the send flags
 		 * don't allow us to send large blocks, we split the data from
@@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
 		range->sru.data.io_outstanding = 0;
 		range->sru.data.io_err = 0;
 		range->sru.data.io_compressed = B_FALSE;
+	} else if (type == OBJECT) {
+		range->sru.object.spill_range = NULL;
 	}
 	return (range);
 }
@@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
 	bqueue_enqueue(q, range, datablksz);
 }
 
+/*
+ * Send DRR_SPILL records for unmodified spill blocks.	This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up to date version
+ * to always be written ensuring they're never lost.  Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ *
+ * We piggyback the spill_range to dnode range instead of enqueueing it
+ * so send_range_after won't complain.
+ */
+static uint64_t
+piggyback_unmodified_spill(struct send_reader_thread_arg *srta,
+    struct send_range *range)
+{
+	ASSERT3U(range->type, ==, OBJECT);
+
+	dnode_phys_t *dnp = range->sru.object.dnp;
+	uint64_t fromtxg = srta->smta->to_arg->fromtxg;
+
+	if (!zfs_send_unmodified_spill_blocks ||
+	    !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ||
+	    !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg))
+		return (0);
+
+	blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
+	struct send_range *spill_range = range_alloc(DATA, range->object,
+	    DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE);
+	spill_range->sru.data.bp = *bp;
+	spill_range->sru.data.obj_type = dnp->dn_type;
+	spill_range->sru.data.datablksz = BP_GET_LSIZE(bp);
+
+	issue_data_read(srta, spill_range);
+	range->sru.object.spill_range = spill_range;
+
+	return (BP_GET_LSIZE(bp));
+}
+
 /*
  * This thread is responsible for two things: First, it retrieves the correct
  * blkptr in the to ds if we need to send the data because of something from
@@ -1773,17 +1772,20 @@ send_reader_thread(void *arg)
 	uint64_t last_obj_exists = B_TRUE;
 	while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
 	    err == 0) {
+		uint64_t spill = 0;
 		switch (range->type) {
 		case DATA:
 			issue_data_read(srta, range);
 			bqueue_enqueue(outq, range, range->sru.data.datablksz);
 			range = get_next_range_nofree(inq, range);
 			break;
-		case HOLE:
 		case OBJECT:
+			spill = piggyback_unmodified_spill(srta, range);
+			zfs_fallthrough;
+		case HOLE:
 		case OBJECT_RANGE:
 		case REDACT: // Redacted blocks must exist
-			bqueue_enqueue(outq, range, sizeof (*range));
+			bqueue_enqueue(outq, range, sizeof (*range) + spill);
 			range = get_next_range_nofree(inq, range);
 			break;
 		case PREVIOUSLY_REDACTED: {

From 8131793d6f08e359238a3d56428a96b81480bd0a Mon Sep 17 00:00:00 2001
From: Brian Atkinson <batkinson@lanl.gov>
Date: Thu, 7 Nov 2024 14:11:05 -0500
Subject: [PATCH 40/47] Update ABD stats for linear page Linux

a10e552 updated abd_free_linear_page() to no longer call
abd_update_scatter_stat(). This meant that linear pages that were not
attached to Direct I/O requests were not doing waste accounting for the
ARC. This led to performance issues due to incorrect ARC accounting that
resulted in 100% of CPU time being spent in arc_evict() during prolonged
I/O workloads with the ARC.

The call to abd_update_scatter_stats() is now conditionally called in
abd_free_linear_page() when the ABD is not from a Direct I/O request.

Reviewed-by: Mark Maybee <mmaybee@delphix.com>
Reviewed-by: Tony Nguyen <tony.nguyen@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Brian Atkinson <batkinson@lanl.gov>
Closes #16729
---
 module/os/linux/zfs/abd_os.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
index 303af48cf3af..04ab8bbca352 100644
--- a/module/os/linux/zfs/abd_os.c
+++ b/module/os/linux/zfs/abd_os.c
@@ -701,6 +701,8 @@ abd_free_linear_page(abd_t *abd)
 	/* When backed by user page unmap it */
 	if (abd_is_from_pages(abd))
 		zfs_kunmap(sg_page(sg));
+	else
+		abd_update_scatter_stats(abd, ABDSTAT_DECR);
 
 	abd->abd_flags &= ~ABD_FLAG_LINEAR;
 	abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;

From e12d76176d4e5454db62eb48b58ecd4970838a76 Mon Sep 17 00:00:00 2001
From: Sam James <sam@gentoo.org>
Date: Thu, 7 Nov 2024 19:20:37 +0000
Subject: [PATCH 41/47] Use <fcntl.h> instead of <sys/fcntl.h>

When building on musl, we get:

```
In file included from tests/zfs-tests/cmd/getversion.c:22:
/usr/include/sys/fcntl.h:1:2: error: #warning redirecting incorrect
 #include <sys/fcntl.h> to <fcntl.h> [-Werror=cpp]
 1 | #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h>

In file included from module/os/linux/zfs/vdev_file.c:36:
/usr/include/sys/fcntl.h:1:2: error: #warning redirecting incorrect
 #include <sys/fcntl.h> to <fcntl.h> [-Werror=cpp]
 1 | #warning redirecting incorrect #include <sys/fcntl.h> to <fcntl.h>
```

Bug: https://bugs.gentoo.org/925235
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Sam James <sam@gentoo.org>
Closes #15925
---
 module/os/linux/zfs/vdev_file.c  | 4 +++-
 tests/zfs-tests/cmd/getversion.c | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 4bffb6412ffd..2cab6532487a 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -33,11 +33,13 @@
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/abd.h>
-#include <sys/fcntl.h>
 #include <sys/vnode.h>
 #include <sys/zfs_file.h>
 #ifdef _KERNEL
 #include <linux/falloc.h>
+#include <sys/fcntl.h>
+#else
+#include <fcntl.h>
 #endif
 /*
  * Virtual device vector for files.
diff --git a/tests/zfs-tests/cmd/getversion.c b/tests/zfs-tests/cmd/getversion.c
index 62c1c5b6abc0..1e026b92d17d 100644
--- a/tests/zfs-tests/cmd/getversion.c
+++ b/tests/zfs-tests/cmd/getversion.c
@@ -19,9 +19,9 @@
  */
 
 #include <sys/ioctl.h>
-#include <sys/fcntl.h>
 #include <linux/fs.h>
 #include <err.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>

From 9061a4da0b2083bf5f7d88d1b19b91b3da62350c Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Fri, 8 Nov 2024 00:31:14 +0500
Subject: [PATCH 42/47] JSON: fix user properties output for zfs list

This commit fixes JSON output for zfs list when user properties are
requested with -o flag. This case needed to be handled specifically
since zfs_prop_to_name does not return property name for user
properties, instead it is stored in pl->pl_user_prop.

Reviewed-by: Ameer Hamza <ahamza@ixsystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #16732
---
 cmd/zfs/zfs_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index 97397726c709..7836f5909f4a 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -3761,8 +3761,13 @@ collect_dataset(zfs_handle_t *zhp, list_cbdata_t *cb)
 		if (cb->cb_json) {
 			if (pl->pl_prop == ZFS_PROP_NAME)
 				continue;
+			const char *prop_name;
+			if (pl->pl_prop != ZPROP_USERPROP)
+				prop_name = zfs_prop_to_name(pl->pl_prop);
+			else
+				prop_name = pl->pl_user_prop;
 			if (zprop_nvlist_one_property(
-			    zfs_prop_to_name(pl->pl_prop), propstr,
+			    prop_name, propstr,
 			    sourcetype, source, NULL, props,
 			    cb->cb_json_as_int) != 0)
 				nomem();

From 1a54b13aafd275d0f217ab771c21bf54ff13bcf6 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Mon, 21 Oct 2024 12:00:47 -0700
Subject: [PATCH 43/47] Tag 2.3.0-rc3

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index eca34a55d004..0fe0dedae79e 100644
--- a/META
+++ b/META
@@ -2,7 +2,7 @@ Meta:          1
 Name:          zfs
 Branch:        1.0
 Version:       2.3.0
-Release:       rc2
+Release:       rc3
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS

From 41f89d0ce778b625a23cad99fdaf92d5dd0fc12c Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Fri, 8 Nov 2024 15:09:47 +0500
Subject: [PATCH 44/47] JSON: fix user properties output for zpool list

This commit fixes JSON output for zpool list when user properties are
requested with -o flag. This case needed to be handled specifically
since zpool_prop_to_name does not return property name for user
properties, instead it is stored in pl->pl_user_prop.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #16734
---
 cmd/zpool/zpool_main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 6a45a063d91a..4458b902de31 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -6882,8 +6882,13 @@ collect_pool(zpool_handle_t *zhp, list_cbdata_t *cb)
 		if (cb->cb_json) {
 			if (pl->pl_prop == ZPOOL_PROP_NAME)
 				continue;
+			const char *prop_name;
+			if (pl->pl_prop != ZPROP_USERPROP)
+				prop_name = zpool_prop_to_name(pl->pl_prop);
+			else
+				prop_name = pl->pl_user_prop;
 			(void) zprop_nvlist_one_property(
-			    zpool_prop_to_name(pl->pl_prop), propstr,
+			    prop_name, propstr,
 			    sourcetype, NULL, NULL, props, cb->cb_json_as_int);
 		} else {
 			/*

From 092cb94ef68074c09fc3249595f2fd4c5ed20bff Mon Sep 17 00:00:00 2001
From: Umer Saleem <usaleem@ixsystems.com>
Date: Fri, 8 Nov 2024 15:13:05 +0500
Subject: [PATCH 45/47] Fix user properties output for zpool list

In zpool_get_user_prop, when called from zpool_expand_proplist and
collect_pool, we often have zpool_props present in zpool_handle_t equal
to NULL. This mostly happens when only one user property is requested
using zpool list -o <user_property>. Checking for this case and
correctly initializing the zpool_props field in zpool_handle_t fixes
this issue.

Interestingly, this issue does not occur if we query any other property
like name or guid along with a user property with -o flag because while
accessing properties like guid, zpool_prop_get_int is called which
checks for this case specifically and calls zpool_get_all_props.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Umer Saleem <usaleem@ixsystems.com>
Closes #16734
---
 lib/libzfs/libzfs_pool.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index ef0346a96fd0..15388ccbbe2b 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -471,13 +471,15 @@ int
 zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf,
     size_t len, zprop_source_t *srctype)
 {
-	nvlist_t *nv, *nvl;
+	nvlist_t *nv;
 	uint64_t ival;
 	const char *value;
 	zprop_source_t source = ZPROP_SRC_LOCAL;
 
-	nvl = zhp->zpool_props;
-	if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) {
+	if (zhp->zpool_props == NULL)
+		zpool_get_all_props(zhp);
+
+	if (nvlist_lookup_nvlist(zhp->zpool_props, propname, &nv) == 0) {
 		if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0)
 			source = ival;
 		verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);

From 5a0ec4a496aefa968b53b67e41179870786a7b10 Mon Sep 17 00:00:00 2001
From: Alexander Motin <mav@FreeBSD.org>
Date: Wed, 13 Nov 2024 10:31:50 -0500
Subject: [PATCH 46/47] L2ARC: Move different stats updates earlier

..., before we make the header or the log block visible to others.
It should fix assertion on allocated space going negative if the
header is freed once the lock is dropped, while the write is still
going.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #16040
Closes #16743
---
 module/zfs/arc.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 76dc0b19139d..2ece7d67704f 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -9287,6 +9287,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
+			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+			    ARC_FLAG_L2_WRITING);
+
+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+			l2arc_hdr_arcstats_increment(hdr);
+			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
 			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
@@ -9298,12 +9306,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			}
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
-			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
-			    ARC_FLAG_L2_WRITING);
-
-			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
-			    arc_hdr_size(hdr), hdr);
-			l2arc_hdr_arcstats_increment(hdr);
 
 			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
 			mutex_exit(hash_lock);
@@ -9333,7 +9335,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
-			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			if (commit) {
 				/* l2ad_hand will be adjusted inside. */
@@ -10585,6 +10586,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	(void) zio_nowait(wzio);
 
 	dev->l2ad_hand += asize;
+	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
 	/*
 	 * Include the committed log block's pointer  in the list of pointers
 	 * to log blocks present in the L2ARC device.
@@ -10598,7 +10601,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
-	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);

From b79bddb53ed6652409c69b91c5297fe03443f0cc Mon Sep 17 00:00:00 2001
From: Ameer Hamza <ahamza@ixsystems.com>
Date: Tue, 12 Nov 2024 19:46:13 +0500
Subject: [PATCH 47/47] zvol_os.c: Increase optimal IO size

Since zvol read and write can process up to (DMU_MAX_ACCESS / 2) bytes
in a single operation, the current optimal I/O size is too low. SCST
directly reports this value as the optimal transfer length for the
target SCSI device. Increasing it from the previous volblocksize results
in performance improvement for large block parallel I/O workloads.

Signed-off-by: Ameer Hamza <ahamza@ixsystems.com>
---
 module/os/linux/zfs/zvol_os.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
index 2396690b40fd..47aa6417068d 100644
--- a/module/os/linux/zfs/zvol_os.c
+++ b/module/os/linux/zfs/zvol_os.c
@@ -1176,7 +1176,7 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
 		limits->zql_max_segment_size = UINT_MAX;
 	}
 
-	limits->zql_io_opt = zv->zv_volblocksize;
+	limits->zql_io_opt = DMU_MAX_ACCESS / 2;
 
 	limits->zql_physical_block_size = zv->zv_volblocksize;
 	limits->zql_max_discard_sectors =