From 666903610d08611aefdde043b731abfa5dbb21a1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 12 Oct 2024 03:37:57 +1100 Subject: [PATCH 01/47] zpool/zfs: allow --json wherever -j is allowed Mostly so that with the JSON formatting options are also used, they all look the same. To my eye, `-j --json-flat-vdevs` suggests that they are different or unrelated, while `--json --json-flat-vdevs` invites no further questions. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Umer Saleem Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Closes #16632 --- cmd/zfs/zfs_main.c | 28 ++++++++-- cmd/zpool/zpool_main.c | 10 +++- man/man8/zfs-list.8 | 2 +- man/man8/zfs-mount.8 | 2 +- man/man8/zfs-program.8 | 2 +- man/man8/zfs-set.8 | 2 +- man/man8/zpool-get.8 | 4 +- man/man8/zpool-list.8 | 2 +- man/man8/zpool-status.8 | 2 +- .../functional/cli_root/json/json_sanity.ksh | 51 +++++++++++-------- 10 files changed, 72 insertions(+), 33 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f979fd189cc9..97397726c709 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -2162,6 +2162,7 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; @@ -3852,6 +3853,7 @@ zfs_do_list(int argc, char **argv) nvlist_t *data = NULL; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZFS_OPTION_JSON_NUMS_AS_INT}, {0, 0, 0, 0} }; @@ -7436,9 +7438,15 @@ share_mount(int op, int argc, char **argv) uint_t nthr; jsobj = data = item = NULL; + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, op == OP_MOUNT ? ":ajRlvo:Of" : "al")) - != -1) { + while ((c = getopt_long(argc, argv, + op == OP_MOUNT ? ":ajRlvo:Of" : "al", + op == OP_MOUNT ? long_options : NULL, NULL)) != -1) { switch (c) { case 'a': do_all = 1; @@ -8374,8 +8382,14 @@ zfs_do_channel_program(int argc, char **argv) boolean_t sync_flag = B_TRUE, json_output = B_FALSE; zpool_handle_t *zhp; + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "nt:m:j")) != -1) { + while ((c = getopt_long(argc, argv, "nt:m:j", long_options, + NULL)) != -1) { switch (c) { case 't': case 'm': { @@ -9083,7 +9097,13 @@ zfs_do_version(int argc, char **argv) int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; - while ((c = getopt(argc, argv, "j")) != -1) { + + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + {0, 0, 0, 0} + }; + + while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) { switch (c) { case 'j': json = B_TRUE; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 09a6c6043280..ea180f6b705e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7340,6 +7340,7 @@ zpool_do_list(int argc, char **argv) current_prop_type = ZFS_TYPE_POOL; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, @@ -10981,6 +10982,7 @@ zpool_do_status(int argc, char **argv) struct option long_options[] = { {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-flat-vdevs", no_argument, NULL, ZPOOL_OPTION_JSON_FLAT_VDEVS}, @@ -12589,6 +12591,7 @@ zpool_do_get(int argc, char **argv) current_prop_type = cb.cb_type; struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, {"json-int", no_argument, NULL, ZPOOL_OPTION_JSON_NUMS_AS_INT}, {"json-pool-key-guid", no_argument, NULL, ZPOOL_OPTION_POOL_KEY_GUID}, @@ -13503,7 +13506,12 @@ zpool_do_version(int argc, char **argv) int c; nvlist_t *jsobj = NULL, *zfs_ver = NULL; boolean_t json = B_FALSE; - while ((c = getopt(argc, argv, "j")) != -1) { + + struct option long_options[] = { + {"json", no_argument, NULL, 'j'}, + }; + + while ((c = getopt_long(argc, argv, "j", long_options, NULL)) != -1) { switch (c) { case 'j': json = B_TRUE; diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 index b49def08b72b..0fa7e9cc2613 100644 --- a/man/man8/zfs-list.8 +++ b/man/man8/zfs-list.8 @@ -71,7 +71,7 @@ The following fields are displayed: Used for scripting mode. Do not print headers and separate fields by a single tab instead of arbitrary white space. -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Print the output in JSON format. Specify .Sy --json-int diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 index 6116fbaab77f..921e2499e31b 100644 --- a/man/man8/zfs-mount.8 +++ b/man/man8/zfs-mount.8 @@ -59,7 +59,7 @@ .Xc Displays all ZFS file systems currently mounted. .Bl -tag -width "-j" -.It Fl j +.It Fl j , -json Displays all mounted file systems in JSON format. .El .It Xo diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8 index 928620362be7..460cd2e11cf3 100644 --- a/man/man8/zfs-program.8 +++ b/man/man8/zfs-program.8 @@ -50,7 +50,7 @@ and any attempts to access or modify other pools will cause an error. . .Sh OPTIONS .Bl -tag -width "-t" -.It Fl j +.It Fl j , -json Display channel program output in JSON format. When this flag is specified and standard output is empty - channel program encountered an error. diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8 index 204450d72ec9..f1718608b44b 100644 --- a/man/man8/zfs-set.8 +++ b/man/man8/zfs-set.8 @@ -130,7 +130,7 @@ The value can be used to display all properties that apply to the given dataset's type .Pq Sy filesystem , volume , snapshot , No or Sy bookmark . .Bl -tag -width "-s source" -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Display the output in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-get.8 b/man/man8/zpool-get.8 index 5384906f17f2..1be83526d22d 100644 --- a/man/man8/zpool-get.8 +++ b/man/man8/zpool-get.8 @@ -98,7 +98,7 @@ See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" -.It Fl j Op Ar --json-int, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of properties in JSON format. Specify .Sy --json-int @@ -157,7 +157,7 @@ See the .Xr vdevprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" -.It Fl j Op Ar --json-int +.It Fl j , -json Op Ar --json-int Display the list of properties in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8 index b0ee659701d4..6d3478a68711 100644 --- a/man/man8/zpool-list.8 +++ b/man/man8/zpool-list.8 @@ -59,7 +59,7 @@ is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds -.It Fl j Op Ar --json-int, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of pools in JSON format. Specify .Sy --json-int diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index f44f3f5f838f..b9b54185d050 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -70,7 +70,7 @@ See the option of .Nm zpool Cm iostat for complete details. -.It Fl j Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid +.It Fl j , -json Op Ar --json-int, --json-flat-vdevs, --json-pool-key-guid Display the status for ZFS pools in JSON format. Specify .Sy --json-int diff --git a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh index e598dd57181e..d092a3b0e828 100755 --- a/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/json/json_sanity.ksh @@ -30,28 +30,39 @@ # STRATEGY: # 1. Run different zfs/zpool -j commands and check for valid JSON +# +# -j and --json mean the same thing. Each command will be run twice, replacing +# JSONFLAG with the flag under test. list=( - "zpool status -j -g --json-int --json-flat-vdevs --json-pool-key-guid" - "zpool status -p -j -g --json-int --json-flat-vdevs --json-pool-key-guid" - "zpool status -j -c upath" - "zpool status -j" - "zpool status -j testpool1" - "zpool list -j" - "zpool list -j -g" - "zpool list -j -o fragmentation" - "zpool get -j size" - "zpool get -j all" - "zpool version -j" - "zfs list -j" - "zfs list -j testpool1" - "zfs get -j all" - "zfs get -j available" - "zfs mount -j" - "zfs version -j" + "zpool status JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid" + "zpool status -p JSONFLAG -g --json-int --json-flat-vdevs --json-pool-key-guid" + "zpool status JSONFLAG -c upath" + "zpool status JSONFLAG" + "zpool status JSONFLAG testpool1" + "zpool list JSONFLAG" + "zpool list JSONFLAG -g" + "zpool list JSONFLAG -o fragmentation" + "zpool get JSONFLAG size" + "zpool get JSONFLAG all" + "zpool version JSONFLAG" + "zfs list JSONFLAG" + "zfs list JSONFLAG testpool1" + "zfs get JSONFLAG all" + "zfs get JSONFLAG available" + "zfs mount JSONFLAG" + "zfs version JSONFLAG" ) -for cmd in "${list[@]}" ; do - log_must eval "$cmd | jq > /dev/null" -done +function run_json_tests +{ + typeset flag=$1 + for cmd in "${list[@]}" ; do + cmd=${cmd//JSONFLAG/$flag} + log_must eval "$cmd | jq > /dev/null" + done +} + +log_must run_json_tests -j +log_must run_json_tests --json log_pass "zpool and zfs commands outputted valid JSON" From 58162960a1dbcd35c72e42a4bbb821ccc5a0c505 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Fri, 11 Oct 2024 18:55:17 +0200 Subject: [PATCH 02/47] zdb: fix printf format in dump_zap() When compiling zdb.c on 32-bit platforms, a format conversion error is reported for a printf() in dump_zap(). Change %l to macro %" PRIu64 " to match the platform size of a 64-bit unsigned integer. Reviewed-by: Brian Behlendorf Signed-off-by: Martin Matuska Closes #16635 --- cmd/zdb/zdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 2c136bacb1a8..16c7025802f3 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1131,7 +1131,7 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size) !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); if (key64) - (void) printf("\t\t0x%010lx = ", + (void) printf("\t\t0x%010" PRIu64 "x = ", *(uint64_t *)attrp->za_name); else (void) printf("\t\t%s = ", attrp->za_name); From 7f830d783b1c0cbfc504cea8b2f7052ad58242d7 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 14:16:00 -0700 Subject: [PATCH 03/47] CI: Stick with ubuntu-22.04 for CodeQL analysis The ubuntu-latest alias now refers to ubuntu-24.04 instead of ubuntu-22.04 which causes CodeQL's autobuild to fail with: cpp/autobuilder: deptrace not supported in ubuntu 24.04 Until deptrace is supported by ubuntu-24.04 hosted runners request ubuntu-22.04 which is supported. Signed-off-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: George Melikov Closes #16639 --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 2656a20fea0d..e975d7dd00b9 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -11,7 +11,7 @@ concurrency: jobs: analyze: name: Analyze - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: actions: read contents: read From 5bc27acf512d6f778b4f1d838fc8df0b53f5567b Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 14:22:24 -0700 Subject: [PATCH 04/47] ZTS: Slightly increase dedup_quota limit As described in the comment above this check the space used by logged entries is not accounted for and some margin needs to be added in. While uncommon we have slightly exceeded the 600,000 threshold on some CI run so we increase the limit a bit more. Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #16637 --- tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index 326152b510a9..b1657648b5a1 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -221,7 +221,7 @@ function ddt_dedup_vdev_limit # For here, we just set the entry count a little higher than what we # expect to allow for some instability. # - log_must test $(ddt_entries) -le 600000 + log_must test $(ddt_entries) -le 650000 do_clean } From c645b07eaa0e1def37ffe00d63e5835e8b13742d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 11 Oct 2024 16:12:16 -0700 Subject: [PATCH 05/47] ZTS: Increase zpool_import_parallel_pos import margin Increase the pool import time allowed by assuming a minimum reduction to 1/2 instead of 1/3 when comparing sequential to parallel import times. This is sufficient to verify parallel imports are working as intended and should address the occasional false positive failure when the time is slightly exceeded. Reviewed-by: Tino Reichardt Reviewed-by: George Melikov Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #16638 --- .../cli_root/zpool_import/zpool_import_parallel_pos.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh index 71b2437a37ec..57a7412e37cc 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh @@ -113,7 +113,7 @@ wait parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 3)) +log_must test $parallel_time -lt $(($sequential_time / 2)) # # export pools with import delay injectors @@ -132,6 +132,6 @@ log_must zpool import -a -d $DEVICE_DIR -f parallel_time=$SECONDS log_note "asyncronously imported 4 pools in $parallel_time seconds" -log_must test $parallel_time -lt $(($sequential_time / 3)) +log_must test $parallel_time -lt $(($sequential_time / 2)) log_pass "Pool imports occur in parallel" From 56871e465a9aee71b532d20c58853c75190257ef Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Sat, 12 Oct 2024 13:48:56 -0700 Subject: [PATCH 06/47] Fallback to strerror() when strerror_l() isn't available Some C libraries, such as uClibc, do not provide strerror_l() in which case we fallback to strerror(). Reviewed-by: Tino Reichardt Signed-off-by: Brian Behlendorf Closes #16636 Closes #16640 --- config/user.m4 | 2 +- include/libzutil.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/user.m4 b/config/user.m4 index badd920d2b8a..4e31745a2abc 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -33,7 +33,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - AC_CHECK_FUNCS([execvpe issetugid mlockall strlcat strlcpy gettid]) + AC_CHECK_FUNCS([execvpe issetugid mlockall strerror_l strlcat strlcpy gettid]) AC_SUBST(RM) ]) diff --git a/include/libzutil.h b/include/libzutil.h index e2108ceeaa44..f8712340cc5e 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -276,7 +276,11 @@ _LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, * Thread-safe strerror() for use in ZFS libraries */ static inline char *zfs_strerror(int errnum) { +#ifdef HAVE_STRERROR_L return (strerror_l(errnum, uselocale(0))); +#else + return (strerror(errnum)); +#endif } #ifdef __cplusplus From 77df762a1bd9180286645d4ebd195c16579225ea Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 12 Oct 2024 09:53:32 +0200 Subject: [PATCH 07/47] ZTS: Optimize Kernel Same-page Merging (KSM) Kernel same-page Merging (KSM) allows KVM guests to share identical memory pages. These shared pages are usually common libraries or other identical, high-use data. The current configuration was a bit to lazy - so KSM didn't work very well. With the new configuration I could run 3 Linux VMs in parralel. FreeBSD can't benefit from it. But FreeBSD is not so memory hungry in general, so there is no need for it ;) Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16641 --- .github/workflows/scripts/qemu-1-setup.sh | 18 ++++++++++-------- .github/workflows/scripts/qemu-5-setup.sh | 12 +++++++----- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/.github/workflows/scripts/qemu-1-setup.sh b/.github/workflows/scripts/qemu-1-setup.sh index ebd80a2f98c1..f838da34efff 100755 --- a/.github/workflows/scripts/qemu-1-setup.sh +++ b/.github/workflows/scripts/qemu-1-setup.sh @@ -18,19 +18,21 @@ ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -q -N "" # we expect RAM shortage cat << EOF | sudo tee /etc/ksmtuned.conf > /dev/null +# /etc/ksmtuned.conf - Configuration file for ksmtuned # https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/7/html/virtualization_tuning_and_optimization_guide/chap-ksm KSM_MONITOR_INTERVAL=60 # Millisecond sleep between ksm scans for 16Gb server. # Smaller servers sleep more, bigger sleep less. -KSM_SLEEP_MSEC=10 -KSM_NPAGES_BOOST=300 -KSM_NPAGES_DECAY=-50 -KSM_NPAGES_MIN=64 -KSM_NPAGES_MAX=2048 - -KSM_THRES_COEF=25 -KSM_THRES_CONST=2048 +KSM_SLEEP_MSEC=30 + +KSM_NPAGES_BOOST=0 +KSM_NPAGES_DECAY=0 +KSM_NPAGES_MIN=1000 +KSM_NPAGES_MAX=25000 + +KSM_THRES_COEF=80 +KSM_THRES_CONST=8192 LOGFILE=/var/log/ksmtuned.log DEBUG=1 diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 7acb67a27920..5034c1896304 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ -14,17 +14,19 @@ PID=$(pidof /usr/bin/qemu-system-x86_64) tail --pid=$PID -f /dev/null sudo virsh undefine openzfs +# default values per test vm: +VMs=2 +CPU=2 + # definitions of per operating system case "$OS" in + # FreeBSD can't be optimized via ksmtuned freebsd*) - VMs=2 - CPU=3 RAM=6 ;; *) - VMs=2 - CPU=3 - RAM=7 + # Linux can be optimized via ksmtuned + RAM=8 ;; esac From b5a3825244583d10730b08dc29070ff81e84eb27 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 12 Oct 2024 09:54:50 +0200 Subject: [PATCH 08/47] ZTS: Make use of optimal CPU pinning With CPU pinning, we should get some speedup because of better cpu cache re-use. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #16641 --- .github/workflows/scripts/qemu-5-setup.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/qemu-5-setup.sh b/.github/workflows/scripts/qemu-5-setup.sh index 5034c1896304..bc40e8894b22 100755 --- a/.github/workflows/scripts/qemu-5-setup.sh +++ b/.github/workflows/scripts/qemu-5-setup.sh @@ -18,10 +18,12 @@ sudo virsh undefine openzfs VMs=2 CPU=2 -# definitions of per operating system +# cpu pinning +CPUSET=("0,1" "2,3") + case "$OS" in - # FreeBSD can't be optimized via ksmtuned freebsd*) + # FreeBSD can't be optimized via ksmtuned RAM=6 ;; *) @@ -75,6 +77,7 @@ EOF --cpu host-passthrough \ --virt-type=kvm --hvm \ --vcpus=$CPU,sockets=1 \ + --cpuset=${CPUSET[$((i-1))]} \ --memory $((1024*RAM)) \ --memballoon model=virtio \ --graphics none \ From 0409c47fe01f36c11bfde4aeeece12697b337e6d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 9 Oct 2024 13:47:09 -0700 Subject: [PATCH 09/47] Tag 2.3.0-rc2 Signed-off-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index c2eff4ba0bbd..eca34a55d004 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.3.0 -Release: rc1 +Release: rc2 Release-Tags: relext License: CDDL Author: OpenZFS From 3d9129a7b6edf277ceb35acf2938285e3d410fdf Mon Sep 17 00:00:00 2001 From: Warner Losh Date: Wed, 16 Oct 2024 11:00:40 -0600 Subject: [PATCH 10/47] freebsd: Use compiler.h from FreeBSD's base's linuxkpi The FreeBSD linux/compiler.h in OpenZFS was copied from a very old version of FreeBSD's linuxkpi's linux/compiler.h. There's no need for this duplication. Use FreeBSD's linuxkpi version instead, and provide zfs_fallthrough to augment it (it's all that's needed). Use #pragma once to avoid naming issues for guard variables. Since this is a complete rewrite, use my copyright here (the original code in FreeBSD still credits everybody). This works back at least to FreeBSD 12.4, which is not out of support, and all newer releases. Remove extra copies of macros that were defined elsewhere, but are now properly defined in LinuxKPI so are redundant. Sponsored-by: Netflix Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Warner Losh Closes #16650 --- include/os/freebsd/linux/compiler.h | 83 +++------------------------- include/os/freebsd/spl/sys/ccompat.h | 9 --- include/os/freebsd/spl/sys/debug.h | 4 -- 3 files changed, 8 insertions(+), 88 deletions(-) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h index b408b77c746d..24f09c722158 100644 --- a/include/os/freebsd/linux/compiler.h +++ b/include/os/freebsd/linux/compiler.h @@ -1,10 +1,5 @@ /* - * Copyright (c) 2010 Isilon Systems, Inc. - * Copyright (c) 2010 iXsystems, Inc. - * Copyright (c) 2010 Panasas, Inc. - * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. - * Copyright (c) 2015 François Tigeot - * All rights reserved. + * Copyright (c) 2024 Warner Losh. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,76 +21,14 @@ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * $FreeBSD$ */ -#ifndef _LINUX_COMPILER_H_ -#define _LINUX_COMPILER_H_ - -#include - -#define __user -#define __kernel -#define __safe -#define __force -#define __nocast -#define __iomem -#define __chk_user_ptr(x) ((void)0) -#define __chk_io_ptr(x) ((void)0) -#define __builtin_warning(x, y...) (1) -#define __acquires(x) -#define __releases(x) -#define __acquire(x) do { } while (0) -#define __release(x) do { } while (0) -#define __cond_lock(x, c) (c) -#define __bitwise -#define __devinitdata -#define __deprecated -#define __init -#define __initconst -#define __devinit -#define __devexit -#define __exit -#define __rcu -#define __percpu -#define __weak __weak_symbol -#define __malloc -#define ___stringify(...) #__VA_ARGS__ -#define __stringify(...) ___stringify(__VA_ARGS__) -#define __attribute_const__ __attribute__((__const__)) -#undef __always_inline -#define __always_inline inline -#define noinline __noinline -#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) -#define zfs_fallthrough __attribute__((__fallthrough__)) - -#if !defined(_KERNEL) && !defined(_STANDALONE) -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) -#endif -#define typeof(x) __typeof(x) - -#define uninitialized_var(x) x = x -#define __maybe_unused __unused -#define __always_unused __unused -#define __must_check __result_use_check - -#define __printf(a, b) __printflike(a, b) -#define barrier() __asm__ __volatile__("": : :"memory") -#define ___PASTE(a, b) a##b -#define __PASTE(a, b) ___PASTE(a, b) - -#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) - -#define WRITE_ONCE(x, v) do { \ - barrier(); \ - ACCESS_ONCE(x) = (v); \ - barrier(); \ -} while (0) - -#define lockless_dereference(p) READ_ONCE(p) +/* + * FreeBSD's LinuxKPI compiler.h as far back as FreeBSD 12 has what we need, + * except zfs_fallthrough. + */ +#pragma once -#define _AT(T, X) ((T)(X)) +#include -#endif /* _LINUX_COMPILER_H_ */ +#define zfs_fallthrough __attribute__((__fallthrough__)) diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h index 48749fb8eea2..07b3515ad964 100644 --- a/include/os/freebsd/spl/sys/ccompat.h +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -70,15 +70,6 @@ hlist_del(struct hlist_node *n) n->next->pprev = n->pprev; } /* BEGIN CSTYLED */ -#define READ_ONCE(x) ({ \ - __typeof(x) __var = ({ \ - barrier(); \ - ACCESS_ONCE(x); \ - }); \ - barrier(); \ - __var; \ -}) - #define HLIST_HEAD_INIT { } #define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT #define INIT_HLIST_HEAD(head) (head)->first = NULL diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index f041dde34fc8..9eb424dd0373 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -95,10 +95,6 @@ spl_assert(const char *buf, const char *file, const char *func, int line) #ifndef expect #define expect(expr, value) (__builtin_expect((expr), (value))) #endif -#ifndef __linux__ -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) -#endif #define PANIC(fmt, a...) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) From 36a67b50a230aea6ddcef277ff3715fb9b877c07 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Thu, 17 Oct 2024 18:09:39 +0500 Subject: [PATCH 11/47] Fix inconsistent mount options for ZFS root While mounting ZFS root during boot on Linux distributions from initrd, mount from busybox is effectively used which executes mount system call directly. This skips the ZFS helper mount.zfs, which checks and enables the mount options as specified in dataset properties. As a result, datasets mounted during boot from initrd do not have correct mount options as specified in ZFS dataset properties. There has been an attempt to use mount.zfs in zfs initrd script, responsible for mounting the ZFS root filesystem (PR#13305). This was later reverted (PR#14908) after discovering that using mount.zfs breaks mounting of snapshots on root (/) and other child datasets of root have the same issue (Issue#9461). This happens because switching from busybox mount to mount.zfs correctly parses the mount options but also adds 'mntpoint=/root' to the mount options, which is then prepended to the snapshot mountpoint in '.zfs/snapshot'. '/root' is the directory on Debian with initramfs-tools where root filesystem is mounted before pivot_root. When Linux runtime is reached, trying to access the snapshots on root results in automounting the snapshot on '/root/.zfs/*', which fails. This commit attempts to fix the automounting of snapshots on root, while using mount.zfs in initrd script. Since the mountpoint of dataset is stored in vfs_mntpoint field, we can check if current mountpoint of dataset and vfs_mntpoint are same or not. If they are not same, reset the vfs_mntpoint field with current mountpoint. This fixes the mountpoints of root dataset and children in respective vfs_mntpoint fields when we try to access the snapshots of root dataset or its children. With correct mountpoint for root dataset and children stored in vfs_mntpoint, all snapshots of root dataset are mounted correctly and become accessible. This fix will come into play only if current process, that is trying to access the snapshots is not in chroot context. The Linux kernel API that is used to convert struct path into char format (d_path), returns the complete path for given struct path. It works in chroot environment as well and returns the correct path from original filesystem root. However d_path fails to return the complete path if any directory from original root filesystem is mounted using --bind flag or --rbind flag in chroot environment. In this case, if we try to access the snapshot from outside the chroot environment, d_path returns the path correctly, i.e. it returns the correct path to the directory that is mounted with --bind flag. However inside the chroot environment, it only returns the path inside chroot. For now, there is not a better way in my understanding that gives the complete path in char format and handles the case where directories from root filesystem are mounted with --bind or --rbind on another path which user will later chroot into. So this fix gets enabled if current process trying to access the snapshot is not in chroot context. With the snapshots issue fixed for root filesystem, using mount.zfs in ZFS initrd script, mounts the datasets with correct mount options. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #16646 --- contrib/initramfs/scripts/zfs | 5 +- include/os/linux/zfs/sys/zfs_vfsops_os.h | 1 + module/os/linux/zfs/zfs_ctldir.c | 109 +++++++++++++++++++++-- module/os/linux/zfs/zfs_vfsops.c | 6 +- 4 files changed, 111 insertions(+), 10 deletions(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 0a2bd2efda7a..c569b2528368 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ mount_fs() # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount -o zfsutil -t zfs" + ZFS_CMD="mount.zfs -o zfsutil" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -359,9 +359,8 @@ mount_fs() # isn't the root fs. return 0 fi - # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount -t zfs" + ZFS_CMD="mount.zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 7067eb17900d..30aa3a103d33 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -69,6 +69,7 @@ typedef struct vfs { boolean_t vfs_do_relatime; boolean_t vfs_nbmand; boolean_t vfs_do_nbmand; + kmutex_t vfs_mntpt_lock; } vfs_t; typedef struct zfs_mnt { diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 8a42a075cd25..f60d6ae91e0b 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -767,9 +767,6 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, uint64_t id, pos = 0; int error = 0; - if (zfsvfs->z_vfs->vfs_mntpoint == NULL) - return (SET_ERROR(ENOENT)); - cookie = spl_fstrans_mark(); snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); @@ -786,8 +783,14 @@ zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, break; } - snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", - zfsvfs->z_vfs->vfs_mntpoint, snapname); + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, snapname); + } else + error = SET_ERROR(ENOENT); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + out: kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); spl_fstrans_unmark(cookie); @@ -1049,6 +1052,66 @@ exportfs_flush(void) (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); } +/* + * Returns the path in char format for given struct path. Uses + * d_path exported by kernel to convert struct path to char + * format. Returns the correct path for mountpoints and chroot + * environments. + * + * If chroot environment has directories that are mounted with + * --bind or --rbind flag, d_path returns the complete path inside + * chroot environment but does not return the absolute path, i.e. + * the path to chroot environment is missing. + */ +static int +get_root_path(struct path *path, char *buff, int len) +{ + char *path_buffer, *path_ptr; + int error = 0; + + path_get(path); + path_buffer = kmem_zalloc(len, KM_SLEEP); + path_ptr = d_path(path, path_buffer, len); + if (IS_ERR(path_ptr)) + error = SET_ERROR(-PTR_ERR(path_ptr)); + else + strcpy(buff, path_ptr); + + kmem_free(path_buffer, len); + path_put(path); + return (error); +} + +/* + * Returns if the current process root is chrooted or not. Linux + * kernel exposes the task_struct for current process and init. + * Since init process root points to actual root filesystem when + * Linux runtime is reached, we can compare the current process + * root with init process root to determine if root of the current + * process is different from init, which can reliably determine if + * current process is in chroot context or not. + */ +static int +is_current_chrooted(void) +{ + struct task_struct *curr = current, *global = &init_task; + struct path cr_root, gl_root; + + task_lock(curr); + get_fs_root(curr->fs, &cr_root); + task_unlock(curr); + + task_lock(global); + get_fs_root(global->fs, &gl_root); + task_unlock(global); + + int chrooted = !path_equal(&cr_root, &gl_root); + path_put(&gl_root); + path_put(&cr_root); + + return (chrooted); +} + /* * Attempt to unmount a snapshot by making a call to user space. * There is no assurance that this can or will succeed, is just a @@ -1123,14 +1186,50 @@ zfsctl_snapshot_mount(struct path *path, int flags) if (error) goto error; + if (is_current_chrooted() == 0) { + /* + * Current process is not in chroot context + */ + + char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + struct path mnt_path; + mnt_path.mnt = path->mnt; + mnt_path.dentry = path->mnt->mnt_root; + + /* + * Get path to current mountpoint + */ + error = get_root_path(&mnt_path, m, MAXPATHLEN); + if (error != 0) { + kmem_free(m, MAXPATHLEN); + goto error; + } + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); + if (zfsvfs->z_vfs->vfs_mntpoint != NULL) { + /* + * If current mnountpoint and vfs_mntpoint are not same, + * store current mountpoint in vfs_mntpoint. + */ + if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) { + kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint); + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + } + } else + zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); + kmem_free(m, MAXPATHLEN); + } + /* * Construct a mount point path from sb of the ctldir inode and dirent * name, instead of from d_path(), so that chroot'd process doesn't fail * on mount.zfs(8). */ + mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", dname(dentry)); + mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock); snprintf(options, 7, "%s", zfs_snapshot_no_setuid ? "nosuid" : "suid"); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index de3e8c89cfdd..3c53a8a315c3 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -115,7 +115,7 @@ zfsvfs_vfs_free(vfs_t *vfsp) if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); - + mutex_destroy(&vfsp->vfs_mntpt_lock); kmem_free(vfsp, sizeof (vfs_t)); } } @@ -197,10 +197,11 @@ zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: + if (vfsp->vfs_mntpoint != NULL) + kmem_strfree(vfsp->vfs_mntpoint); vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); - break; default: break; @@ -219,6 +220,7 @@ zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + mutex_init(&tmp_vfsp->vfs_mntpt_lock, NULL, MUTEX_DEFAULT, NULL); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; From bcd61d95790d3b005a7124775def9a2bc3ecb128 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 15 Oct 2024 22:18:19 +1100 Subject: [PATCH 12/47] libspl/backtrace: dump registers in libunwind backtraces More useful stuff, especially when trying to follow a disassembly. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 44 +++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index d26d742106e2..e969823c2e5d 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -65,32 +65,58 @@ libspl_backtrace(int fd) ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; - unw_word_t loc; + unw_word_t v; char buf[128]; - size_t n; + size_t n, c; - ret = write(fd, "Call trace:\n", 12); unw_getcontext(&uc); + unw_init_local(&cp, &uc); + ret = write(fd, "Registers:\n", 11); + c = 0; + for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { + if (unw_get_reg(&cp, regnum, &v) < 0) + continue; + const char *name = unw_regname(regnum); + for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} + if (n == 0) { + buf[0] = '?'; + n = libspl_u64_to_hex_str(regnum, 2, + &buf[1], sizeof (buf)-1) + 1; + name = buf; + } + ret = write(fd, " ", 5-MIN(n, 3)); + ret = write(fd, name, n); + ret = write(fd, ": 0x", 4); + n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + ret = write(fd, buf, n); + if (!(++c % 3)) + ret = write(fd, "\n", 1); + } + if (c % 3) + ret = write(fd, "\n", 1); + + unw_init_local(&cp, &uc); + ret = write(fd, "Call trace:\n", 12); while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &loc); + unw_get_reg(&cp, UNW_REG_IP, &v); ret = write(fd, " [0x", 5); - n = libspl_u64_to_hex_str(loc, 10, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); ret = write(fd, buf, n); ret = write(fd, "] ", 2); - unw_get_proc_name(&cp, buf, sizeof (buf), &loc); + unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} ret = write(fd, buf, n); ret = write(fd, "+0x", 3); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); ret = write(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF ret = write(fd, " (in ", 5); - unw_get_elf_filename(&cp, buf, sizeof (buf), &loc); + unw_get_elf_filename(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} ret = write(fd, buf, n); ret = write(fd, " +0x", 4); - n = libspl_u64_to_hex_str(loc, 2, buf, sizeof (buf)); + n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); ret = write(fd, buf, n); ret = write(fd, ")", 1); #endif From d5db840260a1e39af11766227132e9b4d871b21b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 13:49:41 +1100 Subject: [PATCH 13/47] libspl/backtrace: helper macros for output My eyes are going blurry looking at all those write calls. This is much nicer. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Close #16653 --- lib/libspl/backtrace.c | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index e969823c2e5d..b4568d124dde 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -25,12 +25,20 @@ #include #include +#include #include /* - * libspl_backtrace() must be safe to call from inside a signal hander. This - * mostly means it must not allocate, and so we can't use things like printf. + * Output helpers. libspl_backtrace() must not block, must be thread-safe and + * must be safe to call from a signal handler. At least, that means not having + * printf, so we end up having to call write() directly on the fd. That's + * awkward, as we always have to pass through a length, and some systems will + * complain if we don't consume the return. So we have some macros to make + * things a little more palatable. */ +#define spl_bt_write_n(fd, s, n) \ + do { ssize_t r __maybe_unused = write(fd, s, n); } while (0) +#define spl_bt_write(fd, s) spl_bt_write_n(fd, s, sizeof (s)) #if defined(HAVE_LIBUNWIND) #define UNW_LOCAL_ONLY @@ -62,7 +70,6 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); unw_context_t uc; unw_cursor_t cp; unw_word_t v; @@ -72,7 +79,7 @@ libspl_backtrace(int fd) unw_getcontext(&uc); unw_init_local(&cp, &uc); - ret = write(fd, "Registers:\n", 11); + spl_bt_write(fd, "Registers:\n"); c = 0; for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { if (unw_get_reg(&cp, regnum, &v) < 0) @@ -85,42 +92,42 @@ libspl_backtrace(int fd) &buf[1], sizeof (buf)-1) + 1; name = buf; } - ret = write(fd, " ", 5-MIN(n, 3)); - ret = write(fd, name, n); - ret = write(fd, ": 0x", 4); + spl_bt_write_n(fd, " ", 5-MIN(n, 3)); + spl_bt_write_n(fd, name, n); + spl_bt_write(fd, ": 0x"); n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); - ret = write(fd, buf, n); + spl_bt_write_n(fd, buf, n); if (!(++c % 3)) - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); } if (c % 3) - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); unw_init_local(&cp, &uc); - ret = write(fd, "Call trace:\n", 12); + spl_bt_write(fd, "Call trace:\n"); while (unw_step(&cp) > 0) { unw_get_reg(&cp, UNW_REG_IP, &v); - ret = write(fd, " [0x", 5); + spl_bt_write(fd, " [0x"); n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, "] ", 2); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, "] "); unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, "+0x", 3); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, "+0x"); n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); + spl_bt_write_n(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF - ret = write(fd, " (in ", 5); + spl_bt_write(fd, " (in "); unw_get_elf_filename(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - ret = write(fd, buf, n); - ret = write(fd, " +0x", 4); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, " +0x"); n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); - ret = write(fd, buf, n); - ret = write(fd, ")", 1); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, ")"); #endif - ret = write(fd, "\n", 1); + spl_bt_write(fd, "\n"); } } #elif defined(HAVE_BACKTRACE) @@ -129,15 +136,12 @@ libspl_backtrace(int fd) void libspl_backtrace(int fd) { - ssize_t ret __attribute__((unused)); void *btptrs[64]; size_t nptrs = backtrace(btptrs, 64); - ret = write(fd, "Call trace:\n", 12); + spl_bt_write(fd, "Call trace:\n"); backtrace_symbols_fd(btptrs, nptrs, fd); } #else -#include - void libspl_backtrace(int fd __maybe_unused) { From f52d7aaaac2472a5d2c9675dec017910881d5ecb Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 14:30:05 +1100 Subject: [PATCH 14/47] libspl/backtrace: rename and document hex conversion function Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index b4568d124dde..b541858cbc56 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -44,8 +44,13 @@ #define UNW_LOCAL_ONLY #include +/* + * Convert `v` to ASCII hex characters. The bottom `n` nybbles (4-bits ie one + * hex digit) will be written, up to `buflen`. The buffer will not be + * null-terminated. Returns the number of digits written. + */ static size_t -libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) +spl_bt_u64_to_hex_str(uint64_t v, size_t n, char *buf, size_t buflen) { static const char hexdigits[] = { '0', '1', '2', '3', '4', '5', '6', '7', @@ -53,10 +58,10 @@ libspl_u64_to_hex_str(uint64_t v, size_t digits, char *buf, size_t buflen) }; size_t pos = 0; - boolean_t want = (digits == 0); + boolean_t want = (n == 0); for (int i = 15; i >= 0; i--) { const uint64_t d = v >> (i * 4) & 0xf; - if (!want && (d != 0 || digits > i)) + if (!want && (d != 0 || n > i)) want = B_TRUE; if (want) { buf[pos++] = hexdigits[d]; @@ -88,14 +93,14 @@ libspl_backtrace(int fd) for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} if (n == 0) { buf[0] = '?'; - n = libspl_u64_to_hex_str(regnum, 2, + n = spl_bt_u64_to_hex_str(regnum, 2, &buf[1], sizeof (buf)-1) + 1; name = buf; } spl_bt_write_n(fd, " ", 5-MIN(n, 3)); spl_bt_write_n(fd, name, n); spl_bt_write(fd, ": 0x"); - n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); if (!(++c % 3)) spl_bt_write(fd, "\n"); @@ -108,14 +113,14 @@ libspl_backtrace(int fd) while (unw_step(&cp) > 0) { unw_get_reg(&cp, UNW_REG_IP, &v); spl_bt_write(fd, " [0x"); - n = libspl_u64_to_hex_str(v, 18, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "] "); unw_get_proc_name(&cp, buf, sizeof (buf), &v); for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "+0x"); - n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); #ifdef HAVE_LIBUNWIND_ELF spl_bt_write(fd, " (in "); @@ -123,7 +128,7 @@ libspl_backtrace(int fd) for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} spl_bt_write_n(fd, buf, n); spl_bt_write(fd, " +0x"); - n = libspl_u64_to_hex_str(v, 2, buf, sizeof (buf)); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, ")"); #endif From b4cd10ce5b581865546d6cbd5d35c44ba8d031f4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 18 Oct 2024 15:10:33 +1100 Subject: [PATCH 15/47] libspl/backtrace: comment and harden libunwind backtracer This is the sort of code that we get right once and never look at again. Anyone reading this code is already likely in the middle of a debugging nightmare, and then they have a wall of manual string construction and an unfamiliar and idiosyncratic library to deal with. So, comment the whole thing to try to make it clear what's going on. In pursuit of the above, I've added return checks to some of the libunwind calls, fixed the frame loop to not skip the "top" frame (however unseful it may be), and fix a couple of calls to spl_bt_u64_to_hex_str() which requested 18 digits instead of 16. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16653 --- lib/libspl/backtrace.c | 166 ++++++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 25 deletions(-) diff --git a/lib/libspl/backtrace.c b/lib/libspl/backtrace.c index b541858cbc56..6e8b3b12122d 100644 --- a/lib/libspl/backtrace.c +++ b/lib/libspl/backtrace.c @@ -79,61 +79,177 @@ libspl_backtrace(int fd) unw_cursor_t cp; unw_word_t v; char buf[128]; - size_t n, c; + size_t n; + int err; + /* Snapshot the current frame and state. */ unw_getcontext(&uc); - unw_init_local(&cp, &uc); + /* + * TODO: walk back to the frame that tripped the assertion / the place + * where the signal was recieved. + */ + + /* + * Register dump. We're going to loop over all the registers in the + * top frame, and show them, with names, in a nice three-column + * layout, which keeps us within 80 columns. + */ spl_bt_write(fd, "Registers:\n"); - c = 0; + + /* Initialise a frame cursor, starting at the current frame */ + unw_init_local(&cp, &uc); + + /* + * libunwind's list of possible registers for this architecture is an + * enum, unw_regnum_t. UNW_TDEP_LAST_REG is the highest-numbered + * register in that list, however, not all register numbers in this + * range are defined by the architecture, and not all defined registers + * will be present on every implementation of that architecture. + * Moreover, libunwind provides nice names for most, but not all + * registers, but these are hardcoded; a name being available does not + * mean that register is available. + * + * So, we have to pull this all together here. We try to get the value + * of every possible register. If we get a value for it, then the + * register must exist, and so we get its name. If libunwind has no + * name for it, we synthesize something. These cases should be rare, + * and they're usually for uninteresting or niche registers, so it + * shouldn't really matter. We can see the value, and that's the main + * thing. + */ + uint_t cols = 0; for (uint_t regnum = 0; regnum <= UNW_TDEP_LAST_REG; regnum++) { + /* + * Get the value. Any error probably means the register + * doesn't exist, and we skip it. + */ if (unw_get_reg(&cp, regnum, &v) < 0) continue; + + /* + * Register name. If libunwind doesn't have a name for it, + * it will return "???". As a shortcut, we just treat '?' + * is an alternate end-of-string character. + */ const char *name = unw_regname(regnum); for (n = 0; name[n] != '\0' && name[n] != '?'; n++) {} if (n == 0) { + /* + * No valid name, so make one of the form "?xx", where + * "xx" is the two-char hex of libunwind's register + * number. + */ buf[0] = '?'; n = spl_bt_u64_to_hex_str(regnum, 2, &buf[1], sizeof (buf)-1) + 1; name = buf; } + + /* + * Two spaces of padding before each column, plus extra + * spaces to align register names shorter than three chars. + */ spl_bt_write_n(fd, " ", 5-MIN(n, 3)); + + /* Register name and column punctuation */ spl_bt_write_n(fd, name, n); spl_bt_write(fd, ": 0x"); - n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); + + /* + * Convert register value (from unw_get_reg()) to hex. We're + * assuming that all registers are 64-bits wide, which is + * probably fine for any general-purpose registers on any + * machine currently in use. A more generic way would be to + * look at the width of unw_word_t, but that would also + * complicate the column code a bit. This is fine. + */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); - if (!(++c % 3)) + + /* Every third column, emit a newline */ + if (!(++cols % 3)) spl_bt_write(fd, "\n"); } - if (c % 3) + + /* If we finished before the third column, emit a newline. */ + if (cols % 3) spl_bt_write(fd, "\n"); - unw_init_local(&cp, &uc); + /* Now the main event, the backtrace. */ spl_bt_write(fd, "Call trace:\n"); - while (unw_step(&cp) > 0) { - unw_get_reg(&cp, UNW_REG_IP, &v); + + /* Reset the cursor to the top again. */ + unw_init_local(&cp, &uc); + + do { + /* + * Getting the IP should never fail; libunwind handles it + * specially, because its used a lot internally. Still, no + * point being silly about it, as the last thing we want is + * our crash handler to crash. So if it ever does fail, we'll + * show an error line, but keep going to the next frame. + */ + if (unw_get_reg(&cp, UNW_REG_IP, &v) < 0) { + spl_bt_write(fd, " [couldn't get IP register; " + "corrupt frame?]"); + continue; + } + + /* IP & punctuation */ + n = spl_bt_u64_to_hex_str(v, 16, buf, sizeof (buf)); spl_bt_write(fd, " [0x"); - n = spl_bt_u64_to_hex_str(v, 18, buf, sizeof (buf)); spl_bt_write_n(fd, buf, n); spl_bt_write(fd, "] "); - unw_get_proc_name(&cp, buf, sizeof (buf), &v); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, "+0x"); - n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); - spl_bt_write_n(fd, buf, n); + + /* + * Function ("procedure") name for the current frame. `v` + * receives the offset from the named function to the IP, which + * we show as a "+offset" suffix. + * + * If libunwind can't determine the name, we just show "???" + * instead. We've already displayed the IP above; that will + * have to do. + * + * unw_get_proc_name() will return ENOMEM if the buffer is too + * small, instead truncating the name. So we treat that as a + * success and use whatever is in the buffer. + */ + err = unw_get_proc_name(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write_n(fd, buf, n); + + /* Offset from proc name */ + spl_bt_write(fd, "+0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + } else + spl_bt_write(fd, "???"); + #ifdef HAVE_LIBUNWIND_ELF - spl_bt_write(fd, " (in "); - unw_get_elf_filename(&cp, buf, sizeof (buf), &v); - for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, " +0x"); - n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); - spl_bt_write_n(fd, buf, n); - spl_bt_write(fd, ")"); + /* + * Newer libunwind has unw_get_elf_filename(), which gets + * the name of the ELF object that the frame was executing in. + * Like `unw_get_proc_name()`, `v` recieves the offset within + * the file, and UNW_ENOMEM indicates that a truncate filename + * was left in the buffer. + */ + err = unw_get_elf_filename(&cp, buf, sizeof (buf), &v); + if (err == 0 || err == -UNW_ENOMEM) { + for (n = 0; n < sizeof (buf) && buf[n] != '\0'; n++) {} + spl_bt_write(fd, " (in "); + spl_bt_write_n(fd, buf, n); + + /* Offset within file */ + spl_bt_write(fd, " +0x"); + n = spl_bt_u64_to_hex_str(v, 2, buf, sizeof (buf)); + spl_bt_write_n(fd, buf, n); + spl_bt_write(fd, ")"); + } #endif spl_bt_write(fd, "\n"); - } + } while (unw_step(&cp) > 0); } #elif defined(HAVE_BACKTRACE) #include From ace2e17a9b6db4581ce70405a8a38e164f16fe30 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sun, 20 Oct 2024 12:39:05 -0400 Subject: [PATCH 16/47] zfs_debug: Restore log size limit for userspace For some reason it was dropped when split from kernel, that makes raidz_test to accumulate in RAM up to 100GB of logs we don't need. Reviewed-by: Brian Behlendorf Reviewed-by: Igor Kozhukhov Reviewed-by: Rob Norris Reviewed-by: Tino Reichardt Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16492 Closes #16566 Closes #16664 --- lib/libzpool/zfs_debug.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/lib/libzpool/zfs_debug.c b/lib/libzpool/zfs_debug.c index df49a9a33fe8..82c7229932f0 100644 --- a/lib/libzpool/zfs_debug.c +++ b/lib/libzpool/zfs_debug.c @@ -35,9 +35,25 @@ typedef struct zfs_dbgmsg { static list_t zfs_dbgmsgs; static kmutex_t zfs_dbgmsgs_lock; +static uint_t zfs_dbgmsg_size = 0; +static uint_t zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ int zfs_dbgmsg_enable = B_TRUE; +static void +zfs_dbgmsg_purge(uint_t max_size) +{ + while (zfs_dbgmsg_size > max_size) { + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + uint_t size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + void zfs_dbgmsg_init(void) { @@ -74,6 +90,8 @@ __zfs_dbgmsg(char *buf) mutex_enter(&zfs_dbgmsgs_lock); list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(zfs_dbgmsg_maxsize); mutex_exit(&zfs_dbgmsgs_lock); } From 2b359c7824ea538128a084836510c37ec8e99e72 Mon Sep 17 00:00:00 2001 From: Tomohiro Kusumi Date: Mon, 21 Oct 2024 01:43:16 +0900 Subject: [PATCH 17/47] Fix compile-time warnings caused by duplicate struct typedefs Some compiler/versions warn these typedefs according to #16660. The platform specific header sys/abd_os.h shouldn't define or use abd_t, as it's defined in its non-platform specific consumer sys/abd.h. Do the same as what FreeBSD header does. Reviewed-by: Brian Behlendorf Signed-off-by: Tomohiro Kusumi Closes #16660 Closes #16665 --- include/os/linux/spl/sys/taskq.h | 3 +-- include/os/linux/zfs/sys/abd_os.h | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index f63a397f293d..4cb3a055c3c4 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -38,8 +38,7 @@ #include #include #include - -typedef struct kstat_s kstat_t; +#include #define TASKQ_NAMELEN 31 diff --git a/include/os/linux/zfs/sys/abd_os.h b/include/os/linux/zfs/sys/abd_os.h index 606e8bf682e8..3eed968e90c0 100644 --- a/include/os/linux/zfs/sys/abd_os.h +++ b/include/os/linux/zfs/sys/abd_os.h @@ -30,6 +30,8 @@ extern "C" { #endif +struct abd; + struct abd_scatter { uint_t abd_offset; uint_t abd_nents; @@ -41,10 +43,8 @@ struct abd_linear { struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ }; -typedef struct abd abd_t; - typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); -int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, +int abd_iterate_page_func(struct abd *, size_t, size_t, abd_iter_page_func_t *, void *); /* @@ -52,11 +52,11 @@ int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, * Note: these are only needed to support vdev_classic. See comment in * vdev_disk.c. */ -unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +unsigned int abd_bio_map_off(struct bio *, struct abd *, unsigned int, size_t); +unsigned long abd_nr_pages_off(struct abd *, unsigned int, size_t); __attribute__((malloc)) -abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); +struct abd *abd_alloc_from_pages(struct page **, unsigned long, uint64_t); #ifdef __cplusplus } From 78d39d91fa7256dab53ca08ab8be6cd11412c31d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 21 Oct 2024 04:01:49 +1100 Subject: [PATCH 18/47] zdb: show bp in uberblock dump Just another useful nugget of info in times of strife. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #16667 --- cmd/zdb/zdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 16c7025802f3..42b82c08249f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -4266,6 +4266,10 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer) (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\tbp = %s\n", blkbuf); + (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { From e30c69365d42c077a501e674b1784be702d9458e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 21 Oct 2024 13:50:13 +1100 Subject: [PATCH 19/47] config: fix dequeue_signal check for kernels <4.20 Before 4.20, kernel_siginfo_t was just called siginfo_t. This was causing the kthread_dequeue_signal_3arg_task check, which uses kernel_siginfo_t, to fail on older kernels. In d6b8c17f1, we started checking for the "new" three-arg dequeue_signal() by testing for the "old" version. Because that test is explicitly using kernel_siginfo_t, it would fail, leading to the build trying to use the new three-arg version, which would then not compile. This commit fixes that by avoiding checking for the old 3-arg dequeue_signal entirely. Instead, we check for the new one, as well as the 4-arg form, and we use the old form as a fallback. This way, we never have to test for it explicitly, and once we're building HAVE_SIGINFO will make sure we get the right kernel_siginfo_t for it, so everything works out nice. Original-patch-by: Finix Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16666 --- config/kernel-kthread.m4 | 37 +++++++++++++++++++------------- module/os/linux/spl/spl-thread.c | 6 +++--- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/config/kernel-kthread.m4 b/config/kernel-kthread.m4 index 4d580efead6b..607953146323 100644 --- a/config/kernel-kthread.m4 +++ b/config/kernel-kthread.m4 @@ -17,14 +17,21 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_COMPLETE_AND_EXIT], [ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [ dnl # - dnl # 5.17 API: enum pid_type * as new 4th dequeue_signal() argument, - dnl # 5768d8906bc23d512b1a736c1e198aa833a6daa4 ("signal: Requeue signals in the appropriate queue") + dnl # prehistory: + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl # siginfo_t *info) dnl # - dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info); - dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); + dnl # 4.20: kernel_siginfo_t introduced, replaces siginfo_t + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl kernel_siginfo_t *info) dnl # - dnl # 6.12 API: first arg struct_task* removed - dnl # int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); + dnl # 5.17: enum pid_type introduced as 4th arg + dnl # int dequeue_signal(struct task_struct *task, sigset_t *mask, + dnl # kernel_siginfo_t *info, enum pid_type *type) + dnl # + dnl # 6.12: first arg struct_task* removed + dnl # int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, + dnl # enum pid_type *type) dnl # AC_MSG_CHECKING([whether dequeue_signal() takes 4 arguments]) ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_4arg], [ @@ -33,11 +40,11 @@ AC_DEFUN([ZFS_AC_KERNEL_KTHREAD_DEQUEUE_SIGNAL], [ [dequeue_signal() takes 4 arguments]) ], [ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether dequeue_signal() a task argument]) - ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_task], [ + AC_MSG_CHECKING([whether 3-arg dequeue_signal() takes a type argument]) + ZFS_LINUX_TEST_RESULT([kthread_dequeue_signal_3arg_type], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TASK, 1, - [dequeue_signal() takes a task argument]) + AC_DEFINE(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE, 1, + [3-arg dequeue_signal() takes a type argument]) ], [ AC_MSG_RESULT(no) ]) @@ -56,27 +63,27 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_COMPLETE_AND_EXIT], [ ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_KTHREAD_DEQUEUE_SIGNAL], [ - ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_task], [ + ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [ #include ], [ struct task_struct *task = NULL; sigset_t *mask = NULL; kernel_siginfo_t *info = NULL; + enum pid_type *type = NULL; int error __attribute__ ((unused)); - error = dequeue_signal(task, mask, info); + error = dequeue_signal(task, mask, info, type); ]) - ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_4arg], [ + ZFS_LINUX_TEST_SRC([kthread_dequeue_signal_3arg_type], [ #include ], [ - struct task_struct *task = NULL; sigset_t *mask = NULL; kernel_siginfo_t *info = NULL; enum pid_type *type = NULL; int error __attribute__ ((unused)); - error = dequeue_signal(task, mask, info, type); + error = dequeue_signal(mask, info, type); ]) ]) diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index 7f74d44f91ff..7b0ce30c7884 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -171,11 +171,11 @@ issig(void) #if defined(HAVE_DEQUEUE_SIGNAL_4ARG) enum pid_type __type; if (dequeue_signal(current, &set, &__info, &__type) != 0) { -#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TASK) - if (dequeue_signal(current, &set, &__info) != 0) { -#else +#elif defined(HAVE_DEQUEUE_SIGNAL_3ARG_TYPE) enum pid_type __type; if (dequeue_signal(&set, &__info, &__type) != 0) { +#else + if (dequeue_signal(current, &set, &__info) != 0) { #endif spin_unlock_irq(¤t->sighand->siglock); kernel_signal_stop(); From ede715d1e426ea11b410bc64bc8ceca8f4b1e1f1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 22 Oct 2024 05:38:30 +1100 Subject: [PATCH 20/47] spl/thread: explicitly define thread_func_t as noreturn All of our thread entry functions have this signature: void (*)(void*) __attribute__((noreturn)) The low-level `__thread_create()` function accepts a `thread_func_t` as the entry point, which is defined more simply as: void (*)(void *) And then the `thread_create()` and `thread_create_named()` macros cast the passed-in function point down to `thread_func_t`, that is, casting away the `noreturn` attribute. Clang considers casting between these two types to be invalid because both the caller and the callee may have elided parts of the stack frame save and restore, knowing that they won't be needed. Recent Linux appears to be setting `-Wcast-function-type-strict`, which causes this invalid cast to emit a warning, which with `-Werror` is converted to an error, breaking the build. This commit fixes this in the simplest possible way: adding `noreturn` to the `thread_func_t` attribute. Since all our thread entry functions already have this attribute, it's arguably a just a consistency fix anyway. I considered removing the casts in the macros, which silences the warnings, but it turns out that Clang has a bug that won't emit this error for implicit conversions, only explicit casts. So leaving them there seems like a reasonable belt-and-suspenders approach. Also, frankly, this whole mechanism seems a little undercooked inside LLVM, so I'm content go with my intuition about the smallest, least invaisve change. **NOTE**: `__thread_create` is exported by `spl.ko` and has a `thread_func_t` arg, so this is an ABI break. Whether that matters in practice, I have no idea. Further reading: - https://github.com/llvm/llvm-project/commit/1aad641c793090b4d036c03e737df2ebe2c32c57 - https://github.com/llvm/llvm-project/issues/7325 - https://github.com/llvm/llvm-project/issues/41465 Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #16672 Closes #16673 --- include/os/linux/spl/sys/thread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h index 4f7f659e528d..c7ef7efa0a25 100644 --- a/include/os/linux/spl/sys/thread.h +++ b/include/os/linux/spl/sys/thread.h @@ -42,7 +42,7 @@ #define TS_ZOMB EXIT_ZOMBIE #define TS_STOPPED TASK_STOPPED -typedef void (*thread_func_t)(void *); +typedef void (*thread_func_t)(void *) __attribute__((noreturn)); #define thread_create_named(name, stk, stksize, func, arg, len, \ pp, state, pri) \ From 5237760b17cae246b8275a6ee388560dadf89f3b Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 21 Oct 2024 14:46:20 -0400 Subject: [PATCH 21/47] ZTS: Add additional exceptions The following tests have been observed to occasionally fail when running under the CI. Updated our exceptions list to track them. Signed-off-by: Brian Behlendorf Closes #16670 --- tests/test-runner/bin/zts-report.py.in | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 2562836213af..07ec2c4b601b 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -213,6 +213,7 @@ maybe = { 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145], + 'cli_root/zpool_import/import_devices_missing': ['FAIL', 16669], 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839], 'cli_root/zpool_initialize/zpool_initialize_import_export': ['FAIL', 11948], @@ -275,7 +276,8 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], - 'snapshot/snapshot_002_pos': ['FAIL', '14831'], + 'snapshot/snapshot_002_pos': ['FAIL', 14831], + 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', 16668], 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], 'bclone/bclone_crossfs_corner_cases_limited': ['SKIP', cfr_cross_reason], From 77d81974b608b7fdcc5185f34dc74e9c7a3df24d Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 24 Oct 2024 23:05:45 +0200 Subject: [PATCH 22/47] Fix dependency install on Debian 11 (#16683) Adding cryptsetup breaks some dialog things on Debian 11. Apply some workaround for it. Signed-off-by: Tino Reichardt Reviewed-by: George Melikov Reviewed-by: Tony Hutter --- .github/workflows/scripts/qemu-3-deps.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index a2fb5e38249a..4c6227b88ed0 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -111,6 +111,7 @@ case "$1" in archlinux ;; debian*) + echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections debian echo "##[group]Install Debian specific" sudo apt-get install -yq linux-perf dh-sequence-dkms From 7e3ce4efaa15903d2aa6e6b014a935444dbb3f11 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 25 Oct 2024 12:03:37 -0400 Subject: [PATCH 23/47] Pack dmu_buf_impl_t by 16 bytes On 64bit FreeBSD this reduces one from 296 to 280 bytes. On small block workloads dbufs may consume gigabytes of ARC, and this saves 5% of it. Reviewed-by: Tino Reichardt Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16684 --- include/sys/dbuf.h | 53 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 56741cd2a58b..c9bfb9a8026c 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -264,6 +264,27 @@ typedef struct dmu_buf_impl { */ uint8_t db_level; + /* This block was freed while a read or write was active. */ + uint8_t db_freed_in_flight; + + /* + * Evict user data as soon as the dirty and reference counts are equal. + */ + uint8_t db_user_immediate_evict; + + /* + * dnode_evict_dbufs() or dnode_evict_bonus() tried to evict this dbuf, + * but couldn't due to outstanding references. Evict once the refcount + * drops to 0. + */ + uint8_t db_pending_evict; + + /* Number of TXGs in which this buffer is dirty. */ + uint8_t db_dirtycnt; + + /* The buffer was partially read. More reads may follow. */ + uint8_t db_partial_read; + /* * Protects db_buf's contents if they contain an indirect block or data * block of the meta-dnode. We use this lock to protect the structure of @@ -288,6 +309,9 @@ typedef struct dmu_buf_impl { */ dbuf_states_t db_state; + /* In which dbuf cache this dbuf is, if any. */ + dbuf_cached_state_t db_caching_status; + /* * Refcount accessed by dmu_buf_{hold,rele}. * If nonzero, the buffer can't be destroyed. @@ -304,39 +328,10 @@ typedef struct dmu_buf_impl { /* Link in dbuf_cache or dbuf_metadata_cache */ multilist_node_t db_cache_link; - /* Tells us which dbuf cache this dbuf is in, if any */ - dbuf_cached_state_t db_caching_status; - uint64_t db_hash; - /* Data which is unique to data (leaf) blocks: */ - /* User callback information. */ dmu_buf_user_t *db_user; - - /* - * Evict user data as soon as the dirty and reference - * counts are equal. - */ - uint8_t db_user_immediate_evict; - - /* - * This block was freed while a read or write was - * active. - */ - uint8_t db_freed_in_flight; - - /* - * dnode_evict_dbufs() or dnode_evict_bonus() tried to - * evict this dbuf, but couldn't due to outstanding - * references. Evict once the refcount drops to 0. - */ - uint8_t db_pending_evict; - - uint8_t db_dirtycnt; - - /* The buffer was partially read. More reads may follow. */ - uint8_t db_partial_read; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ From f7b4bca66a1e8382bef314ec8ab14c99e3f54f84 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 25 Oct 2024 09:07:44 -0700 Subject: [PATCH 24/47] ZTS: Add LUKS sanity test Add a LUKS sanity test to trigger: #16631 Reviewed-by: Tino Reichardt Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Tony Hutter Closes #16681 --- .github/workflows/scripts/qemu-3-deps.sh | 35 ++++---- tests/runfiles/linux.run | 6 ++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 3 +- .../tests/functional/luks/luks_sanity.ksh | 90 +++++++++++++++++++ 5 files changed, 117 insertions(+), 18 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/luks/luks_sanity.ksh diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index 4c6227b88ed0..bf06c1892ceb 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -13,10 +13,10 @@ function archlinux() { echo "##[endgroup]" echo "##[group]Install Development Tools" - sudo pacman -Sy --noconfirm base-devel bc cpio dhclient dkms fakeroot \ - fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils parted \ - pax perf python-packaging python-setuptools qemu-guest-agent ksh samba \ - sysstat rng-tools rsync wget xxhash + sudo pacman -Sy --noconfirm base-devel bc cpio cryptsetup dhclient dkms \ + fakeroot fio gdb inetutils jq less linux linux-headers lsscsi nfs-utils \ + parted pax perf python-packaging python-setuptools qemu-guest-agent ksh \ + samba sysstat rng-tools rsync wget xxhash echo "##[endgroup]" } @@ -30,11 +30,11 @@ function debian() { echo "##[group]Install Development Tools" sudo apt-get install -y \ - acl alien attr autoconf bc cpio curl dbench dh-python dkms fakeroot \ - fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev libaio-dev \ - libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev libelf-dev \ - libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev libtool \ - libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ + acl alien attr autoconf bc cpio cryptsetup curl dbench dh-python dkms \ + fakeroot fio gdb gdebi git ksh lcov isc-dhcp-client jq libacl1-dev \ + libaio-dev libattr1-dev libblkid-dev libcurl4-openssl-dev libdevmapper-dev \ + libelf-dev libffi-dev libmount-dev libpam0g-dev libselinux-dev libssl-dev \ + libtool libtool-bin libudev-dev libunwind-dev linux-headers-$(uname -r) \ lsscsi nfs-kernel-server pamtester parted python3 python3-all-dev \ python3-cffi python3-dev python3-distlib python3-packaging \ python3-setuptools python3-sphinx qemu-guest-agent rng-tools rpm2cpio \ @@ -68,14 +68,15 @@ function rhel() { echo "##[group]Install Development Tools" sudo dnf group install -y "Development Tools" sudo dnf install -y \ - acl attr bc bzip2 curl dbench dkms elfutils-libelf-devel fio gdb git \ - jq kernel-rpm-macros ksh libacl-devel libaio-devel libargon2-devel \ - libattr-devel libblkid-devel libcurl-devel libffi-devel ncompress \ - libselinux-devel libtirpc-devel libtool libudev-devel libuuid-devel \ - lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester parted perf \ - python3 python3-cffi python3-devel python3-packaging kernel-devel \ - python3-setuptools qemu-guest-agent rng-tools rpcgen rpm-build rsync \ - samba sysstat systemd watchdog wget xfsprogs-devel xxhash zlib-devel + acl attr bc bzip2 cryptsetup curl dbench dkms elfutils-libelf-devel fio \ + gdb git jq kernel-rpm-macros ksh libacl-devel libaio-devel \ + libargon2-devel libattr-devel libblkid-devel libcurl-devel libffi-devel \ + ncompress libselinux-devel libtirpc-devel libtool libudev-devel \ + libuuid-devel lsscsi mdadm nfs-utils openssl-devel pam-devel pamtester \ + parted perf python3 python3-cffi python3-devel python3-packaging \ + kernel-devel python3-setuptools qemu-guest-agent rng-tools rpcgen \ + rpm-build rsync samba sysstat systemd watchdog wget xfsprogs-devel xxhash \ + zlib-devel echo "##[endgroup]" } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5534cd27f637..76d07a6cc9c1 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -147,6 +147,12 @@ tags = ['functional', 'largest_pool'] tests = ['longname_001_pos', 'longname_002_pos', 'longname_003_pos'] tags = ['functional', 'longname'] +[tests/functional/luks:Linux] +pre = +post = +tests = ['luks_sanity'] +tags = ['functional', 'luks'] + [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index be41ce5210e8..5985b5fe1526 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -129,6 +129,7 @@ export SYSTEM_FILES_LINUX='attr blkdiscard blockdev chattr + cryptsetup exportfs fallocate flock diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bc767b9f624f..7d1551a63f0d 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -80,7 +80,8 @@ if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ - functional/tmpfile/setup.ksh + functional/tmpfile/setup.ksh \ + functional/luks/luks_sanity.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ diff --git a/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh new file mode 100755 index 000000000000..9cee26503de7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/luks/luks_sanity.ksh @@ -0,0 +1,90 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# Use is subject to license terms. +# + +# DESCRIPTION: +# Verify ZFS works on a LUKS-backed pool +# +# STRATEGY: +# 1. Create a LUKS device +# 2. Make a pool with it +# 3. Write files to the pool +# 4. Verify no errors + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +VDEV=$(mktemp --suffix=luks_sanity) +TESTPOOL=testpool + +function cleanup +{ + log_must zpool destroy $TESTPOOL + + log_must cryptsetup luksClose /dev/mapper/luksdev + log_must rm -f $VDEV +} + +log_assert "Verify ZFS on LUKS works" +log_onexit cleanup + +PASS="fdsjfosdijfsdkjsldfjdlk" + +# Make a small LUKS device since LUKS formatting takes time and we want to +# make this test run as quickly as possible. +truncate -s 100M $VDEV + +log_must cryptsetup luksFormat --type luks2 $VDEV <<< $PASS +log_must cryptsetup luksOpen $VDEV luksdev <<< $PASS + +log_must zpool create $TESTPOOL /dev/mapper/luksdev + +CPUS=$(get_num_cpus) + +# Use these specific size and offset ranges as they often cause errors with +# https://github.com/openzfs/zfs/issues/16631 +# and we want to try to test for that. +for SIZE in {70..100} ; do + for OFF in {70..100} ; do + for i in {1..$CPUS} ; do + dd if=/dev/urandom of=/$TESTPOOL/file$i-bs$SIZE-off$OFF \ + seek=$OFF bs=$SIZE count=1 &>/dev/null & + done + wait + done + sync_pool $TESTPOOL + rm -f /$TESTPOOL/file* +done + +# Verify no read/write/checksum errors. Don't use JSON here so that we could +# could potentially backport this test case to the 2.2.x branch. +if zpool status -e | grep -q "luksdev" ; then + log_note "$(zpool status -v)" + log_fail "Saw errors writing to LUKS device" +fi + +log_pass "Verified ZFS on LUKS works" From fd2cae969ff83841ae35a5bf5e7b82b8a3a5f80b Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 29 Oct 2024 19:49:54 +0100 Subject: [PATCH 25/47] Fix gcc unused value warning in FreeBSD simd.h The macros `simd_stat_init()` and `simd_stat_fini()` in FreeBSD's `simd.h` are defined as zero, but they are actually only used as statements. Replace the definitions with `do {} while (0)` instead, to avoid gcc `-Wunused-value` warnings. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Toomas Soome Signed-off-by: Dimitry Andric Closes #16693 --- include/os/freebsd/spl/sys/simd.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h index 6bc46755c4e3..d16e1db5e826 100644 --- a/include/os/freebsd/spl/sys/simd.h +++ b/include/os/freebsd/spl/sys/simd.h @@ -50,7 +50,7 @@ #define kfpu_fini() do {} while (0) #endif -#define simd_stat_init() 0 -#define simd_stat_fini() 0 +#define simd_stat_init() do {} while (0) +#define simd_stat_fini() do {} while (0) #endif From f3823a9ab2d1d8dfbf5f0b8df64a47317bbe7fda Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 29 Oct 2024 20:05:02 +0100 Subject: [PATCH 26/47] Fix gcc uninitialized warning in FreeBSD zio_crypt.c In FreeBSD's `zio_do_crypt_data()`, ensure that two `struct uio` variables are cleared before copying data out of them. This avoids accessing garbage data, and fixes gcc `-Wuninitialized` warnings. Reviewed-by: Brian Behlendorf Reviewed-by: Toomas Soome Reviewed-by: Alexander Motin Signed-off-by: Dimitry Andric Closes #16688 --- module/os/freebsd/zfs/zio_crypt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 2b62abcccb78..feaca93fb933 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1686,11 +1686,10 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; - + memset(&puio_s, 0, sizeof (puio_s)); + memset(&cuio_s, 0, sizeof (cuio_s)); zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); - memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio)); - memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", From bbc0d34bfd2b6714d96c98d3fd252431e03c6148 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 29 Oct 2024 15:23:24 -0400 Subject: [PATCH 27/47] On the first vdev open ignore impossible ashift hints If on the first open device's logical ashift is bigger than set by pool's ashift property, ignore the last as unusable instead of creating vdev that will fail most of I/Os due to misalignment. Reviewed-by: Rob Norris Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16690 --- module/zfs/vdev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index bcab46c63bfa..983f444d79b0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2205,10 +2205,11 @@ vdev_open(vdev_t *vd) vd->vdev_max_asize = max_asize; /* - * If the vdev_ashift was not overridden at creation time, + * If the vdev_ashift was not overridden at creation time + * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. */ - if (vd->vdev_ashift == 0) { + if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { From 8ac70aade7aff78b77d682d863b96376ffb4cc1e Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Wed, 30 Oct 2024 17:11:40 -0700 Subject: [PATCH 28/47] Add warning for external consumers of dmu_tx_callback_register While reading some code @grwilson came across the above function that seemingly had no consumers besides a ztest callback that ensures that the tx_callback infrastructure works correctly. It turns out that Lustre is the main (and potentially the only) consumer of this. Refer to `osd_trans_commit_cb` of `lustre/osd-zfs/osd_handler.c` in the Lustre repo for more info. Let's add a comment highlighting this before someone removes it by mistake. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Signed-off-by: Serapheim Dimitropoulos Closes #16698 --- module/zfs/dmu_tx.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3fdcebdff918..6aee7afb6954 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1377,6 +1377,13 @@ dmu_tx_pool(dmu_tx_t *tx) return (tx->tx_pool); } +/* + * Register a callback to be executed at the end of a TXG. + * + * Note: This currently exists for outside consumers, specifically the ZFS OSD + * for Lustre. Please do not remove before checking that project. For examples + * on how to use this see `ztest_commit_callback`. + */ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { From 19a8dd48e1ae9759f66e9204f9985d1abe2a68b0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 25 Oct 2024 16:14:37 +1100 Subject: [PATCH 29/47] vdev_disk: try harder to ensure IO alignment rules It seems out our notion of "properly" aligned IO was incomplete. In particular, dm-crypt does its own splitting, and assumes that a logical block will never cross an order-0 page boundary (ie, the physical page size, not compound size). This effectively means that it needs to be possible to split a BIO at any page or block size boundary and have it work correctly. This updates the alignment check function to enforce these rules (to the extent possible). Our response to misaligned data is to make some new allocation that is properly aligned, and copy the data into it. It turns out that linearising (via abd_borrow_buf()) is not enough, because we allocate eg 4K blocks from a general purpose slab, and so may receive (or already have) a 4K block that crosses pages. So instead, we allocate a new ABD, which is guaranteed to be aligned properly to block sizes, and then copy everything into it, and back out on the way back. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Closes #16687 #16631 #15646 #15533 #14533 --- module/os/linux/zfs/vdev_disk.c | 120 ++++---- .../functional/vdev_disk/page_alignment.c | 264 +++++++++++------- 2 files changed, 230 insertions(+), 154 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a6271d3a7df1..7a4944410906 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -805,14 +805,11 @@ vbio_completion(struct bio *bio) * to the ADB, with changes if appropriate. */ if (vbio->vbio_abd != NULL) { - void *buf = abd_to_buf(vbio->vbio_abd); + if (zio->io_type == ZIO_TYPE_READ) + abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size); + abd_free(vbio->vbio_abd); vbio->vbio_abd = NULL; - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); } /* Final cleanup */ @@ -834,38 +831,61 @@ vbio_completion(struct bio *bio) * NOTE: if you change this function, change the copy in * tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c, and add test * data there to validate the change you're making. - * */ typedef struct { - uint_t bmask; - uint_t npages; - uint_t end; -} vdev_disk_check_pages_t; + size_t blocksize; + int seen_first; + int seen_last; +} vdev_disk_check_alignment_t; static int -vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len, + void *priv) { (void) page; - vdev_disk_check_pages_t *s = priv; + vdev_disk_check_alignment_t *s = priv; /* - * If we didn't finish on a block size boundary last time, then there - * would be a gap if we tried to use this ABD as-is, so abort. + * The cardinal rule: a single on-disk block must never cross an + * physical (order-0) page boundary, as the kernel expects to be able + * to split at both LBS and page boundaries. + * + * This implies various alignment rules for the blocks in this + * (possibly compound) page, which we can check for. */ - if (s->end != 0) - return (1); /* - * Note if we're taking less than a full block, so we can check it - * above on the next call. + * If the previous page did not end on a page boundary, then we + * can't proceed without creating a hole. */ - s->end = (off+len) & s->bmask; + if (s->seen_last) + return (1); - /* All blocks after the first must start on a block size boundary. */ - if (s->npages != 0 && (off & s->bmask) != 0) + /* This page must contain only whole LBS-sized blocks. */ + if (!IS_P2ALIGNED(len, s->blocksize)) return (1); - s->npages++; + /* + * If this is not the first page in the ABD, then the data must start + * on a page-aligned boundary (so the kernel can split on page + * boundaries without having to deal with a hole). If it is, then + * it can start on LBS-alignment. + */ + if (s->seen_first) { + if (!IS_P2ALIGNED(off, PAGESIZE)) + return (1); + } else { + if (!IS_P2ALIGNED(off, s->blocksize)) + return (1); + s->seen_first = 1; + } + + /* + * If this data does not end on a page-aligned boundary, then this + * must be the last page in the ABD, for the same reason. + */ + s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE); + return (0); } @@ -874,15 +894,14 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * the number of pages, or 0 if it can't be submitted like this. */ static boolean_t -vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +vdev_disk_check_alignment(abd_t *abd, uint64_t size, struct block_device *bdev) { - vdev_disk_check_pages_t s = { - .bmask = bdev_logical_block_size(bdev)-1, - .npages = 0, - .end = 0, + vdev_disk_check_alignment_t s = { + .blocksize = bdev_logical_block_size(bdev), }; - if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + if (abd_iterate_page_func(abd, 0, size, + vdev_disk_check_alignment_cb, &s)) return (B_FALSE); return (B_TRUE); @@ -916,37 +935,32 @@ vdev_disk_io_rw(zio_t *zio) /* * Check alignment of the incoming ABD. If any part of it would require - * submitting a page that is not aligned to the logical block size, - * then we take a copy into a linear buffer and submit that instead. - * This should be impossible on a 512b LBS, and fairly rare on 4K, - * usually requiring abnormally-small data blocks (eg gang blocks) - * mixed into the same ABD as larger ones (eg aggregated). + * submitting a page that is not aligned to both the logical block size + * and the page size, then we take a copy into a new memory region with + * correct alignment. This should be impossible on a 512b LBS. On + * larger blocks, this can happen at least when a small number of + * blocks (usually 1) are allocated from a shared slab, or when + * abnormally-small data regions (eg gang headers) are mixed into the + * same ABD as larger allocations (eg aggregations). */ abd_t *abd = zio->io_abd; - if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { - void *buf; - if (zio->io_type == ZIO_TYPE_READ) - buf = abd_borrow_buf(zio->io_abd, zio->io_size); - else - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + if (!vdev_disk_check_alignment(abd, zio->io_size, bdev)) { + /* Allocate a new memory region with guaranteed alignment */ + abd = abd_alloc_for_io(zio->io_size, + zio->io_abd->abd_flags & ABD_FLAG_META); - /* - * Wrap the copy in an abd_t, so we can use the same iterators - * to count and fill the vbio later. - */ - abd = abd_get_from_buf(buf, zio->io_size); + /* If we're writing copy our data into it */ + if (zio->io_type == ZIO_TYPE_WRITE) + abd_copy(abd, zio->io_abd, zio->io_size); /* - * False here would mean the borrowed copy has an invalid - * alignment too, which would mean we've somehow been passed a - * linear ABD with an interior page that has a non-zero offset - * or a size not a multiple of PAGE_SIZE. This is not possible. - * It would mean either zio_buf_alloc() or its underlying - * allocators have done something extremely strange, or our - * math in vdev_disk_check_pages() is wrong. In either case, + * False here would mean the new allocation has an invalid + * alignment too, which would mean that abd_alloc() is not + * guaranteeing this, or our logic in + * vdev_disk_check_alignment() is wrong. In either case, * something in seriously wrong and its not safe to continue. */ - VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + VERIFY(vdev_disk_check_alignment(abd, zio->io_size, bdev)); } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ diff --git a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c index 5c6d28eb2c44..7b926da6c01c 100644 --- a/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c +++ b/tests/zfs-tests/tests/functional/vdev_disk/page_alignment.c @@ -30,7 +30,7 @@ /* * This tests the vdev_disk page alignment check callback - * vdev_disk_check_pages_cb(). For now, this test includes a copy of that + * vdev_disk_check_alignment_cb(). For now, this test includes a copy of that * function from module/os/linux/zfs/vdev_disk.c. If you change it here, * remember to change it there too, and add tests data here to validate the * change you're making. @@ -38,36 +38,69 @@ struct page; +/* + * This is spl_pagesize() in userspace, which requires linking libspl, but + * would also then use the platform page size, which isn't what we want for + * a test. To keep the check callback the same as the real one, we just + * redefine it. + */ +#undef PAGESIZE +#define PAGESIZE (4096) + typedef struct { - uint32_t bmask; - uint32_t npages; - uint32_t end; -} vdev_disk_check_pages_t; + size_t blocksize; + int seen_first; + int seen_last; +} vdev_disk_check_alignment_t; static int -vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +vdev_disk_check_alignment_cb(struct page *page, size_t off, size_t len, + void *priv) { (void) page; - vdev_disk_check_pages_t *s = priv; + vdev_disk_check_alignment_t *s = priv; /* - * If we didn't finish on a block size boundary last time, then there - * would be a gap if we tried to use this ABD as-is, so abort. + * The cardinal rule: a single on-disk block must never cross an + * physical (order-0) page boundary, as the kernel expects to be able + * to split at both LBS and page boundaries. + * + * This implies various alignment rules for the blocks in this + * (possibly compound) page, which we can check for. */ - if (s->end != 0) - return (1); /* - * Note if we're taking less than a full block, so we can check it - * above on the next call. + * If the previous page did not end on a page boundary, then we + * can't proceed without creating a hole. */ - s->end = (off+len) & s->bmask; + if (s->seen_last) + return (1); - /* All blocks after the first must start on a block size boundary. */ - if (s->npages != 0 && (off & s->bmask) != 0) + /* This page must contain only whole LBS-sized blocks. */ + if (!IS_P2ALIGNED(len, s->blocksize)) return (1); - s->npages++; + /* + * If this is not the first page in the ABD, then the data must start + * on a page-aligned boundary (so the kernel can split on page + * boundaries without having to deal with a hole). If it is, then + * it can start on LBS-alignment. + */ + if (s->seen_first) { + if (!IS_P2ALIGNED(off, PAGESIZE)) + return (1); + } else { + if (!IS_P2ALIGNED(off, s->blocksize)) + return (1); + s->seen_first = 1; + } + + /* + * If this data does not end on a page-aligned boundary, then this + * must be the last page in the ABD, for the same reason. + */ + s->seen_last = !IS_P2ALIGNED(off+len, PAGESIZE); + return (0); } @@ -75,8 +108,8 @@ typedef struct { /* test name */ const char *name; - /* blocks size mask */ - uint32_t mask; + /* stored block size */ + uint32_t blocksize; /* amount of data to take */ size_t size; @@ -89,39 +122,39 @@ static const page_test_t valid_tests[] = { /* 512B block tests */ { "512B blocks, 4K single page", - 0x1ff, 0x1000, { + 512, 0x1000, { { 0x0, 0x1000 }, }, }, { "512B blocks, 1K at start of page", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0, 0x1000 }, }, }, { "512B blocks, 1K at end of page", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0c00, 0x0400 }, }, }, { "512B blocks, 1K within page, 512B start offset", - 0x1ff, 0x400, { + 512, 0x400, { { 0x0200, 0x0e00 }, }, }, { "512B blocks, 8K across 2x4K pages", - 0x1ff, 0x2000, { + 512, 0x2000, { { 0x0, 0x1000 }, { 0x0, 0x1000 }, }, }, { "512B blocks, 4K across two pages, 2K start offset", - 0x1ff, 0x1000, { + 512, 0x1000, { { 0x0800, 0x0800 }, { 0x0, 0x0800 }, }, }, { "512B blocks, 16K across 5x4K pages, 512B start offset", - 0x1ff, 0x4000, { + 512, 0x4000, { { 0x0200, 0x0e00 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -130,7 +163,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 8x8K compound pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x2000 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -142,7 +175,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 9x8K compound pages, 512B start offset", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0200, 0x1e00 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -155,7 +188,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, 2x16K compound pages, 8x4K pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x8000 }, { 0x0, 0x8000 }, { 0x0, 0x1000 }, @@ -169,7 +202,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, mixed 4K/8K/16K pages", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0, 0x1000 }, { 0x0, 0x2000 }, { 0x0, 0x1000 }, @@ -183,7 +216,7 @@ static const page_test_t valid_tests[] = { }, }, { "512B blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", - 0x1ff, 0x10000, { + 512, 0x10000, { { 0x0400, 0x0c00 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -200,48 +233,18 @@ static const page_test_t valid_tests[] = { /* 4K block tests */ { "4K blocks, 4K single page", - 0xfff, 0x1000, { - { 0x0, 0x1000 }, - }, - }, { - "4K blocks, 1K at start of page", - 0xfff, 0x400, { + 4096, 0x1000, { { 0x0, 0x1000 }, }, - }, { - "4K blocks, 1K at end of page", - 0xfff, 0x400, { - { 0x0c00, 0x0400 }, - }, - }, { - "4K blocks, 1K within page, 512B start offset", - 0xfff, 0x400, { - { 0x0200, 0x0e00 }, - }, }, { "4K blocks, 8K across 2x4K pages", - 0xfff, 0x2000, { + 4096, 0x2000, { { 0x0, 0x1000 }, { 0x0, 0x1000 }, }, - }, { - "4K blocks, 4K across two pages, 2K start offset", - 0xfff, 0x1000, { - { 0x0800, 0x0800 }, - { 0x0, 0x0800 }, - }, - }, { - "4K blocks, 16K across 5x4K pages, 512B start offset", - 0xfff, 0x4000, { - { 0x0200, 0x0e00 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x0200 }, - }, }, { "4K blocks, 64K data, 8x8K compound pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x2000 }, { 0x0, 0x2000 }, { 0x0, 0x2000 }, @@ -251,22 +254,9 @@ static const page_test_t valid_tests[] = { { 0x0, 0x2000 }, { 0x0, 0x2000 }, }, - }, { - "4K blocks, 64K data, 9x8K compound pages, 512B start offset", - 0xfff, 0x10000, { - { 0x0200, 0x1e00 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x0200 }, - }, }, { "4K blocks, 64K data, 2x16K compound pages, 8x4K pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x8000 }, { 0x0, 0x8000 }, { 0x0, 0x1000 }, @@ -280,7 +270,7 @@ static const page_test_t valid_tests[] = { }, }, { "4K blocks, 64K data, mixed 4K/8K/16K pages", - 0xfff, 0x10000, { + 4096, 0x10000, { { 0x0, 0x1000 }, { 0x0, 0x2000 }, { 0x0, 0x1000 }, @@ -292,29 +282,19 @@ static const page_test_t valid_tests[] = { { 0x0, 0x1000 }, { 0x0, 0x2000 }, }, - }, { - "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", - 0xfff, 0x10000, { - { 0x0400, 0x0c00 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x1000 }, - { 0x0, 0x2000 }, - { 0x0, 0x2000 }, - { 0x0, 0x1000 }, - { 0x0, 0x8000 }, - { 0x0, 0x1000 }, - { 0x0, 0x0400 }, - }, }, { 0 }, }; static const page_test_t invalid_tests[] = { + /* + * Gang tests. Composed of lots of smaller allocations, rarely properly + * aligned. + */ { "512B blocks, 16K data, 512 leader (gang block simulation)", - 0x1ff, 0x8000, { + 512, 0x8000, { { 0x0, 0x0200 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -324,7 +304,7 @@ static const page_test_t invalid_tests[] = { }, { "4K blocks, 32K data, 2 incompatible spans " "(gang abd simulation)", - 0xfff, 0x8000, { + 4096, 0x8000, { { 0x0800, 0x0800 }, { 0x0, 0x1000 }, { 0x0, 0x1000 }, @@ -337,6 +317,90 @@ static const page_test_t invalid_tests[] = { { 0x0, 0x0800 }, }, }, + + /* + * Blocks must not span multiple physical pages. These tests used to + * be considered valid, but were since found to be invalid and were + * moved here. + */ + { + "4K blocks, 4K across two pages, 2K start offset", + 4096, 0x1000, { + { 0x0800, 0x0800 }, + { 0x0, 0x0800 }, + }, + }, { + "4K blocks, 16K across 5x4K pages, 512B start offset", + 4096, 0x4000, { + { 0x0200, 0x0e00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, 9x8K compound pages, 512B start offset", + 4096, 0x10000, { + { 0x0200, 0x1e00 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x0200 }, + }, + }, { + "4K blocks, 64K data, mixed 4K/8K/16K pages, 1K start offset", + 4096, 0x10000, { + { 0x0400, 0x0c00 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x1000 }, + { 0x0, 0x2000 }, + { 0x0, 0x2000 }, + { 0x0, 0x1000 }, + { 0x0, 0x8000 }, + { 0x0, 0x1000 }, + { 0x0, 0x0400 }, + }, + }, + + /* + * This is the very typical case of a 4K block being allocated from + * the middle of a mixed-used slab backed by a higher-order compound + * page. + */ + { + "4K blocks, 4K data from compound slab, 2K-align offset", + 4096, 0x1000, { + { 0x1800, 0x6800 } + } + }, + + /* + * Blocks smaller than LBS should never be possible, but used to be by + * accident (see GH#16990). We test for and reject them just to be + * sure. + */ + { + "4K blocks, 1K at end of page", + 4096, 0x400, { + { 0x0c00, 0x0400 }, + }, + }, { + "4K blocks, 1K at start of page", + 4096, 0x400, { + { 0x0, 0x1000 }, + }, + }, { + "4K blocks, 1K within page, 512B start offset", + 4096, 0x400, { + { 0x0200, 0x0e00 }, + }, + }, + { 0 }, }; @@ -345,10 +409,8 @@ run_test(const page_test_t *test, bool verbose) { size_t rem = test->size; - vdev_disk_check_pages_t s = { - .bmask = 0xfff, - .npages = 0, - .end = 0, + vdev_disk_check_alignment_t s = { + .blocksize = test->blocksize, }; for (int i = 0; test->pages[i][1] > 0; i++) { @@ -362,7 +424,7 @@ run_test(const page_test_t *test, bool verbose) "rem %lx, take %lx\n", i, off, len, rem, take); - if (vdev_disk_check_pages_cb(NULL, off, take, &s)) { + if (vdev_disk_check_alignment_cb(NULL, off, take, &s)) { if (verbose) printf(" ABORT: misalignment detected, " "rem %lx\n", rem); @@ -389,7 +451,7 @@ run_test_set(const page_test_t *tests, bool want, int *ntests, int *npassed) for (const page_test_t *test = &tests[0]; test->name; test++) { bool pass = (run_test(test, false) == want); if (pass) { - printf("%s: PASS\n", test->name); + printf("%c %s: PASS\n", want ? '+' : '-', test->name); (*npassed)++; } else { printf("%s: FAIL [expected %s, got %s]\n", test->name, From 86b5853cfbf66af1afbef63a2dfbea488e8c9a0b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 25 Oct 2024 17:28:20 +1100 Subject: [PATCH 30/47] vdev_disk: move abd return and free off the interrupt handler Freeing an ABD can take sleeping locks to update various stats. We aren't allowed to sleep on an interrupt handler. So, move the free off to the io_done callback. We should never have been freeing things in the interrupt handler, but we got away with it because we were usually freeing a linear ABD, which at most is returning two objects to a cache and never sleeping. Scatter ABDs can be used now, and those have more complex locking. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Signed-off-by: Rob Norris Closes #16687 --- module/os/linux/zfs/vdev_disk.c | 40 ++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 7a4944410906..6a66a72b91a9 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -801,21 +801,13 @@ vbio_completion(struct bio *bio) bio_put(bio); /* - * If we copied the ABD before issuing it, clean up and return the copy - * to the ADB, with changes if appropriate. + * We're likely in an interrupt context so we can't do ABD/memory work + * here; instead we stash vbio on the zio and take care of it in the + * done callback. */ - if (vbio->vbio_abd != NULL) { - if (zio->io_type == ZIO_TYPE_READ) - abd_copy(zio->io_abd, vbio->vbio_abd, zio->io_size); + ASSERT3P(zio->io_bio, ==, NULL); + zio->io_bio = vbio; - abd_free(vbio->vbio_abd); - vbio->vbio_abd = NULL; - } - - /* Final cleanup */ - kmem_free(vbio, sizeof (vbio_t)); - - /* All done, submit for processing */ zio_delay_interrupt(zio); } @@ -1451,6 +1443,28 @@ vdev_disk_io_start(zio_t *zio) static void vdev_disk_io_done(zio_t *zio) { + /* If this was a read or write, we need to clean up the vbio */ + if (zio->io_bio != NULL) { + vbio_t *vbio = zio->io_bio; + zio->io_bio = NULL; + + /* + * If we copied the ABD before issuing it, clean up and return + * the copy to the ADB, with changes if appropriate. + */ + if (vbio->vbio_abd != NULL) { + if (zio->io_type == ZIO_TYPE_READ) + abd_copy(zio->io_abd, vbio->vbio_abd, + zio->io_size); + + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + } + + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); + } + /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous From 903d3f9187c2f6e3c0b3db620f475111a067267b Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 31 Oct 2024 22:32:31 -0400 Subject: [PATCH 31/47] Added output to `zpool online` and `offline` I was surprised to discover today that `zpool online` and `zpool offline` don't print any information about why they failed in many cases, they just return 1 with no information about why. Let's improve that where we can without changing the library function. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: Allan Jude Signed-off-by: Rich Ercolani Closes #16244 --- cmd/zpool/zpool_main.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index ea180f6b705e..6a45a063d91a 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7966,8 +7966,11 @@ zpool_do_online(int argc, char **argv) poolname = argv[0]; - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("failed to open pool " + "\"%s\""), poolname); return (1); + } for (i = 1; i < argc; i++) { vdev_state_t oldstate; @@ -7988,12 +7991,15 @@ zpool_do_online(int argc, char **argv) &l2cache, NULL); if (tgt == NULL) { ret = 1; + (void) fprintf(stderr, gettext("couldn't find device " + "\"%s\" in pool \"%s\"\n"), argv[i], poolname); continue; } uint_t vsc; oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; - if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { + if ((rc = zpool_vdev_online(zhp, argv[i], flags, + &newstate)) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), @@ -8019,6 +8025,9 @@ zpool_do_online(int argc, char **argv) } } } else { + (void) fprintf(stderr, gettext("Failed to online " + "\"%s\" in pool \"%s\": %d\n"), + argv[i], poolname, rc); ret = 1; } } @@ -8103,8 +8112,11 @@ zpool_do_offline(int argc, char **argv) poolname = argv[0]; - if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("failed to open pool " + "\"%s\""), poolname); return (1); + } for (i = 1; i < argc; i++) { uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); From 7546fbd6e99144f33cb0adf68f2876f61651c8d0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 2 Nov 2024 03:08:33 +1100 Subject: [PATCH 32/47] zdb: add extra -T flag to show histograms of BRT refcounts Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Closes #16692 --- cmd/zdb/zdb.c | 29 ++++++++++++++++++++++++----- man/man8/zdb.8 | 4 +++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 42b82c08249f..46587671202a 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2152,14 +2152,21 @@ dump_brt(spa_t *spa) if (dump_opt['T'] < 3) return; + /* -TTT shows a per-vdev histograms; -TTTT shows all entries */ + boolean_t do_histo = dump_opt['T'] == 3; + char dva[64]; - printf("\n%-16s %-10s\n", "DVA", "REFCNT"); + + if (!do_histo) + printf("\n%-16s %-10s\n", "DVA", "REFCNT"); for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; if (brtvd == NULL || !brtvd->bv_initiated) continue; + uint64_t counts[64] = {}; + zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries); @@ -2172,14 +2179,26 @@ dump_brt(spa_t *spa) za->za_integer_length, za->za_num_integers, &refcnt)); - uint64_t offset = *(const uint64_t *)za->za_name; + if (do_histo) + counts[highbit64(refcnt)]++; + else { + uint64_t offset = + *(const uint64_t *)za->za_name; - snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, - (u_longlong_t)offset); - printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); + snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", + vdevid, (u_longlong_t)offset); + printf("%-16s %-10llu\n", dva, + (u_longlong_t)refcnt); + } } zap_cursor_fini(&zc); zap_attribute_free(za); + + if (do_histo) { + printf("\nBRT: vdev %" PRIu64 + ": DVAs with 2^n refcnts:\n", vdevid); + dump_histogram(counts, 64, 0); + } } } diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 08f5a3f70040..ae35454ad083 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd November 18, 2023 +.Dd October 27, 2024 .Dt ZDB 8 .Os . @@ -408,6 +408,8 @@ blocks cloned, the space saving as a result of cloning, and the saving ratio. .It Fl TT Display the per-vdev BRT statistics, including total references. .It Fl TTT +Display histograms of per-vdev BRT refcounts. +.It Fl TTTT Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. From d367ef2995f3e04d71b8ae648867b323293c7cdf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 1 Nov 2024 10:02:03 -0700 Subject: [PATCH 33/47] ZTS: Add Fedora 41, remove Fedora 39 Fedora 41 was released 10/29/24, and Fedora 39 will be EOL on 11/12/24. Update Fedora runners in the test suite. Some minor tweaks also needed to support ksh 1.0.10. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Tony Hutter Closes #16700 --- .github/workflows/scripts/qemu-2-start.sh | 12 ++++++------ .github/workflows/scripts/qemu-3-deps.sh | 8 +++++++- .github/workflows/scripts/qemu-4-build.sh | 2 +- .github/workflows/zfs-qemu.yml | 6 +++--- .../cli_root/zpool_add/zpool_add_dryrun_output.ksh | 6 +++--- .../zpool_create/zpool_create_dryrun_output.ksh | 4 ++-- .../zpool_split/zpool_split_dryrun_output.ksh | 6 +++--- 7 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.github/workflows/scripts/qemu-2-start.sh b/.github/workflows/scripts/qemu-2-start.sh index 84e13832d10f..39ac92107b71 100755 --- a/.github/workflows/scripts/qemu-2-start.sh +++ b/.github/workflows/scripts/qemu-2-start.sh @@ -52,16 +52,16 @@ case "$OS" in OSNAME="Debian 12" URL="https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" ;; - fedora39) - OSNAME="Fedora 39" - OSv="fedora39" - URL="https://download.fedoraproject.org/pub/fedora/linux/releases/39/Cloud/x86_64/images/Fedora-Cloud-Base-39-1.5.x86_64.qcow2" - ;; fedora40) OSNAME="Fedora 40" - OSv="fedora39" + OSv="fedora-unknown" URL="https://download.fedoraproject.org/pub/fedora/linux/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2" ;; + fedora41) + OSNAME="Fedora 41" + OSv="fedora-unknown" + URL="https://download.fedoraproject.org/pub/fedora/linux/releases/41/Cloud/x86_64/images/Fedora-Cloud-Base-Generic-41-1.4.x86_64.qcow2" + ;; freebsd13-3r) OSNAME="FreeBSD 13.3-RELEASE" OSv="freebsd13.0" diff --git a/.github/workflows/scripts/qemu-3-deps.sh b/.github/workflows/scripts/qemu-3-deps.sh index bf06c1892ceb..96979cd02e09 100755 --- a/.github/workflows/scripts/qemu-3-deps.sh +++ b/.github/workflows/scripts/qemu-3-deps.sh @@ -66,7 +66,13 @@ function rhel() { echo "##[endgroup]" echo "##[group]Install Development Tools" - sudo dnf group install -y "Development Tools" + + # Alma wants "Development Tools", Fedora 41 wants "development-tools" + if ! sudo dnf group install -y "Development Tools" ; then + echo "Trying 'development-tools' instead of 'Development Tools'" + sudo dnf group install -y development-tools + fi + sudo dnf install -y \ acl attr bc bzip2 cryptsetup curl dbench dkms elfutils-libelf-devel fio \ gdb git jq kernel-rpm-macros ksh libacl-devel libaio-devel \ diff --git a/.github/workflows/scripts/qemu-4-build.sh b/.github/workflows/scripts/qemu-4-build.sh index 1051ee1f4deb..955f605f5bce 100755 --- a/.github/workflows/scripts/qemu-4-build.sh +++ b/.github/workflows/scripts/qemu-4-build.sh @@ -83,7 +83,7 @@ function rpm_build_and_install() { echo "##[endgroup]" echo "##[group]Install" - run sudo dnf -y --skip-broken localinstall $(ls *.rpm | grep -v src.rpm) + run sudo dnf -y --nobest install $(ls *.rpm | grep -v src.rpm) echo "##[endgroup]" } diff --git a/.github/workflows/zfs-qemu.yml b/.github/workflows/zfs-qemu.yml index f819e9938e31..e90030f4c02e 100644 --- a/.github/workflows/zfs-qemu.yml +++ b/.github/workflows/zfs-qemu.yml @@ -22,8 +22,8 @@ jobs: - name: Generate OS config and CI type id: os run: | - FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora39", "fedora40", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' - QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora40", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' + FULL_OS='["almalinux8", "almalinux9", "centos-stream9", "debian11", "debian12", "fedora40", "fedora41", "freebsd13-4r", "freebsd14-0r", "freebsd14-1s", "ubuntu20", "ubuntu22", "ubuntu24"]' + QUICK_OS='["almalinux8", "almalinux9", "debian12", "fedora41", "freebsd13-3r", "freebsd14-1r", "ubuntu24"]' # determine CI type when running on PR ci_type="full" if ${{ github.event_name == 'pull_request' }}; then @@ -46,7 +46,7 @@ jobs: strategy: fail-fast: false matrix: - # rhl: almalinux8, almalinux9, centos-stream9, fedora39, fedora40 + # rhl: almalinux8, almalinux9, centos-stream9, fedora40, fedora41 # debian: debian11, debian12, ubuntu20, ubuntu22, ubuntu24 # misc: archlinux, tumbleweed # FreeBSD Release: freebsd13-3r, freebsd13-4r, freebsd14-0r, freebsd14-1r diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh index fa1ce9c64d33..3b06e8c35c77 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh @@ -148,9 +148,9 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset add="${tests[$i].add}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + add="${tests[$i].add}" + want="${tests[$i].want}" log_must eval zpool create "$TESTPOOL" $tree log_must poolexists "$TESTPOOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh index 485891c945a3..0671ea618e05 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh @@ -124,8 +124,8 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + want="${tests[$i].want}" typeset out="$(log_must eval "zpool create -n '$TESTPOOL' $tree" | \ sed /^SUCCESS/d)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh index 410b1fe7a03e..def1d154387d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh @@ -133,9 +133,9 @@ done # Foreach test create pool, add -n devices and check output. for (( i=0; i < ${#tests[@]}; i+=1 )); do - typeset tree="${tests[$i].tree}" - typeset devs="${tests[$i].devs}" - typeset want="${tests[$i].want}" + tree="${tests[$i].tree}" + devs="${tests[$i].devs}" + want="${tests[$i].want}" log_must eval zpool create "$TESTPOOL" $tree log_must poolexists "$TESTPOOL" From 880b73956be6b88a0d8bcd0bb75c066921501f3c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 2 Nov 2024 08:43:25 +1100 Subject: [PATCH 34/47] zfs(4): remove "experimental" from zfs_bclone_enabled I think we've done enough experiments. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Signed-off-by: Rob Norris Closes #16189 Closes #16712 --- man/man4/zfs.4 | 7 ++++--- module/zfs/zfs_vnops.c | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c9f6ed0dece3..da027798f962 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -18,7 +18,7 @@ .\" .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd October 2, 2024 +.Dd November 1, 2024 .Dt ZFS 4 .Os . @@ -1333,9 +1333,10 @@ results in vector instructions from the respective CPU instruction set being used. . .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int -Enable the experimental block cloning feature. +Enables access to the block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, -attempts to clone blocks will act as though the feature is disabled. +using functions and system calls that attempt to clone blocks will act as +though the feature is disabled. . .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index d5e0d2a2b35a..6c15a5c472ea 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -58,9 +58,9 @@ #include /* - * Enable the experimental block cloning feature. If this setting is 0, then - * even if feature@block_cloning is enabled, attempts to clone blocks will act - * as though the feature is disabled. + * Enables access to the block cloning feature. If this setting is 0, then even + * if feature@block_cloning is enabled, using functions and system calls that + * attempt to clone blocks will act as though the feature is disabled. */ int zfs_bclone_enabled = 1; From 55cbd1f9bd22a67d25aab5c7ee2af4c9eee6e697 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 4 Nov 2024 19:42:06 -0500 Subject: [PATCH 35/47] Reduce dirty records memory usage Small block workloads may use a very large number of dirty records. During simple block cloning test due to BRT still using 4KB blocks I can easily see up to 2.5M of those used. Before this change dbuf_dirty_record_t structures representing them were allocated via kmem_zalloc(), that rounded their size up to 512 bytes. Introduction of specialized kmem cache allows to reduce the size from 512 to 408 bytes. Additionally, since override and raw params in dirty records are mutually exclusive, puting them into a union allows to reduce structure size down to 368 bytes, increasing the saving to 28%, that can be a 0.5GB or more of RAM. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16694 --- include/sys/dbuf.h | 26 +++++++++++++++++--------- module/zfs/dbuf.c | 21 ++++++++++++++++----- module/zfs/dmu.c | 9 ++++----- module/zfs/dmu_direct.c | 1 + module/zfs/dnode_sync.c | 2 +- 5 files changed, 39 insertions(+), 20 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index c9bfb9a8026c..e69464809a42 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -171,7 +171,6 @@ typedef struct dbuf_dirty_record { * gets COW'd in a subsequent transaction group. */ arc_buf_t *dr_data; - blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; @@ -179,14 +178,21 @@ typedef struct dbuf_dirty_record { boolean_t dr_diowrite; boolean_t dr_has_raw_params; - /* - * If dr_has_raw_params is set, the following crypt - * params will be set on the BP that's written. - */ - boolean_t dr_byteorder; - uint8_t dr_salt[ZIO_DATA_SALT_LEN]; - uint8_t dr_iv[ZIO_DATA_IV_LEN]; - uint8_t dr_mac[ZIO_DATA_MAC_LEN]; + /* Override and raw params are mutually exclusive. */ + union { + blkptr_t dr_overridden_by; + struct { + /* + * If dr_has_raw_params is set, the + * following crypt params will be set + * on the BP that's written. + */ + boolean_t dr_byteorder; + uint8_t dr_salt[ZIO_DATA_SALT_LEN]; + uint8_t dr_iv[ZIO_DATA_IV_LEN]; + uint8_t dr_mac[ZIO_DATA_MAC_LEN]; + }; + }; } dl; struct dirty_lightweight_leaf { /* @@ -346,6 +352,8 @@ typedef struct dbuf_hash_table { typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); +extern kmem_cache_t *dbuf_dirty_kmem_cache; + uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level, const uint64_t offset); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index df9368fc8bdb..b1419d96f4ef 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -182,6 +182,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_kmem_cache; +kmem_cache_t *dbuf_dirty_kmem_cache; static taskq_t *dbu_evict_taskq; static kthread_t *dbuf_cache_evict_thread; @@ -966,6 +967,8 @@ dbuf_init(void) dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + dbuf_dirty_kmem_cache = kmem_cache_create("dbuf_dirty_record_t", + sizeof (dbuf_dirty_record_t), 0, NULL, NULL, NULL, NULL, NULL, 0); for (int i = 0; i < hmsize; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_NOLOCKDEP, NULL); @@ -1041,6 +1044,7 @@ dbuf_fini(void) sizeof (kmutex_t)); kmem_cache_destroy(dbuf_kmem_cache); + kmem_cache_destroy(dbuf_dirty_kmem_cache); taskq_destroy(dbu_evict_taskq); mutex_enter(&dbuf_evict_lock); @@ -2343,7 +2347,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ - dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + dr = kmem_cache_alloc(dbuf_dirty_kmem_cache, KM_SLEEP); + memset(dr, 0, sizeof (*dr)); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); dr->dr_dnode = dn; @@ -2526,7 +2531,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); ASSERT3U(db->db_dirtycnt, >, 0); db->db_dirtycnt -= 1; } @@ -2616,7 +2621,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; @@ -2941,7 +2946,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, * (see dbuf_sync_dnode_leaf_crypt()). */ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); - ASSERT3U(db->db_level, ==, 0); + ASSERT0(db->db_level); ASSERT(db->db_objset->os_raw_receive); dmu_buf_will_dirty_impl(db_fake, @@ -2950,6 +2955,7 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, dr = dbuf_find_dirty_eq(db, tx->tx_txg); ASSERT3P(dr, !=, NULL); + ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); dr->dt.dl.dr_has_raw_params = B_TRUE; dr->dt.dl.dr_byteorder = byteorder; @@ -2964,10 +2970,14 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) struct dirty_leaf *dl; dbuf_dirty_record_t *dr; + ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT); + ASSERT0(db->db_level); + dr = list_head(&db->db_dirty_records); ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); @@ -3040,6 +3050,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, ASSERT3P(dr, !=, NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); @@ -5083,7 +5094,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, zio->io_txg); - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); } static void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3f87cfe6bee9..362415a25895 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1895,6 +1895,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { + ASSERT0(dr->dt.dl.dr_has_raw_params); dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; @@ -2190,6 +2191,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) return (SET_ERROR(EALREADY)); } + ASSERT0(dr->dt.dl.dr_has_raw_params); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); @@ -2657,6 +2659,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; + ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); @@ -2672,11 +2675,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; - ASSERT0(db->db_level); - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT(db->db_blkid != DMU_SPILL_BLKID); - ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2685,6 +2683,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; + ASSERT0(dl->dr_has_raw_params); dl->dr_overridden_by = *bp; if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index ed96e7515bc7..40b78b519f49 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -180,6 +180,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) if (list_next(&db->db_dirty_records, dr_head) != NULL) zp.zp_nopwrite = B_FALSE; + ASSERT0(dr_head->dt.dl.dr_has_raw_params); ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index f67dad002319..122d7d0d17d8 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -566,7 +566,7 @@ dnode_undirty_dbufs(list_t *list) mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); + kmem_cache_free(dbuf_dirty_kmem_cache, dr); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); } } From b96845b63204336a0e05a1b1c27cb02d626d4f5d Mon Sep 17 00:00:00 2001 From: Uglymotha Date: Tue, 5 Nov 2024 01:44:38 +0100 Subject: [PATCH 36/47] Verify parent_dev before calling udev_device_get_sysattr_value Not all udev devices have parent devices. Calling udev_device_get_ functions yield an assertion error if called with a NULL pointer. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Sietse Co-authored-by: Sietse Closes #16705 Closes #16717 --- cmd/zed/zed_disk_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 32a8789d3001..4a2b7398e922 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -139,7 +139,8 @@ dev_event_nvlist(struct udev_device *dev) * is /dev/sda. */ struct udev_device *parent_dev = udev_device_get_parent(dev); - if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + if (parent_dev != NULL && + (value = udev_device_get_sysattr_value(parent_dev, "size")) != NULL) { uint64_t numval = DEV_BSIZE; From ae48c2f6a97bd95b84099439bbb1e668fd7c2548 Mon Sep 17 00:00:00 2001 From: tstabrawa <59430211+tstabrawa@users.noreply.github.com> Date: Sun, 3 Nov 2024 15:30:23 -0600 Subject: [PATCH 37/47] Revert "Avoid BUG in migrate_folio_extra" This reverts commit b052035990594408899fa32fd4ad6603b75b6c6d. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Brian Atkinson Signed-off-by: tstabrawa <59430211+tstabrawa@users.noreply.github.com> Closes #16568 Closes #16723 --- config/kernel-vfs-invalidate_folio.m4 | 33 ------------------- config/kernel-vfs-release_folio.m4 | 32 ------------------- config/kernel.m4 | 4 --- module/os/linux/zfs/zfs_vnops_os.c | 17 ---------- module/os/linux/zfs/zfs_znode_os.c | 8 ----- module/os/linux/zfs/zpl_file.c | 46 --------------------------- 6 files changed, 140 deletions(-) delete mode 100644 config/kernel-vfs-invalidate_folio.m4 delete mode 100644 config/kernel-vfs-release_folio.m4 diff --git a/config/kernel-vfs-invalidate_folio.m4 b/config/kernel-vfs-invalidate_folio.m4 deleted file mode 100644 index 61a5c8478af1..000000000000 --- a/config/kernel-vfs-invalidate_folio.m4 +++ /dev/null @@ -1,33 +0,0 @@ -dnl # -dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_invalidate_folio], [ - #include - - static void - test_invalidate_folio(struct folio *folio, size_t offset, - size_t len) { - (void) folio; (void) offset; (void) len; - return; - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .invalidate_folio = test_invalidate_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO], [ - dnl # - dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page - dnl # - AC_MSG_CHECKING([whether invalidate_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_invalidate_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_INVALIDATE_FOLIO, 1, [invalidate_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel-vfs-release_folio.m4 b/config/kernel-vfs-release_folio.m4 deleted file mode 100644 index f31db5677fd3..000000000000 --- a/config/kernel-vfs-release_folio.m4 +++ /dev/null @@ -1,32 +0,0 @@ -dnl # -dnl # Linux 5.19 uses release_folio in lieu of releasepage -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_release_folio], [ - #include - - static bool - test_release_folio(struct folio *folio, gfp_t gfp) { - (void) folio; (void) gfp; - return (0); - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .release_folio = test_release_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_RELEASE_FOLIO], [ - dnl # - dnl # Linux 5.19 uses release_folio in lieu of releasepage - dnl # - AC_MSG_CHECKING([whether release_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_release_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_RELEASE_FOLIO, 1, [release_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 556df58082f9..761f9310753a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -77,8 +77,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO - ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES @@ -189,8 +187,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO - ZFS_AC_KERNEL_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 469197220859..dd9fd760b9c2 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -260,15 +260,6 @@ update_pages(znode_t *zp, int64_t start, int len, objset_t *os) } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration - * will wait for us to finish writeback - * before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } if (mapping_writably_mapped(mp)) flush_dcache_page(pp); @@ -4090,14 +4081,6 @@ zfs_fillpage(struct inode *ip, struct page *pp) } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } } return (error); diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index bc1e17f086d9..bbaca2f58394 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -1577,14 +1577,6 @@ zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } unlock_page(pp); put_page(pp); } diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 50c63695dcc8..4d1bf1d5477f 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -607,42 +607,6 @@ zpl_writepage(struct page *pp, struct writeback_control *wbc) return (zpl_putpage(pp, wbc, &for_sync)); } -static int -zpl_releasepage(struct page *pp, gfp_t gfp) -{ - if (PagePrivate(pp)) { - ClearPagePrivate(pp); - put_page(pp); - } - return (1); -} - -#ifdef HAVE_VFS_RELEASE_FOLIO -static bool -zpl_release_folio(struct folio *folio, gfp_t gfp) -{ - return (zpl_releasepage(&folio->page, gfp)); -} -#endif - -#ifdef HAVE_VFS_INVALIDATE_FOLIO -static void -zpl_invalidate_folio(struct folio *folio, size_t offset, size_t len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(&folio->page, 0); - } -} -#else -static void -zpl_invalidatepage(struct page *pp, unsigned int offset, unsigned int len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(pp, 0); - } -} -#endif - /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE @@ -1126,16 +1090,6 @@ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif -#ifdef HAVE_VFS_RELEASE_FOLIO - .release_folio = zpl_release_folio, -#else - .releasepage = zpl_releasepage, -#endif -#ifdef HAVE_VFS_INVALIDATE_FOLIO - .invalidate_folio = zpl_invalidate_folio, -#else - .invalidatepage = zpl_invalidatepage, -#endif }; const struct file_operations zpl_file_operations = { From 661bb434e6a9beb47aac2fbc5ce94ba490e683dc Mon Sep 17 00:00:00 2001 From: tstabrawa <59430211+tstabrawa@users.noreply.github.com> Date: Sun, 3 Nov 2024 16:00:20 -0600 Subject: [PATCH 38/47] Use simple folio migration function Avoids using fallback_migrate_folio, which starts unnecessary writeback (leading to BUG in migrate_folio_extra). Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Brian Atkinson Signed-off-by: tstabrawa <59430211+tstabrawa@users.noreply.github.com> Closes #16568 Closes #16723 --- config/kernel-vfs-migrate_folio.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file.c | 6 ++++++ 3 files changed, 35 insertions(+) create mode 100644 config/kernel-vfs-migrate_folio.m4 diff --git a/config/kernel-vfs-migrate_folio.m4 b/config/kernel-vfs-migrate_folio.m4 new file mode 100644 index 000000000000..186cd0581a17 --- /dev/null +++ b/config/kernel-vfs-migrate_folio.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO], [ + ZFS_LINUX_TEST_SRC([vfs_has_migrate_folio], [ + #include + #include + + static const struct address_space_operations + aops __attribute__ ((unused)) = { + .migrate_folio = migrate_folio, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO], [ + dnl # + dnl # Linux 6.0 uses migrate_folio in lieu of migrate_page + dnl # + AC_MSG_CHECKING([whether migrate_folio exists]) + ZFS_LINUX_TEST_RESULT([vfs_has_migrate_folio], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_VFS_MIGRATE_FOLIO, 1, [migrate_folio exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 761f9310753a..78f178ff27ac 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -77,6 +77,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO + ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES @@ -187,6 +188,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO + ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 4d1bf1d5477f..f6e014327717 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -28,6 +28,7 @@ #include #endif #include +#include #include #include #include @@ -1090,6 +1091,11 @@ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif +#ifdef HAVE_VFS_MIGRATE_FOLIO + .migrate_folio = migrate_folio, +#else + .migratepage = migrate_page, +#endif }; const struct file_operations zpl_file_operations = { From c82eb27b22c63f197aacb77528b4efa179f476a4 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 6 Nov 2024 11:52:01 -0800 Subject: [PATCH 39/47] ZFS send should use spill block prefetched from send_reader_thread Currently, even though send_reader_thread prefetches spill block, do_dump() will not use it and issues its own blocking arc_read. This causes significant performance degradation when sending datasets with lots of spill blocks. For unmodified spill blocks, we also create send_range struct for them in send_reader_thread and issue prefetches for them. We piggyback them on the dnode send_range instead of enqueueing them so we don't break send_range_after check. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Chunwei Chen Co-authored-by: david.chen Closes #16701 --- module/zfs/dmu_send.c | 126 +++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index a174972e9b57..aeb8ff3b6688 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -180,6 +180,8 @@ struct send_range { */ dnode_phys_t *dnp; blkptr_t bp; + /* Piggyback unmodified spill block */ + struct send_range *spill_range; } object; struct srr { uint32_t datablksz; @@ -231,6 +233,8 @@ range_free(struct send_range *range) size_t size = sizeof (dnode_phys_t) * (range->sru.object.dnp->dn_extra_slots + 1); kmem_free(range->sru.object.dnp, size); + if (range->sru.object.spill_range) + range_free(range->sru.object.spill_range); } else if (range->type == DATA) { mutex_enter(&range->sru.data.lock); while (range->sru.data.io_outstanding) @@ -617,7 +621,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, drrs->drr_length = blksz; drrs->drr_toguid = dscp->dsc_toguid; - /* See comment in dump_dnode() for full details */ + /* See comment in piggyback_unmodified_spill() for full details */ if (zfs_send_unmodified_spill_blocks && (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; @@ -793,35 +797,6 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); - /* - * Send DRR_SPILL records for unmodified spill blocks. This is useful - * because changing certain attributes of the object (e.g. blocksize) - * can cause old versions of ZFS to incorrectly remove a spill block. - * Including these records in the stream forces an up to date version - * to always be written ensuring they're never lost. Current versions - * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can - * ignore these unmodified spill blocks. - */ - if (zfs_send_unmodified_spill_blocks && - (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { - struct send_range record; - blkptr_t *bp = DN_SPILL_BLKPTR(dnp); - - memset(&record, 0, sizeof (struct send_range)); - record.type = DATA; - record.object = object; - record.eos_marker = B_FALSE; - record.start_blkid = DMU_SPILL_BLKID; - record.end_blkid = record.start_blkid + 1; - record.sru.data.bp = *bp; - record.sru.data.obj_type = dnp->dn_type; - record.sru.data.datablksz = BP_GET_LSIZE(bp); - - if (do_dump(dscp, &record) != 0) - return (SET_ERROR(EINTR)); - } - if (dscp->dsc_err != 0) return (SET_ERROR(EINTR)); @@ -911,6 +886,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) case OBJECT: err = dump_dnode(dscp, &range->sru.object.bp, range->object, range->sru.object.dnp); + /* Dump piggybacked unmodified spill block */ + if (!err && range->sru.object.spill_range) + err = do_dump(dscp, range->sru.object.spill_range); return (err); case OBJECT_RANGE: { ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); @@ -939,34 +917,7 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); - if (BP_GET_TYPE(bp) == DMU_OT_SA) { - arc_flags_t aflags = ARC_FLAG_WAIT; - zio_flag_t zioflags = ZIO_FLAG_CANFAIL; - if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { - ASSERT(BP_IS_PROTECTED(bp)); - zioflags |= ZIO_FLAG_RAW; - } - - zbookmark_phys_t zb; - ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); - zb.zb_objset = dmu_objset_id(dscp->dsc_os); - zb.zb_object = range->object; - zb.zb_level = 0; - zb.zb_blkid = range->start_blkid; - - arc_buf_t *abuf = NULL; - if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, - bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - zioflags, &aflags, &zb) != 0) - return (SET_ERROR(EIO)); - - err = dump_spill(dscp, bp, zb.zb_object, - (abuf == NULL ? NULL : abuf->b_data)); - if (abuf != NULL) - arc_buf_destroy(abuf, &abuf); - return (err); - } if (send_do_embed(bp, dscp->dsc_featureflags)) { err = dump_write_embedded(dscp, range->object, range->start_blkid * srdp->datablksz, @@ -975,8 +926,9 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) } ASSERT(range->object > dscp->dsc_resume_object || (range->object == dscp->dsc_resume_object && + (range->start_blkid == DMU_SPILL_BLKID || range->start_blkid * srdp->datablksz >= - dscp->dsc_resume_offset)); + dscp->dsc_resume_offset))); /* it's a level-0 block of a regular object */ mutex_enter(&srdp->lock); @@ -1006,8 +958,6 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) ASSERT(dscp->dsc_dso->dso_dryrun || srdp->abuf != NULL || srdp->abd != NULL); - uint64_t offset = range->start_blkid * srdp->datablksz; - char *data = NULL; if (srdp->abd != NULL) { data = abd_to_buf(srdp->abd); @@ -1016,6 +966,14 @@ do_dump(dmu_send_cookie_t *dscp, struct send_range *range) data = srdp->abuf->b_data; } + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + err = dump_spill(dscp, bp, range->object, data); + return (err); + } + + uint64_t offset = range->start_blkid * srdp->datablksz; + /* * If we have large blocks stored on disk but the send flags * don't allow us to send large blocks, we split the data from @@ -1098,6 +1056,8 @@ range_alloc(enum type type, uint64_t object, uint64_t start_blkid, range->sru.data.io_outstanding = 0; range->sru.data.io_err = 0; range->sru.data.io_compressed = B_FALSE; + } else if (type == OBJECT) { + range->sru.object.spill_range = NULL; } return (range); } @@ -1742,6 +1702,45 @@ enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, bqueue_enqueue(q, range, datablksz); } +/* + * Send DRR_SPILL records for unmodified spill blocks. This is useful + * because changing certain attributes of the object (e.g. blocksize) + * can cause old versions of ZFS to incorrectly remove a spill block. + * Including these records in the stream forces an up to date version + * to always be written ensuring they're never lost. Current versions + * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can + * ignore these unmodified spill blocks. + * + * We piggyback the spill_range to dnode range instead of enqueueing it + * so send_range_after won't complain. + */ +static uint64_t +piggyback_unmodified_spill(struct send_reader_thread_arg *srta, + struct send_range *range) +{ + ASSERT3U(range->type, ==, OBJECT); + + dnode_phys_t *dnp = range->sru.object.dnp; + uint64_t fromtxg = srta->smta->to_arg->fromtxg; + + if (!zfs_send_unmodified_spill_blocks || + !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) || + !(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= fromtxg)) + return (0); + + blkptr_t *bp = DN_SPILL_BLKPTR(dnp); + struct send_range *spill_range = range_alloc(DATA, range->object, + DMU_SPILL_BLKID, DMU_SPILL_BLKID+1, B_FALSE); + spill_range->sru.data.bp = *bp; + spill_range->sru.data.obj_type = dnp->dn_type; + spill_range->sru.data.datablksz = BP_GET_LSIZE(bp); + + issue_data_read(srta, spill_range); + range->sru.object.spill_range = spill_range; + + return (BP_GET_LSIZE(bp)); +} + /* * This thread is responsible for two things: First, it retrieves the correct * blkptr in the to ds if we need to send the data because of something from @@ -1773,17 +1772,20 @@ send_reader_thread(void *arg) uint64_t last_obj_exists = B_TRUE; while (!range->eos_marker && !srta->cancel && smta->error == 0 && err == 0) { + uint64_t spill = 0; switch (range->type) { case DATA: issue_data_read(srta, range); bqueue_enqueue(outq, range, range->sru.data.datablksz); range = get_next_range_nofree(inq, range); break; - case HOLE: case OBJECT: + spill = piggyback_unmodified_spill(srta, range); + zfs_fallthrough; + case HOLE: case OBJECT_RANGE: case REDACT: // Redacted blocks must exist - bqueue_enqueue(outq, range, sizeof (*range)); + bqueue_enqueue(outq, range, sizeof (*range) + spill); range = get_next_range_nofree(inq, range); break; case PREVIOUSLY_REDACTED: { From 8131793d6f08e359238a3d56428a96b81480bd0a Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 7 Nov 2024 14:11:05 -0500 Subject: [PATCH 40/47] Update ABD stats for linear page Linux a10e552 updated abd_free_linear_page() to no longer call abd_update_scatter_stat(). This meant that linear pages that were not attached to Direct I/O requests were not doing waste accounting for the ARC. This led to performance issues due to incorrect ARC accounting that resulted in 100% of CPU time being spent in arc_evict() during prolonged I/O workloads with the ARC. The call to abd_update_scatter_stats() is now conditionally called in abd_free_linear_page() when the ABD is not from a Direct I/O request. Reviewed-by: Mark Maybee Reviewed-by: Tony Nguyen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Brian Atkinson Closes #16729 --- module/os/linux/zfs/abd_os.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 303af48cf3af..04ab8bbca352 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -701,6 +701,8 @@ abd_free_linear_page(abd_t *abd) /* When backed by user page unmap it */ if (abd_is_from_pages(abd)) zfs_kunmap(sg_page(sg)); + else + abd_update_scatter_stats(abd, ABDSTAT_DECR); abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; From e12d76176d4e5454db62eb48b58ecd4970838a76 Mon Sep 17 00:00:00 2001 From: Sam James Date: Thu, 7 Nov 2024 19:20:37 +0000 Subject: [PATCH 41/47] Use instead of When building on musl, we get: ``` In file included from tests/zfs-tests/cmd/getversion.c:22: /usr/include/sys/fcntl.h:1:2: error: #warning redirecting incorrect #include to [-Werror=cpp] 1 | #warning redirecting incorrect #include to In file included from module/os/linux/zfs/vdev_file.c:36: /usr/include/sys/fcntl.h:1:2: error: #warning redirecting incorrect #include to [-Werror=cpp] 1 | #warning redirecting incorrect #include to ``` Bug: https://bugs.gentoo.org/925235 Reviewed-by: Brian Behlendorf Signed-off-by: Sam James Closes #15925 --- module/os/linux/zfs/vdev_file.c | 4 +++- tests/zfs-tests/cmd/getversion.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index 4bffb6412ffd..2cab6532487a 100644 --- a/module/os/linux/zfs/vdev_file.c +++ b/module/os/linux/zfs/vdev_file.c @@ -33,11 +33,13 @@ #include #include #include -#include #include #include #ifdef _KERNEL #include +#include +#else +#include #endif /* * Virtual device vector for files. diff --git a/tests/zfs-tests/cmd/getversion.c b/tests/zfs-tests/cmd/getversion.c index 62c1c5b6abc0..1e026b92d17d 100644 --- a/tests/zfs-tests/cmd/getversion.c +++ b/tests/zfs-tests/cmd/getversion.c @@ -19,9 +19,9 @@ */ #include -#include #include #include +#include #include #include #include From 9061a4da0b2083bf5f7d88d1b19b91b3da62350c Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 8 Nov 2024 00:31:14 +0500 Subject: [PATCH 42/47] JSON: fix user properties output for zfs list This commit fixes JSON output for zfs list when user properties are requested with -o flag. This case needed to be handled specifically since zfs_prop_to_name does not return property name for user properties, instead it is stored in pl->pl_user_prop. Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16732 --- cmd/zfs/zfs_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 97397726c709..7836f5909f4a 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3761,8 +3761,13 @@ collect_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) if (cb->cb_json) { if (pl->pl_prop == ZFS_PROP_NAME) continue; + const char *prop_name; + if (pl->pl_prop != ZPROP_USERPROP) + prop_name = zfs_prop_to_name(pl->pl_prop); + else + prop_name = pl->pl_user_prop; if (zprop_nvlist_one_property( - zfs_prop_to_name(pl->pl_prop), propstr, + prop_name, propstr, sourcetype, source, NULL, props, cb->cb_json_as_int) != 0) nomem(); From 1a54b13aafd275d0f217ab771c21bf54ff13bcf6 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 21 Oct 2024 12:00:47 -0700 Subject: [PATCH 43/47] Tag 2.3.0-rc3 Signed-off-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index eca34a55d004..0fe0dedae79e 100644 --- a/META +++ b/META @@ -2,7 +2,7 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.3.0 -Release: rc2 +Release: rc3 Release-Tags: relext License: CDDL Author: OpenZFS From 41f89d0ce778b625a23cad99fdaf92d5dd0fc12c Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 8 Nov 2024 15:09:47 +0500 Subject: [PATCH 44/47] JSON: fix user properties output for zpool list This commit fixes JSON output for zpool list when user properties are requested with -o flag. This case needed to be handled specifically since zpool_prop_to_name does not return property name for user properties, instead it is stored in pl->pl_user_prop. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16734 --- cmd/zpool/zpool_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 6a45a063d91a..4458b902de31 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6882,8 +6882,13 @@ collect_pool(zpool_handle_t *zhp, list_cbdata_t *cb) if (cb->cb_json) { if (pl->pl_prop == ZPOOL_PROP_NAME) continue; + const char *prop_name; + if (pl->pl_prop != ZPROP_USERPROP) + prop_name = zpool_prop_to_name(pl->pl_prop); + else + prop_name = pl->pl_user_prop; (void) zprop_nvlist_one_property( - zpool_prop_to_name(pl->pl_prop), propstr, + prop_name, propstr, sourcetype, NULL, NULL, props, cb->cb_json_as_int); } else { /* From 092cb94ef68074c09fc3249595f2fd4c5ed20bff Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Fri, 8 Nov 2024 15:13:05 +0500 Subject: [PATCH 45/47] Fix user properties output for zpool list In zpool_get_user_prop, when called from zpool_expand_proplist and collect_pool, we often have zpool_props present in zpool_handle_t equal to NULL. This mostly happens when only one user property is requested using zpool list -o . Checking for this case and correctly initializing the zpool_props field in zpool_handle_t fixes this issue. Interestingly, this issue does not occur if we query any other property like name or guid along with a user property with -o flag because while accessing properties like guid, zpool_prop_get_int is called which checks for this case specifically and calls zpool_get_all_props. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #16734 --- lib/libzfs/libzfs_pool.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ef0346a96fd0..15388ccbbe2b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -471,13 +471,15 @@ int zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf, size_t len, zprop_source_t *srctype) { - nvlist_t *nv, *nvl; + nvlist_t *nv; uint64_t ival; const char *value; zprop_source_t source = ZPROP_SRC_LOCAL; - nvl = zhp->zpool_props; - if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) { + if (zhp->zpool_props == NULL) + zpool_get_all_props(zhp); + + if (nvlist_lookup_nvlist(zhp->zpool_props, propname, &nv) == 0) { if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0) source = ival; verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); From 5a0ec4a496aefa968b53b67e41179870786a7b10 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 13 Nov 2024 10:31:50 -0500 Subject: [PATCH 46/47] L2ARC: Move different stats updates earlier ..., before we make the header or the log block visible to others. It should fix assertion on allocated space going negative if the header is freed once the lock is dropped, while the write is still going. Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16040 Closes #16743 --- module/zfs/arc.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 76dc0b19139d..2ece7d67704f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -9287,6 +9287,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | + ARC_FLAG_L2_WRITING); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + l2arc_hdr_arcstats_increment(hdr); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* @@ -9298,12 +9306,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) } list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); - arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | - ARC_FLAG_L2_WRITING); - - (void) zfs_refcount_add_many(&dev->l2ad_alloc, - arc_hdr_size(hdr), hdr); - l2arc_hdr_arcstats_increment(hdr); boolean_t commit = l2arc_log_blk_insert(dev, hdr); mutex_exit(hash_lock); @@ -9333,7 +9335,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); if (commit) { /* l2ad_hand will be adjusted inside. */ @@ -10585,6 +10586,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) (void) zio_nowait(wzio); dev->l2ad_hand += asize; + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. @@ -10598,7 +10601,6 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); From b79bddb53ed6652409c69b91c5297fe03443f0cc Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Tue, 12 Nov 2024 19:46:13 +0500 Subject: [PATCH 47/47] zvol_os.c: Increase optimal IO size Since zvol read and write can process up to (DMU_MAX_ACCESS / 2) bytes in a single operation, the current optimal I/O size is too low. SCST directly reports this value as the optimal transfer length for the target SCSI device. Increasing it from the previous volblocksize results in performance improvement for large block parallel I/O workloads. Signed-off-by: Ameer Hamza --- module/os/linux/zfs/zvol_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 2396690b40fd..47aa6417068d 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1176,7 +1176,7 @@ zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, limits->zql_max_segment_size = UINT_MAX; } - limits->zql_io_opt = zv->zv_volblocksize; + limits->zql_io_opt = DMU_MAX_ACCESS / 2; limits->zql_physical_block_size = zv->zv_volblocksize; limits->zql_max_discard_sectors =