From 89fcb8c6f969cbfa237849d9b33610124d361186 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 27 Nov 2023 13:49:20 -0800 Subject: [PATCH 01/16] Revert "Tune zio buffer caches and their alignments" This reverts commit bd7a02c251d8c119937e847d5161b512913667e6 which can trigger an unlikely existing bio alignment issue on Linux. This change is good, but the underlying issue it exposes needs to be resolved before this can be re-applied. Signed-off-by: Brian Behlendorf Issue #15533 --- module/zfs/zio.c | 89 +++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a719e5492323..3b3b40fa73d8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -158,22 +158,23 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. + */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t align, cflags, data_cflags; - char name[32]; - - /* - * Create cache for each half-power of 2 size, starting from - * SPA_MINBLOCKSIZE. It should give us memory space efficiency - * of ~7/8, sufficient for transient allocations mostly using - * these caches. - */ size_t p2 = size; + size_t align = 0; + size_t data_cflags, cflags; + + data_cflags = KMC_NODEBUG; + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + while (!ISP2(p2)) p2 &= p2 - 1; - if (!IS_P2ALIGNED(size, p2 / 2)) - continue; #ifndef _KERNEL /* @@ -184,37 +185,47 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; -#endif - - if (IS_P2ALIGNED(size, PAGESIZE)) + /* + * Here's the problem - on 4K native devices in userland on + * Linux using O_DIRECT, buffers must be 4K aligned or I/O + * will fail with EINVAL, causing zdb (and others) to coredump. + * Since userland probably doesn't need optimized buffer caches, + * we just force 4K alignment on everything. + */ + align = 8 * SPA_MINBLOCKSIZE; +#else + if (size < PAGESIZE) { + align = SPA_MINBLOCKSIZE; + } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; - else - align = 1 << (highbit64(size ^ (size - 1)) - 1); + } +#endif - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; - data_cflags = KMC_NODEBUG; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, align, - NULL, NULL, NULL, NULL, NULL, cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; + if (align != 0) { + char name[36]; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. 
+			 */
+			(void) snprintf(name, sizeof (name),
+			    "zio_buf_comb_%lu", (ulong_t)size);
+			zio_buf_cache[c] = kmem_cache_create(name,
+			    size, align, NULL, NULL, NULL, NULL, NULL,
+			    cflags);
+			zio_data_buf_cache[c] = zio_buf_cache[c];
+			continue;
+		}
+		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
+		    (ulong_t)size);
+		zio_buf_cache[c] = kmem_cache_create(name, size,
+		    align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+		    (ulong_t)size);
+		zio_data_buf_cache[c] = kmem_cache_create(name, size,
+		    align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+	}
-	(void) snprintf(name, sizeof (name), "zio_buf_%lu",
-	    (ulong_t)size);
-	zio_buf_cache[c] = kmem_cache_create(name, size, align,
-	    NULL, NULL, NULL, NULL, NULL, cflags);
-
-	(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
-	    (ulong_t)size);
-	zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
-	    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	while (--c != 0) {

From 9b9b09f452a469458451c221debfbab944e7f081 Mon Sep 17 00:00:00 2001
From: Rob N
Date: Wed, 29 Nov 2023 04:15:48 +1100
Subject: [PATCH 02/16] dnode_is_dirty: check dnode and its data for dirtiness

Over its history, the dirty dnode test has alternated between checking
for a dnode being on `os_dirty_dnodes` (`dn_dirty_link`) and checking
`dn_dirty_record`.

de198f2d9 Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency
2531ce372 Revert "Report holes when there are only metadata changes"
ec4f9b8f3 Report holes when there are only metadata changes
454365bba Fix dirty check in dmu_offset_next()
66aca2473 SEEK_HOLE should not block on txg_wait_synced()

Also illumos/illumos-gate@c543ec060d illumos/illumos-gate@2bcf0248e9

It turns out both are actually required. In the case of appending data
to a newly created file, the dnode proper is dirtied (at least to
change the blocksize) and dirty records are added. Thus, a single
logical operation is represented by two separate dirty indicators, and
they must not be checked separately.

The incorrect dirty check becomes a problem when the first block of a
file is being appended to while another process is calling lseek to
skip holes. There is a small window where the dnode part is undirtied
while there are still dirty records. In this case, `lseek(fd, 0,
SEEK_DATA)` would not know that the file is dirty, and would go to
`dnode_next_offset()`. Since the object has no data blocks yet, it
returns `ESRCH`, indicating no data found, which results in `ENXIO`
being returned to `lseek()`'s caller.

Since coreutils 9.2, `cp` performs sparse copies by default, that is,
it uses `SEEK_DATA` and `SEEK_HOLE` against the source file and
attempts to replicate the holes in the target. When it hits the bug,
its initial search for data fails, and it goes on to call `fallocate()`
to create a hole over the entire destination file.

This has come up more recently as users upgrade their systems, getting
OpenZFS 2.2 as well as a newer coreutils. However, this problem has
been reproduced against 2.1, as well as on FreeBSD 13 and 14.

This change simply updates the dirty check to check both types of
dirty. If there's anything dirty at all, we immediately go to the
"wait for sync" stage. It doesn't really matter after that: both
changes are on disk, so the dirty fields should be correct.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Reviewed-by: Rich Ercolani
Signed-off-by: Rob Norris
Closes #15571
Closes #15526
---
 module/zfs/dnode.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 7cf03264dce2..ad9988366ada 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
 }
 
 /*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (e.g. appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
 */
 boolean_t
 dnode_is_dirty(dnode_t *dn)
@@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn)
 	mutex_enter(&dn->dn_mtx);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
-		if (multilist_link_active(&dn->dn_dirty_link[i])) {
+		if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+		    !list_is_empty(&dn->dn_dirty_records[i])) {
 			mutex_exit(&dn->dn_mtx);
 			return (B_TRUE);
 		}

From 56a2a0981ed770ed316769d078c892b1e8fa67a9 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sat, 18 Nov 2023 20:01:03 -0500
Subject: [PATCH 03/16] ZIL: Do not encrypt block pointers in lr_clone_range_t

In case of a crash, cloned blocks need to be claimed on pool import.
That is only possible if they (lr_bps) and their count (lr_nbps) are
not encrypted but only authenticated, similar to the block pointer in
lr_write_t. A few other fields can be, and still are, encrypted.

This should fix a panic on ZIL claim after a crash when block cloning
is actively used.

Reviewed-by: Richard Yao
Reviewed-by: Tom Caputi
Reviewed-by: Sean Eric Fagan
Reviewed-by: Brian Behlendorf
Reviewed-by: Edmund Nadolski
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15543 Closes #15513 --- module/os/freebsd/zfs/zio_crypt.c | 13 +++++++++++++ module/os/linux/zfs/zio_crypt.c | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index fdbe13dbb5e9..024a931d7816 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1364,6 +1364,19 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, vec++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); + dst_iovecs[vec].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + vec++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 55554d09ee43..775ab8efbcdf 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1543,6 +1543,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, nr_iovecs++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + nr_iovecs++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); From 41c4599cba75c5bb18018fdec022a907e5f14ffa Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 27 Nov 2023 13:24:37 -0800 Subject: [PATCH 04/16] ZTS: Fix zfs_load-key failures on F39 The zfs_load-key tests were failing on F39 due to their use of the deprecated ssl.wrap_socket function. This commit updates the test to instead use ssl.SSLContext() as described in https://stackoverflow.com/a/65194957. 
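The replacement pattern, in rough outline (a minimal sketch with
placeholder certificate paths; the test itself substitutes its real
paths via shell variables, and `httpd` stands for the http.server
instance the test already has):

    import ssl

    # ssl.wrap_socket() was deprecated in Python 3.7 and removed in
    # 3.12, the version shipped with F39. Build an explicit server-side
    # context instead and let it wrap the listening socket.
    sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    sslctx.check_hostname = False
    sslctx.load_cert_chain(certfile='cert.pem', keyfile='key.pem')
    httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)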
Reviewed-by: Brian Behlendorf
Signed-off-by: Tony Hutter
Closes #15534
Closes #15550
---
 .../cli_root/zfs_load-key/zfs_load-key_common.kshlib | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
index 4a85999b4ab8..f174eeeeaae9 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
@@ -123,7 +123,10 @@ if not httpd:
 with open('$HTTPS_PORT_FILE', 'w') as portf:
     print(port, file=portf)
 
-httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS)
+sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+sslctx.check_hostname = False
+sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key')
+httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)
 
 os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')

From d702f86eaf5aba13762f436152026ba0befa1a23 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Sat, 18 Nov 2023 21:32:16 +1100
Subject: [PATCH 05/16] brt: lift internal definitions into _impl header

So that zdb (and others!) can get at the BRT on-disk structures.

Reviewed-by: Alexander Motin
Reviewed-by: Kay Pedersen
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Closes #15541
---
 include/Makefile.am    |   1 +
 include/sys/brt_impl.h | 199 +++++++++++++++++++++++++++++++++++++++++
 module/zfs/brt.c       | 164 +--------------------------------------
 3 files changed, 201 insertions(+), 163 deletions(-)
 create mode 100644 include/sys/brt_impl.h

diff --git a/include/Makefile.am b/include/Makefile.am
index 569de6dfa781..5f38f6ac6ddb 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -33,6 +33,7 @@ COMMON_H = \
 	sys/bqueue.h \
 	sys/btree.h \
 	sys/brt.h \
+	sys/brt_impl.h \
 	sys/dataset_kstats.h \
 	sys/dbuf.h \
 	sys/ddt.h \

diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h
new file mode 100644
index 000000000000..9cc06fbb2c3a
--- /dev/null
+++ b/include/sys/brt_impl.h
@@ -0,0 +1,199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#ifndef _SYS_BRT_IMPL_H
+#define	_SYS_BRT_IMPL_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define	BRT_RANGESIZE	(16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+	"BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain bitmap
+ * of dirty blocks within the regions, so that a single bit represents a
+ * block size of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
+ * is represented by a single bit. This gives us 4096 bits. A set bit in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define	BRT_BLOCKSIZE	(32 * 1024)
+#define	BRT_RANGESIZE_TO_NBLOCKS(size)	\
+	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define	BRT_LITTLE_ENDIAN	0
+#define	BRT_BIG_ENDIAN		1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
+#else
+#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
+#endif
+
+typedef struct brt_vdev_phys {
+	uint64_t	bvp_mos_entries;
+	uint64_t	bvp_size;
+	uint64_t	bvp_byteorder;
+	uint64_t	bvp_totalcount;
+	uint64_t	bvp_rangesize;
+	uint64_t	bvp_usedspace;
+	uint64_t	bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+	/*
+	 * VDEV id.
+	 */
+	uint64_t	bv_vdevid;
+	/*
+	 * Is the structure initiated?
+	 * (bv_entcount and bv_bitmap are allocated?)
+	 */
+	boolean_t	bv_initiated;
+	/*
+	 * Object number in the MOS for the entcount array and brt_vdev_phys.
+	 */
+	uint64_t	bv_mos_brtvdev;
+	/*
+	 * Object number in the MOS for the entries table.
+	 */
+	uint64_t	bv_mos_entries;
+	/*
+	 * Entries to sync.
+	 */
+	avl_tree_t	bv_tree;
+	/*
+	 * Does the bv_entcount[] array need byte swapping?
+	 */
+	boolean_t	bv_need_byteswap;
+	/*
+	 * Number of entries in the bv_entcount[] array.
+	 */
+	uint64_t	bv_size;
+	/*
+	 * This is the array with BRT entry count per BRT_RANGESIZE.
+	 */
+	uint16_t	*bv_entcount;
+	/*
+	 * Sum of all bv_entcount[]s.
+	 */
+	uint64_t	bv_totalcount;
+	/*
+	 * Space on disk occupied by cloned blocks (without compression).
+	 */
+	uint64_t	bv_usedspace;
+	/*
+	 * How much additional space would be occupied without block cloning.
+	 */
+	uint64_t	bv_savedspace;
+	/*
+	 * brt_vdev_phys needs updating on disk.
+	 */
+	boolean_t	bv_meta_dirty;
+	/*
+	 * bv_entcount[] needs updating on disk.
+	 */
+	boolean_t	bv_entcount_dirty;
+	/*
+	 * bv_entcount[] potentially can be a bit too big to synchronize it all
+	 * when we have changed only a few entcounts.
+	 * The fields below allow us to track updates to the bv_entcount[]
+	 * array since the last sync. A single bit in the bv_bitmap represents
+	 * as many entcounts as can fit into a single BRT_BLOCKSIZE.
+	 * For example, if we have 65536 entcounts in the bv_entcount array
+	 * (so the whole array is 128kB) and we update bv_entcount[2] and
+	 * bv_entcount[5], only the first bit in the bv_bitmap will be set,
+	 * and we will write only the first BRT_BLOCKSIZE out of 128kB.
+	 */
+	ulong_t		*bv_bitmap;
+	uint64_t	bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core brt
+ */
+typedef struct brt {
+	krwlock_t	brt_lock;
+	spa_t		*brt_spa;
+#define	brt_mos		brt_spa->spa_meta_objset
+	uint64_t	brt_rangesize;
+	uint64_t	brt_usedspace;
+	uint64_t	brt_savedspace;
+	avl_tree_t	brt_pending_tree[TXG_SIZE];
+	kmutex_t	brt_pending_lock[TXG_SIZE];
+	/* Sum of all entries across all bv_trees. */
+	uint64_t	brt_nentries;
+	brt_vdev_t	*brt_vdevs;
+	uint64_t	brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define	BRT_KEY_WORDS	(1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+	uint64_t	bre_offset;
+	uint64_t	bre_refcount;
+	avl_node_t	bre_node;
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+	blkptr_t	bpe_bp;
+	int		bpe_count;
+	avl_node_t	bpe_node;
+} brt_pending_entry_t;
+
+#ifdef __cplusplus
}
+#endif
+
+#endif	/* _SYS_BRT_IMPL_H */

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index ddd8eefe600b..b0529521ec76 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -243,169 +244,6 @@
  * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
  */
 
-/*
- * BRT - Block Reference Table.
- */
-#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
-
-/*
- * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
- * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
- * Each element in this array represents how many BRT entries do we have in this
- * chunk of storage. We always load this entire array into memory and update as
- * needed. By having it in memory we can quickly tell (during zio_free()) if
- * there are any BRT entries that we might need to update.
- *
- * This value cannot be larger than 16MB, at least as long as we support
- * 512 byte block sizes. With 512 byte block size we can have exactly
- * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
- * many for a 16bit counter.
- */
-#define	BRT_RANGESIZE	(16 * 1024 * 1024)
-_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
-	"BRT_RANGESIZE is too large.");
-/*
- * We don't want to update the whole structure every time. Maintain bitmap
- * of dirty blocks within the regions, so that a single bit represents a
- * block size of entcounts. For example if we have a 1PB vdev then all
- * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
- * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
- * the whole 128MB on disk when we have updated only a single entcount.
- * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
- * is represented by a single bit. This gives us 4096 bits. A set bit in the
- * bitmap means that we had a change in at least one of the 16384 entcounts
- * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
- */ -#define BRT_BLOCKSIZE (32 * 1024) -#define BRT_RANGESIZE_TO_NBLOCKS(size) \ - (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) - -#define BRT_LITTLE_ENDIAN 0 -#define BRT_BIG_ENDIAN 1 -#ifdef _ZFS_LITTLE_ENDIAN -#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#else -#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#endif - -typedef struct brt_vdev_phys { - uint64_t bvp_mos_entries; - uint64_t bvp_size; - uint64_t bvp_byteorder; - uint64_t bvp_totalcount; - uint64_t bvp_rangesize; - uint64_t bvp_usedspace; - uint64_t bvp_savedspace; -} brt_vdev_phys_t; - -typedef struct brt_vdev { - /* - * VDEV id. - */ - uint64_t bv_vdevid; - /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) - */ - boolean_t bv_initiated; - /* - * Object number in the MOS for the entcount array and brt_vdev_phys. - */ - uint64_t bv_mos_brtvdev; - /* - * Object number in the MOS for the entries table. - */ - uint64_t bv_mos_entries; - /* - * Entries to sync. - */ - avl_tree_t bv_tree; - /* - * Does the bv_entcount[] array needs byte swapping? - */ - boolean_t bv_need_byteswap; - /* - * Number of entries in the bv_entcount[] array. - */ - uint64_t bv_size; - /* - * This is the array with BRT entry count per BRT_RANGESIZE. - */ - uint16_t *bv_entcount; - /* - * Sum of all bv_entcount[]s. - */ - uint64_t bv_totalcount; - /* - * Space on disk occupied by cloned blocks (without compression). - */ - uint64_t bv_usedspace; - /* - * How much additional space would be occupied without block cloning. - */ - uint64_t bv_savedspace; - /* - * brt_vdev_phys needs updating on disk. - */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. - * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. - */ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; - -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; - -/* Size of bre_offset / sizeof (uint64_t). */ -#define BRT_KEY_WORDS (1) - -/* - * In-core brt entry. - * On-disk we use bre_offset as the key and bre_refcount as the value. 
- */ -typedef struct brt_entry { - uint64_t bre_offset; - uint64_t bre_refcount; - avl_node_t bre_node; -} brt_entry_t; - -typedef struct brt_pending_entry { - blkptr_t bpe_bp; - int bpe_count; - avl_node_t bpe_node; -} brt_pending_entry_t; - static kmem_cache_t *brt_entry_cache; static kmem_cache_t *brt_pending_entry_cache; From e96675a7b131c0049c7dd2b269ab0344f530999e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 18 Nov 2023 21:33:45 +1100 Subject: [PATCH 06/16] zdb: show BRT statistics and dump its contents Same idea as the dedup stats, but for block cloning. Reviewed-by: Alexander Motin Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15541 --- cmd/zdb/zdb.c | 90 ++++++++++++++++++- man/man8/zdb.8 | 11 ++- .../functional/cli_root/zdb/zdb_args_neg.ksh | 2 +- 3 files changed, 99 insertions(+), 4 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 005bf3f16590..201a443431cc 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -34,6 +34,7 @@ * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, Rob Norris */ #include @@ -80,6 +81,7 @@ #include #include #include +#include #include #include @@ -899,6 +901,8 @@ usage(void) "don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); + (void) fprintf(stderr, " -T --brt-stats " + "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " @@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen) nicenum(num, buf, buflen); } +static void +zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) +{ + if (dump_opt['P']) + (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); + else + zfs_nicebytes(bytes, buf, buflen); +} + static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; @@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa) dump_dedup_ratio(&dds_total); } +static void +dump_brt(spa_t *spa) +{ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: unsupported on this pool\n"); + return; + } + + if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: empty\n"); + return; + } + + brt_t *brt = spa->spa_brt; + VERIFY(brt); + + char count[32], used[32], saved[32]; + zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); + zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); + uint64_t ratio = brt_get_ratio(spa); + printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, + (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); + + if (dump_opt['T'] < 2) + return; + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL) + continue; + + if (!brtvd->bv_initiated) { + printf("BRT: vdev %lu: empty\n", vdevid); + continue; + } + + zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); + zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); + zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); + printf("BRT: vdev %lu: refcnt %s; used %s; saved %s\n", + vdevid, count, used, saved); + } + + if (dump_opt['T'] < 3) + return; + + char dva[64]; + printf("\n%-16s %-10s\n", "DVA", "REFCNT"); + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL || !brtvd->bv_initiated) + 
continue;
+
+		zap_cursor_t zc;
+		zap_attribute_t za;
+		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t offset = *(uint64_t *)za.za_name;
+			uint64_t refcnt = za.za_first_integer;
+
+			snprintf(dva, sizeof (dva), "%lu:%llx", vdevid,
+			    (u_longlong_t)offset);
+			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
+		}
+		zap_cursor_fini(&zc);
+	}
+}
+
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
@@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
+	if (dump_opt['T'])
+		dump_brt(spa);
+
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
@@ -8879,6 +8965,7 @@ main(int argc, char **argv)
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
+		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
@@ -8892,7 +8979,7 @@ main(int argc, char **argv)
 	};
 
 	while ((c = getopt_long(argc, argv,
-	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
+	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'b':
@@ -8914,6 +9001,7 @@ main(int argc, char **argv)
 		case 'R':
 		case 's':
 		case 'S':
+		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':

diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index 52c8e452fa7c..d7f66d917ac7 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd June 27, 2023
+.Dd November 18, 2023
 .Dt ZDB 8
 .Os
 .
@@ -23,7 +23,7 @@
 .Nd display ZFS storage pool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhikLMNPsvXYy
+.Op Fl AbcdDFGhikLMNPsTvXYy
 .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
 .Op Fl I Ar inflight-I/O-ops
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns …
@@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from
 Simulate the effects of deduplication, constructing a DDT and then display
 that DDT as with
 .Fl DD .
+.It Fl T , -brt-stats
+Display block reference table (BRT) statistics, including the size of unique
+blocks cloned, the space saving as a result of cloning, and the saving ratio.
+.It Fl TT
+Display the per-vdev BRT statistics, including total references.
+.It Fl TTT
+Dump the contents of the block reference tables.
 .It Fl u , -uberblock
 Display the current uberblock.
 .El

diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
index 168e7c18c3a3..688d488ceb62 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
@@ -58,7 +58,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
 	"setvprop" "blah blah" "-%" "--?" "-*" "-=" \
 	"-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \
 	"-t" "-w" "-z" "-E" "-H" "-I" "-J" \
-	"-Q" "-R" "-T" "-W"
+	"-Q" "-R" "-W"
 
 log_assert "Execute zdb using invalid parameters."

From 41c4599cba75c5bb18018fdec022a907e5f14ffa Mon Sep 17 00:00:00 2001
From: Akash B
Date: Tue, 28 Nov 2023 03:11:58 +0530
Subject: [PATCH 07/16] zdb: Fix zdb '-O|-r' options with -e/exported zpool

zdb with '-e' (an exported zpool) doesn't work with the '-O' and '-r'
options, because we process those options before '-e' has been
processed.
The following errors are seen:

~> zdb -e pool-mds65/mdt65 -O oi.9/0x200000009:0x0:0x0
failed to hold dataset 'pool-mds65/mdt65': No such file or directory

~> zdb -e pool-oss0/ost0 -r file1 /tmp/filecopy1 -p.
failed to hold dataset 'pool-oss0/ost0': No such file or directory
zdb: internal error: No such file or directory

We need to make sure to process '-O|-r' options after the '-e' option
has been processed, which imports the pool to the namespace if it's
not in the cachefile.

Reviewed-by: Brian Behlendorf
Signed-off-by: Akash B
Closes #15532
---
 cmd/zdb/zdb.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 201a443431cc..e773b15091de 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -9164,22 +9164,6 @@ main(int argc, char **argv)
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));
 
-	if (dump_opt['O']) {
-		if (argc != 2)
-			usage();
-		dump_opt['v'] = verbose + 3;
-		return (dump_path(argv[0], argv[1], NULL));
-	}
-	if (dump_opt['r']) {
-		target_is_spa = B_FALSE;
-		if (argc != 3)
-			usage();
-		dump_opt['v'] = verbose;
-		error = dump_path(argv[0], argv[1], &object);
-		if (error != 0)
-			fatal("internal error: %s", strerror(error));
-	}
-
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
@@ -9280,6 +9264,29 @@ main(int argc, char **argv)
 		searchdirs = NULL;
 	}
 
+	/*
+	 * We need to make sure to process -O option or call
+	 * dump_path after the -e option has been processed,
+	 * which imports the pool to the namespace if it's
+	 * not in the cachefile.
+	 */
+	if (dump_opt['O']) {
+		if (argc != 2)
+			usage();
+		dump_opt['v'] = verbose + 3;
+		return (dump_path(argv[0], argv[1], NULL));
+	}
+
+	if (dump_opt['r']) {
+		target_is_spa = B_FALSE;
+		if (argc != 3)
+			usage();
+		dump_opt['v'] = verbose;
+		error = dump_path(argv[0], argv[1], &object);
+		if (error != 0)
+			fatal("internal error: %s", strerror(error));
+	}
+
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa

From 2a953e0ac928563166ad7caa89e48b8d257724d4 Mon Sep 17 00:00:00 2001
From: Rob N
Date: Wed, 29 Nov 2023 04:53:04 +1100
Subject: [PATCH 08/16] dmu_buf_will_clone: fix race in transition back to
 NOFILL

Previously, dmu_buf_will_clone() would roll back any dirty record, but
would not clean out the modified data nor reset the state before
releasing the lock. That leaves the last-written data in db_data, but
the dbuf in the wrong state.

This is eventually corrected when the dbuf state is made NOFILL, and
dbuf_noread() is called (which clears out the old data), but at this
point it's too late, because the lock was already dropped with that
invalid state.

Any caller acquiring the lock before the call into
dmu_buf_will_not_fill() can find what appears to be a clean, readable
buffer, and would take the wrong state from it: it should be getting
the data from the cloned block, not from earlier (unwritten) dirty
data.

Even after the state was switched to NOFILL, the old data was still not
cleaned out until dbuf_noread(), which is another gap for a caller to
take the lock and read the wrong data.

This commit fixes all this by properly cleaning up the previous state
and then setting the new state before dropping the lock. The
DBUF_VERIFY() calls confirm that the dbuf is in a valid state when the
lock is dropped.

Sponsored-by: Klara, Inc.
Sponsored-By: OpenDrives Inc.
Reviewed-by: Brian Behlendorf Reviewed-by: Pawel Jakub Dawidek Signed-off-by: Rob Norris Closes #15566 Closes #15526 --- module/zfs/dbuf.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f2831a0e8abf..5a7fe42b602a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) * writes and clones into this block. */ mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); VERIFY(!dbuf_undirty(db, tx)); ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; + dbuf_clear_data(db); } + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + + DBUF_VERIFY(db); mutex_exit(&db->db_mtx); - dmu_buf_will_not_fill(db_fake, tx); + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void From 349fb77f1167c287e9d2bf8bab5f1934d8b2672c Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Mon, 27 Nov 2023 13:58:03 -0700 Subject: [PATCH 09/16] FreeBSD: Fix the build on FreeBSD 12 It was broken for several reasons: * VOP_UNLOCK lost an argument in 13.0. So OpenZFS should be using VOP_UNLOCK1, but a few direct calls to VOP_UNLOCK snuck in. * The location of the zlib header moved in 13.0 and 12.1. We can drop support for building on 12.0, which is EoL. * knlist_init lost an argument in 13.0. OpenZFS change 9d0887402ba assumed 13.0 or later. * FreeBSD 13.0 added copy_file_range, and OpenZFS change 67a1b037915 assumed 13.0 or later. Sponsored-by: Axcient Reviewed-by: Alexander Motin Signed-off-by: Alan Somers Closes #15551 --- README.md | 2 +- include/os/freebsd/spl/sys/vnode.h | 3 ++- module/os/freebsd/spl/spl_zlib.c | 8 -------- module/os/freebsd/zfs/event_os.c | 18 ++++++++++++++++++ module/os/freebsd/zfs/zfs_vnops_os.c | 4 ++++ 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 331889560950..af244c1fff14 100644 --- a/README.md +++ b/README.md @@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` # Supported Kernels * The `META` file contains the officially recognized supported Linux kernel versions. - * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE. + * Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE. 
diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h
index 0779e58e4953..75c32f221ffd 100644
--- a/include/os/freebsd/spl/sys/vnode.h
+++ b/include/os/freebsd/spl/sys/vnode.h
@@ -56,6 +56,7 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
 #ifndef IN_BASE
 #include_next
 #endif
+#include
 #include
 #include
 #include
@@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
 		zfs_vmobject_wlock(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, flags);
 		zfs_vmobject_wunlock(vp->v_object);
-		VOP_UNLOCK(vp);
+		VOP_UNLOCK1(vp);
 	}
 }
 #endif

diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c
index 8bd3bdedf268..42c6d7f9df6c 100644
--- a/module/os/freebsd/spl/spl_zlib.c
+++ b/module/os/freebsd/spl/spl_zlib.c
@@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#if __FreeBSD_version >= 1300041
 #include
-#else
-#include
-#endif
 #include
@@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream)
 static int
 zlib_inflate(z_stream *stream, int finish)
 {
-#if __FreeBSD_version >= 1300024
 	return (inflate(stream, finish));
-#else
-	return (_zlib104_inflate(stream, finish));
-#endif
 }

diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c
index 239d44d0cfe7..e774fbaaf867 100644
--- a/module/os/freebsd/zfs/event_os.c
+++ b/module/os/freebsd/zfs/event_os.c
@@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg)
 	sx_xunlock((struct sx *)arg);
 }
 
+#if __FreeBSD_version >= 1300128
 static void
 knlist_sx_assert_lock(void *arg, int what)
 {
@@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what)
 	else
 		sx_assert((struct sx *)arg, SX_UNLOCKED);
 }
+#else
+static void
+knlist_sx_assert_locked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_LOCKED);
+}
+
+static void
+knlist_sx_assert_unlocked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_UNLOCKED);
+}
+#endif
 
 void
 knlist_init_sx(struct knlist *knl, struct sx *lock)
 {
+#if __FreeBSD_version >= 1300128
 	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
 	    knlist_sx_assert_lock);
+#else
+	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
+	    knlist_sx_assert_locked, knlist_sx_assert_unlocked);
+#endif
 }

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index f672deed34dd..05f28033be6a 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 }
 #endif
 
+#if __FreeBSD_version >= 1300039
 #ifndef _SYS_SYSPROTO_H_
 struct vop_copy_file_range_args {
 	struct vnode *a_invp;
@@ -6319,6 +6320,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 	return (error);
 }
+#endif
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
@@ -6383,7 +6385,9 @@ struct vop_vector zfs_vnodeops = {
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount = vop_stdadd_writecount_nomsync,
 #endif
+#if __FreeBSD_version >= 1300039
 	.vop_copy_file_range = zfs_freebsd_copy_file_range,
+#endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);

From 3b267e72de6f98fb911af9477d2c48de3c36c6dd Mon Sep 17 00:00:00 2001
From: AllKind
Date: Mon, 27 Nov 2023 22:17:48 +0100
Subject: [PATCH 10/16] zfs-dkms: fix shell-init error message

If all zfs dkms modules have been removed, a shell-init error message
may appear, because /var/lib/dkms/zfs no longer exists.
Resolve this by leaving the directory earlier.
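A rough illustration of the failure mode (paths and version are
illustrative, not taken from the scriptlet): the scriptlet's working
directory sits inside /var/lib/dkms/zfs/2.2.1 while modules are being
removed; once the last module is gone, /var/lib/dkms/zfs is deleted out
from under it, and any shell started there then prints something like
"shell-init: error retrieving current directory: getcwd: cannot access
parent directories". The added `cd ${dkms_root}` steps back up to
/var/lib/dkms before the directory disappears, so no later command runs
from a deleted working directory.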
Reviewed-by: Brian Behlendorf
Signed-off-by: Mart Frauenlob
Closes #15576
---
 rpm/generic/zfs-dkms.spec.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in
index d56967d7a8b1..cd85dd28cf56 100644
--- a/rpm/generic/zfs-dkms.spec.in
+++ b/rpm/generic/zfs-dkms.spec.in
@@ -103,6 +103,7 @@ if [ -d ${dkms_root}/%{module} ]; then
 			fi
 		fi
 	done
+	cd ${dkms_root}
 fi
 
 # Uninstall this version of zfs dkms modules before installation of the package.

From d813aa85309a06310e7dda3be87fd139f8b6035c Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias
Date: Tue, 28 Nov 2023 20:34:40 +0100
Subject: [PATCH 11/16] Linux 6.6 compat: fix configure error with clang
 (#15558)

With Linux v6.6.x and clang 16, a configure step fails on a warning
that later results in an error while building, due to 'ts' being
uninitialized. Add a trivial initialization to silence the warning.

Signed-off-by: Jaron Kent-Dobias
Reviewed-by: Tony Hutter
---
 config/kernel-inode-times.m4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4
index 412e13b47df5..aae95abf1720 100644
--- a/config/kernel-inode-times.m4
+++ b/config/kernel-inode-times.m4
@@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
 		#include
 	],[
 		struct inode ip;
-		struct timespec64 ts;
+		struct timespec64 ts = {0};
 
 		memset(&ip, 0, sizeof(ip));
 		inode_set_ctime_to_ts(&ip, ts);

From eb34de04d78bcc9caf77198dd4491d8e6d52e27d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Matu=C5=A1ka?=
Date: Wed, 29 Nov 2023 18:18:30 +0100
Subject: [PATCH 12/16] zdb: fix printf() length for uint64_t devid

Bug introduced in 213d6829673.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Warner Losh
Signed-off-by: Martin Matuska
Closes #15606
---
 cmd/zdb/zdb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index e773b15091de..3fc9fd2a9d81 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2126,14 +2126,14 @@ dump_brt(spa_t *spa)
 			continue;
 
 		if (!brtvd->bv_initiated) {
-			printf("BRT: vdev %lu: empty\n", vdevid);
+			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
 
 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
-		printf("BRT: vdev %lu: refcnt %s; used %s; saved %s\n",
+		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
 		    vdevid, count, used, saved);
 	}
 
@@ -2156,7 +2156,7 @@ dump_brt(spa_t *spa)
 			uint64_t offset = *(uint64_t *)za.za_name;
 			uint64_t refcnt = za.za_first_integer;
 
-			snprintf(dva, sizeof (dva), "%lu:%llx", vdevid,
+			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
 			    (u_longlong_t)offset);
 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
 		}

From a8c256046b0433861d6c5dafce54d750059b7c1b Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 29 Nov 2023 13:51:34 -0500
Subject: [PATCH 13/16] ZIL: Call brt_pending_add() replaying TX_CLONE_RANGE

zil_claim_clone_range() takes references on cloned blocks before ZIL
replay. Later zil_free_clone_range() drops them after replay or on
dataset destroy. The total balance is neutral. This means that on
actual replay we must take additional references, which will stay in
the BRT. Without this, blocks could be freed prematurely when either
the original file or its clone is destroyed.
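As an illustration, the intended reference ledger for a single cloned
block pointer looks like this (a reading of the scheme described above,
not tool output):

    zil_claim_clone_range()   +1   on pool import (ZIL claim)
    TX_CLONE_RANGE replay     +1   brt_pending_add() via dmu_brt_clone()
    zil_free_clone_range()    -1   on ZIL destroy after replay
    net                       +1   held for the replayed clone

    without replay (dataset destroyed instead):
    zil_claim_clone_range()   +1
    zil_free_clone_range()    -1
    net                        0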
I've observed BRT being emptied and the feature being deactivated after ZIL replay completion, which should not have happened. With the patch I see expected stats. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15603 --- include/sys/dmu.h | 3 +-- module/zfs/brt.c | 14 +++++++------- module/zfs/dmu.c | 6 ++---- module/zfs/zfs_vnops.c | 4 ++-- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 615ba8fe7496..06b4dc27dfea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, - boolean_t replay); + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); /* * Initial setup and final teardown. diff --git a/module/zfs/brt.c b/module/zfs/brt.c index b0529521ec76..759bc8d2e2b8 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -235,13 +235,13 @@ * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE - * entries, we will bump reference counters for their BPs in the BRT and then - * on mount and ZIL replay we will just attach BPs to the file without - * bumping reference counters. - * Note it is still possible that after zil_claim() we never mount the - * destination, so we never replay its ZIL and we destroy it. This way we would - * end up with leaked references in BRT. We address that too as ZFS gives us - * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. */ static kmem_cache_t *brt_entry_cache; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ddb29020b09b..3f626031de52 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2267,7 +2267,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; @@ -2341,10 +2341,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. - * Also, when replaying ZIL we don't want to bump references - * in the BRT as it was already done during ZIL claim. 
 */
-		if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 84e6b10ef37c..3a5fa75df2ea 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1333,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 
 		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
-		    bps, nbps, B_FALSE);
+		    bps, nbps);
 		if (error != 0) {
 			dmu_tx_commit(tx);
 			break;
@@ -1467,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
 	if (zp->z_blksz < blksz)
 		zfs_grow_blocksize(zp, blksz, tx);
 
-	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
 
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

From 522414da3b283dada175161e49ea7f3fe46436ff Mon Sep 17 00:00:00 2001
From: rmacklem <64620010+rmacklem@users.noreply.github.com>
Date: Mon, 27 Nov 2023 16:31:03 -0800
Subject: [PATCH 14/16] FreeBSD: Fix ZFS so that snapshots under .zfs/snapshot
 are NFS visible

Call vfs_exjail_clone() for mounts created under .zfs/snapshot to fill
in the mnt_exjail field for the mount. If this is not done, the
snapshots under .zfs/snapshot will not be accessible over NFS.

This version has the argument name in vfs.h fixed to match that of the
name in spl_vfs.c, although it really does not matter.

External-issue: https://reviews.freebsd.org/D42672
Reviewed-by: Alexander Motin
Signed-off-by: Rick Macklem
Closes #15563
---
 include/os/freebsd/spl/sys/vfs.h   | 2 +-
 module/os/freebsd/spl/spl_vfs.c    | 9 ++++++++-
 module/os/freebsd/zfs/zfs_ctldir.c | 3 ++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h
index 7f163fcfdb1e..f2196da56bc8 100644
--- a/include/os/freebsd/spl/sys/vfs.h
+++ b/include/os/freebsd/spl/sys/vfs.h
@@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
 void vfs_clearmntopt(vfs_t *vfsp, const char *name);
 int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
 int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
-    char *fspath, char *fspec, int fsflags);
+    char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp);
 
 typedef uint64_t vfs_feature_t;

diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c
index a07098afc5b4..3f33547216eb 100644
--- a/module/os/freebsd/spl/spl_vfs.c
+++ b/module/os/freebsd/spl/spl_vfs.c
@@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
 
 int
 mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
-    char *fspec, int fsflags)
+    char *fspec, int fsflags, vfs_t *parent_vfsp)
 {
 	struct vfsconf *vfsp;
 	struct mount *mp;
@@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 	mp->mnt_opt = mp->mnt_optnew;
 	(void) VFS_STATFS(mp, &mp->mnt_stat);
 
+#ifdef VFS_SUPPORTS_EXJAIL_CLONE
+	/*
+	 * Clone the mnt_exjail credentials of the parent, as required.
+	 */
+	vfs_exjail_clone(parent_vfsp, mp);
+#endif
+
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c
index 28445a18b809..a753e91da4fe 100644
--- a/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/module/os/freebsd/zfs/zfs_ctldir.c
@@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
 
-	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0,
+	    dvp->v_vfsp);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*

From 494aaaed89cb9fe9f2da3b6c6f465a4bc9f6a7e1 Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Tue, 28 Nov 2023 14:53:42 -0800
Subject: [PATCH 15/16] Tag zfs-2.2.2

META file and changelog updated.

Signed-off-by: Tony Hutter
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index 5868838a26df..93045ec3abe8 100644
--- a/META
+++ b/META
@@ -1,7 +1,7 @@
 Meta:         1
 Name:         zfs
 Branch:       1.0
-Version:      2.2.1
+Version:      2.2.2
 Release:      1
 Release-Tags: relext
 License:      CDDL

From 443a2f2fed05f2291bf5d41bc48347072db38e7d Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 30 Nov 2023 17:23:27 -0500
Subject: [PATCH 16/16] ZIL: Do not clone blocks from the future

ZIL claim cannot handle block pointers cloned from the future, since
they are not yet allocated at that point. This can happen either if
the block was written just before it was cloned, or if the pool was
frozen or otherwise rewound on import.

Handle it from two sides: prevent cloning of blocks whose physical
birth time is in a not-yet-synced or frozen TXG, and abort ZIL claim
if we still detect such blocks due to a rewind or something else.

While there, assert that any cloned blocks we claim are really
allocated by calling metaslab_check_free().

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
---
 module/zfs/dmu.c | 15 +++++++++++++++
 module/zfs/zil.c | 34 ++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3f626031de52..63464d747428 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2255,6 +2255,21 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 			goto out;
 		}
 
+		/*
+		 * If the block was allocated in a transaction group that is
+		 * not yet synced, we could clone it, but we couldn't write
+		 * this operation into ZIL, or it may be impossible to replay,
+		 * since the block may appear not yet allocated at that point.
+		 */
+		if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+		if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+			error = SET_ERROR(EAGAIN);
+			goto out;
+		}
+
 		bps[i] = *bp;
 	}

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index a11886136994..1ba3c01111c4 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -624,11 +624,12 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 }
 
 static int
-zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t first_txg)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
-	spa_t *spa;
+	spa_t *spa = zilog->zl_spa;
 	uint_t ii;
 
 	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
@@ -641,19 +642,36 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 	 * XXX: Do we need to byteswap lr?
 	 */
-	spa = zilog->zl_spa;
-
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		/*
-		 * When data in embedded into BP there is no need to create
+		 * When data are embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
-		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+			continue;
+
+		/*
+		 * We cannot handle block pointers from the future, since they
+		 * are not yet allocated. It should not normally happen, but
+		 * just in case let's be safe and just stop here now instead
+		 * of corrupting the pool.
+		 */
+		if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
+			return (SET_ERROR(ENOENT));
+
+		/*
+		 * Assert the block is really allocated before we reference it.
+		 */
+		metaslab_check_free(spa, bp);
+	}
+
+	for (ii = 0; ii < lr->lr_nbps; ii++) {
+		bp = &lr->lr_bps[ii];
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
 			brt_pending_add(spa, bp, tx);
-		}
 	}
 
 	return (0);
@@ -668,7 +686,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
 	case TX_WRITE:
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 	case TX_CLONE_RANGE:
-		return (zil_claim_clone_range(zilog, lrc, tx));
+		return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
 	default:
 		return (0);
 	}