From 89fcb8c6f969cbfa237849d9b33610124d361186 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 27 Nov 2023 13:49:20 -0800 Subject: [PATCH 01/16] Revert "Tune zio buffer caches and their alignments" This reverts commit bd7a02c251d8c119937e847d5161b512913667e6 which can trigger an unlikely existing bio alignment issue on Linux. This change is good, but the underlying issue it exposes needs to be resolved before this can be re-applied. Signed-off-by: Brian Behlendorf Issue #15533 --- module/zfs/zio.c | 89 +++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a719e5492323..3b3b40fa73d8 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -158,22 +158,23 @@ zio_init(void) zio_link_cache = kmem_cache_create("zio_link_cache", sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. + */ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { size_t size = (c + 1) << SPA_MINBLOCKSHIFT; - size_t align, cflags, data_cflags; - char name[32]; - - /* - * Create cache for each half-power of 2 size, starting from - * SPA_MINBLOCKSIZE. It should give us memory space efficiency - * of ~7/8, sufficient for transient allocations mostly using - * these caches. - */ size_t p2 = size; + size_t align = 0; + size_t data_cflags, cflags; + + data_cflags = KMC_NODEBUG; + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + while (!ISP2(p2)) p2 &= p2 - 1; - if (!IS_P2ALIGNED(size, p2 / 2)) - continue; #ifndef _KERNEL /* @@ -184,37 +185,47 @@ zio_init(void) */ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) continue; -#endif - - if (IS_P2ALIGNED(size, PAGESIZE)) + /* + * Here's the problem - on 4K native devices in userland on + * Linux using O_DIRECT, buffers must be 4K aligned or I/O + * will fail with EINVAL, causing zdb (and others) to coredump. + * Since userland probably doesn't need optimized buffer caches, + * we just force 4K alignment on everything. + */ + align = 8 * SPA_MINBLOCKSIZE; +#else + if (size < PAGESIZE) { + align = SPA_MINBLOCKSIZE; + } else if (IS_P2ALIGNED(size, p2 >> 2)) { align = PAGESIZE; - else - align = 1 << (highbit64(size ^ (size - 1)) - 1); + } +#endif - cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? - KMC_NODEBUG : 0; - data_cflags = KMC_NODEBUG; - if (cflags == data_cflags) { - /* - * Resulting kmem caches would be identical. - * Save memory by creating only one. - */ - (void) snprintf(name, sizeof (name), - "zio_buf_comb_%lu", (ulong_t)size); - zio_buf_cache[c] = kmem_cache_create(name, size, align, - NULL, NULL, NULL, NULL, NULL, cflags); - zio_data_buf_cache[c] = zio_buf_cache[c]; - continue; + if (align != 0) { + char name[36]; + if (cflags == data_cflags) { + /* + * Resulting kmem caches would be identical. + * Save memory by creating only one. 
+			 */
+			(void) snprintf(name, sizeof (name),
+			    "zio_buf_comb_%lu", (ulong_t)size);
+			zio_buf_cache[c] = kmem_cache_create(name,
+			    size, align, NULL, NULL, NULL, NULL, NULL,
+			    cflags);
+			zio_data_buf_cache[c] = zio_buf_cache[c];
+			continue;
+		}
+		(void) snprintf(name, sizeof (name), "zio_buf_%lu",
+		    (ulong_t)size);
+		zio_buf_cache[c] = kmem_cache_create(name, size,
+		    align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+		(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+		    (ulong_t)size);
+		zio_data_buf_cache[c] = kmem_cache_create(name, size,
+		    align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+	}
-	(void) snprintf(name, sizeof (name), "zio_buf_%lu",
-	    (ulong_t)size);
-	zio_buf_cache[c] = kmem_cache_create(name, size, align,
-	    NULL, NULL, NULL, NULL, NULL, cflags);
-
-	(void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
-	    (ulong_t)size);
-	zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
-	    NULL, NULL, NULL, NULL, NULL, data_cflags);
 	}
 
 	while (--c != 0) {

From 9b9b09f452a469458451c221debfbab944e7f081 Mon Sep 17 00:00:00 2001
From: Rob N
Date: Wed, 29 Nov 2023 04:15:48 +1100
Subject: [PATCH 02/16] dnode_is_dirty: check dnode and its data for dirtiness

Over its history, the dirty dnode test has alternated between checking
for a dnode being on `os_dirty_dnodes` (`dn_dirty_link`) and checking
`dn_dirty_record`.

de198f2d9 Fix lseek(SEEK_DATA/SEEK_HOLE) mmap consistency
2531ce372 Revert "Report holes when there are only metadata changes"
ec4f9b8f3 Report holes when there are only metadata changes
454365bba Fix dirty check in dmu_offset_next()
66aca2473 SEEK_HOLE should not block on txg_wait_synced()

Also illumos/illumos-gate@c543ec060d illumos/illumos-gate@2bcf0248e9

It turns out both are actually required. In the case of appending data
to a newly created file, the dnode proper is dirtied (at least to
change the blocksize) and dirty records are added. Thus, a single
logical operation is represented by two separate dirty indicators, and
they must not be checked separately.

The incorrect dirty check becomes a problem when the first block of a
file is being appended to while another process is calling lseek to
skip holes. There is a small window where the dnode part is undirtied
while there are still dirty records. In this case, `lseek(fd, 0,
SEEK_DATA)` would not know that the file is dirty, and would go to
`dnode_next_offset()`. Since the object has no data blocks yet, it
returns `ESRCH`, indicating no data found, which results in `ENXIO`
being returned to `lseek()`'s caller.

Since coreutils 9.2, `cp` performs sparse copies by default, that is,
it uses `SEEK_DATA` and `SEEK_HOLE` against the source file and
attempts to replicate the holes in the target. When it hits the bug,
its initial search for data fails, and it goes on to call `fallocate()`
to create a hole over the entire destination file.

This has come up more recently as users upgrade their systems, getting
OpenZFS 2.2 as well as a newer coreutils. However, this problem has
been reproduced against 2.1, as well as on FreeBSD 13 and 14.

This change simply updates the dirty check to check both types of
dirty. If there's anything dirty at all, we immediately go to the
"wait for sync" stage. It doesn't really matter after that: both
changes are on disk, so the dirty fields should be correct.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Brian Behlendorf
Reviewed-by: Alexander Motin
Reviewed-by: Rich Ercolani
Signed-off-by: Rob Norris
Closes #15571
Closes #15526
---
 module/zfs/dnode.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 7cf03264dce2..ad9988366ada 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -1764,7 +1764,14 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots)
 }
 
 /*
- * Checks if the dnode contains any uncommitted dirty records.
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (e.g. appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
 */
 boolean_t
 dnode_is_dirty(dnode_t *dn)
@@ -1772,7 +1779,8 @@ dnode_is_dirty(dnode_t *dn)
 	mutex_enter(&dn->dn_mtx);
 
 	for (int i = 0; i < TXG_SIZE; i++) {
-		if (multilist_link_active(&dn->dn_dirty_link[i])) {
+		if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+		    !list_is_empty(&dn->dn_dirty_records[i])) {
 			mutex_exit(&dn->dn_mtx);
 			return (B_TRUE);
 		}

From 56a2a0981ed770ed316769d078c892b1e8fa67a9 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Sat, 18 Nov 2023 20:01:03 -0500
Subject: [PATCH 03/16] ZIL: Do not encrypt block pointers in lr_clone_range_t

In case of a crash, cloned blocks need to be claimed on pool import.
That is only possible if they (lr_bps) and their count (lr_nbps) are
not encrypted but only authenticated, similar to the block pointer in
lr_write_t. A few other fields can be, and still are, encrypted.

This should fix a panic on ZIL claim after a crash when block cloning
is actively used.

Reviewed-by: Richard Yao
Reviewed-by: Tom Caputi
Reviewed-by: Sean Eric Fagan
Reviewed-by: Brian Behlendorf
Reviewed-by: Edmund Nadolski
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #15543 Closes #15513 --- module/os/freebsd/zfs/zio_crypt.c | 13 +++++++++++++ module/os/linux/zfs/zio_crypt.c | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index fdbe13dbb5e9..024a931d7816 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1364,6 +1364,19 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, vec++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); + dst_iovecs[vec].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + vec++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 55554d09ee43..775ab8efbcdf 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1543,6 +1543,21 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, nr_iovecs++; total_len += crypt_len; } + } else if (txtype == TX_CLONE_RANGE) { + const size_t o = offsetof(lr_clone_range_t, lr_nbps); + crypt_len = o - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bps now since they will not be encrypted */ + memcpy(dlrp + o, slrp + o, lr_len - o); + memcpy(aadp, slrp + o, lr_len - o); + aadp += lr_len - o; + aad_len += lr_len - o; + nr_iovecs++; + total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); From 41c4599cba75c5bb18018fdec022a907e5f14ffa Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 27 Nov 2023 13:24:37 -0800 Subject: [PATCH 04/16] ZTS: Fix zfs_load-key failures on F39 The zfs_load-key tests were failing on F39 due to their use of the deprecated ssl.wrap_socket function. This commit updates the test to instead use ssl.SSLContext() as described in https://stackoverflow.com/a/65194957. 
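The replacement pattern, in rough outline (a minimal sketch with
placeholder certificate paths; the test itself substitutes its real
paths via shell variables, and `httpd` stands for the http.server
instance the test already has):

    import ssl

    # ssl.wrap_socket() was deprecated in Python 3.7 and removed in
    # 3.12, the version shipped with F39. Build an explicit server-side
    # context instead and let it wrap the listening socket.
    sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    sslctx.check_hostname = False
    sslctx.load_cert_chain(certfile='cert.pem', keyfile='key.pem')
    httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)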
Reviewed-by: Brian Behlendorf
Signed-off-by: Tony Hutter
Closes #15534
Closes #15550
---
 .../cli_root/zfs_load-key/zfs_load-key_common.kshlib | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
index 4a85999b4ab8..f174eeeeaae9 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
@@ -123,7 +123,10 @@ if not httpd:
 with open('$HTTPS_PORT_FILE', 'w') as portf:
     print(port, file=portf)
 
-httpd.socket = ssl.wrap_socket(httpd.socket, server_side=True, keyfile='/$TESTPOOL/snakeoil.key', certfile='$SSL_CA_CERT_FILE', ssl_version=ssl.PROTOCOL_TLS)
+sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+sslctx.check_hostname = False
+sslctx.load_cert_chain(certfile='$SSL_CA_CERT_FILE', keyfile='/$TESTPOOL/snakeoil.key')
+httpd.socket = sslctx.wrap_socket(httpd.socket, server_side=True)
 
 os.chdir('$STF_SUITE/tests/functional/cli_root/zfs_load-key')

From d702f86eaf5aba13762f436152026ba0befa1a23 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Sat, 18 Nov 2023 21:32:16 +1100
Subject: [PATCH 05/16] brt: lift internal definitions into _impl header

So that zdb (and others!) can get at the BRT on-disk structures.

Reviewed-by: Alexander Motin
Reviewed-by: Kay Pedersen
Reviewed-by: Brian Behlendorf
Signed-off-by: Rob Norris
Closes #15541
---
 include/Makefile.am    |   1 +
 include/sys/brt_impl.h | 199 +++++++++++++++++++++++++++++++++++++++++
 module/zfs/brt.c       | 164 +--------------------------------------
 3 files changed, 201 insertions(+), 163 deletions(-)
 create mode 100644 include/sys/brt_impl.h

diff --git a/include/Makefile.am b/include/Makefile.am
index 569de6dfa781..5f38f6ac6ddb 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -33,6 +33,7 @@ COMMON_H = \
 	sys/bqueue.h \
 	sys/btree.h \
 	sys/brt.h \
+	sys/brt_impl.h \
 	sys/dataset_kstats.h \
 	sys/dbuf.h \
 	sys/ddt.h \

diff --git a/include/sys/brt_impl.h b/include/sys/brt_impl.h
new file mode 100644
index 000000000000..9cc06fbb2c3a
--- /dev/null
+++ b/include/sys/brt_impl.h
@@ -0,0 +1,199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#ifndef _SYS_BRT_IMPL_H
+#define	_SYS_BRT_IMPL_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define	BRT_RANGESIZE	(16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+	"BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain bitmap
+ * of dirty blocks within the regions, so that a single bit represents a
+ * block size of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((1PB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
+ * is represented by a single bit. This gives us 4096 bits. A set bit in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define	BRT_BLOCKSIZE	(32 * 1024)
+#define	BRT_RANGESIZE_TO_NBLOCKS(size)	\
+	(((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define	BRT_LITTLE_ENDIAN	0
+#define	BRT_BIG_ENDIAN		1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define	BRT_NATIVE_BYTEORDER		BRT_LITTLE_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_BIG_ENDIAN
+#else
+#define	BRT_NATIVE_BYTEORDER		BRT_BIG_ENDIAN
+#define	BRT_NON_NATIVE_BYTEORDER	BRT_LITTLE_ENDIAN
+#endif
+
+typedef struct brt_vdev_phys {
+	uint64_t	bvp_mos_entries;
+	uint64_t	bvp_size;
+	uint64_t	bvp_byteorder;
+	uint64_t	bvp_totalcount;
+	uint64_t	bvp_rangesize;
+	uint64_t	bvp_usedspace;
+	uint64_t	bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+	/*
+	 * VDEV id.
+	 */
+	uint64_t	bv_vdevid;
+	/*
+	 * Is the structure initiated?
+	 * (bv_entcount and bv_bitmap are allocated?)
+	 */
+	boolean_t	bv_initiated;
+	/*
+	 * Object number in the MOS for the entcount array and brt_vdev_phys.
+	 */
+	uint64_t	bv_mos_brtvdev;
+	/*
+	 * Object number in the MOS for the entries table.
+	 */
+	uint64_t	bv_mos_entries;
+	/*
+	 * Entries to sync.
+	 */
+	avl_tree_t	bv_tree;
+	/*
+	 * Does the bv_entcount[] array need byte swapping?
+	 */
+	boolean_t	bv_need_byteswap;
+	/*
+	 * Number of entries in the bv_entcount[] array.
+	 */
+	uint64_t	bv_size;
+	/*
+	 * This is the array with BRT entry count per BRT_RANGESIZE.
+	 */
+	uint16_t	*bv_entcount;
+	/*
+	 * Sum of all bv_entcount[]s.
+	 */
+	uint64_t	bv_totalcount;
+	/*
+	 * Space on disk occupied by cloned blocks (without compression).
+	 */
+	uint64_t	bv_usedspace;
+	/*
+	 * How much additional space would be occupied without block cloning.
+	 */
+	uint64_t	bv_savedspace;
+	/*
+	 * brt_vdev_phys needs updating on disk.
+	 */
+	boolean_t	bv_meta_dirty;
+	/*
+	 * bv_entcount[] needs updating on disk.
+	 */
+	boolean_t	bv_entcount_dirty;
+	/*
+	 * bv_entcount[] potentially can be a bit too big to synchronize it all
+	 * when we have changed only a few entcounts.
+	 * The fields below allow us to track updates to the bv_entcount[]
+	 * array since the last sync. A single bit in the bv_bitmap represents
+	 * as many entcounts as can fit into a single BRT_BLOCKSIZE.
+	 * For example, if we have 65536 entcounts in the bv_entcount array
+	 * (so the whole array is 128kB) and we update bv_entcount[2] and
+	 * bv_entcount[5], only the first bit in the bv_bitmap will be set,
+	 * and we will write only the first BRT_BLOCKSIZE out of 128kB.
+	 */
+	ulong_t		*bv_bitmap;
+	uint64_t	bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core brt
+ */
+typedef struct brt {
+	krwlock_t	brt_lock;
+	spa_t		*brt_spa;
+#define	brt_mos		brt_spa->spa_meta_objset
+	uint64_t	brt_rangesize;
+	uint64_t	brt_usedspace;
+	uint64_t	brt_savedspace;
+	avl_tree_t	brt_pending_tree[TXG_SIZE];
+	kmutex_t	brt_pending_lock[TXG_SIZE];
+	/* Sum of all entries across all bv_trees. */
+	uint64_t	brt_nentries;
+	brt_vdev_t	*brt_vdevs;
+	uint64_t	brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define	BRT_KEY_WORDS	(1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+	uint64_t	bre_offset;
+	uint64_t	bre_refcount;
+	avl_node_t	bre_node;
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+	blkptr_t	bpe_bp;
+	int		bpe_count;
+	avl_node_t	bpe_node;
+} brt_pending_entry_t;
+
+#ifdef __cplusplus
}
+#endif
+
+#endif	/* _SYS_BRT_IMPL_H */

diff --git a/module/zfs/brt.c b/module/zfs/brt.c
index ddd8eefe600b..b0529521ec76 100644
--- a/module/zfs/brt.c
+++ b/module/zfs/brt.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -243,169 +244,6 @@
  * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
  */
 
-/*
- * BRT - Block Reference Table.
- */
-#define	BRT_OBJECT_VDEV_PREFIX	"com.fudosecurity:brt:vdev:"
-
-/*
- * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
- * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
- * Each element in this array represents how many BRT entries do we have in this
- * chunk of storage. We always load this entire array into memory and update as
- * needed. By having it in memory we can quickly tell (during zio_free()) if
- * there are any BRT entries that we might need to update.
- *
- * This value cannot be larger than 16MB, at least as long as we support
- * 512 byte block sizes. With 512 byte block size we can have exactly
- * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
- * many for a 16bit counter.
- */
-#define	BRT_RANGESIZE	(16 * 1024 * 1024)
-_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
-	"BRT_RANGESIZE is too large.");
-/*
- * We don't want to update the whole structure every time. Maintain bitmap
- * of dirty blocks within the regions, so that a single bit represents a
- * block size of entcounts. For example if we have a 1PB vdev then all
- * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
- * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
- * the whole 128MB on disk when we have updated only a single entcount.
- * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
- * is represented by a single bit. This gives us 4096 bits. A set bit in the
- * bitmap means that we had a change in at least one of the 16384 entcounts
- * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
- */ -#define BRT_BLOCKSIZE (32 * 1024) -#define BRT_RANGESIZE_TO_NBLOCKS(size) \ - (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1) - -#define BRT_LITTLE_ENDIAN 0 -#define BRT_BIG_ENDIAN 1 -#ifdef _ZFS_LITTLE_ENDIAN -#define BRT_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#else -#define BRT_NATIVE_BYTEORDER BRT_BIG_ENDIAN -#define BRT_NON_NATIVE_BYTEORDER BRT_LITTLE_ENDIAN -#endif - -typedef struct brt_vdev_phys { - uint64_t bvp_mos_entries; - uint64_t bvp_size; - uint64_t bvp_byteorder; - uint64_t bvp_totalcount; - uint64_t bvp_rangesize; - uint64_t bvp_usedspace; - uint64_t bvp_savedspace; -} brt_vdev_phys_t; - -typedef struct brt_vdev { - /* - * VDEV id. - */ - uint64_t bv_vdevid; - /* - * Is the structure initiated? - * (bv_entcount and bv_bitmap are allocated?) - */ - boolean_t bv_initiated; - /* - * Object number in the MOS for the entcount array and brt_vdev_phys. - */ - uint64_t bv_mos_brtvdev; - /* - * Object number in the MOS for the entries table. - */ - uint64_t bv_mos_entries; - /* - * Entries to sync. - */ - avl_tree_t bv_tree; - /* - * Does the bv_entcount[] array needs byte swapping? - */ - boolean_t bv_need_byteswap; - /* - * Number of entries in the bv_entcount[] array. - */ - uint64_t bv_size; - /* - * This is the array with BRT entry count per BRT_RANGESIZE. - */ - uint16_t *bv_entcount; - /* - * Sum of all bv_entcount[]s. - */ - uint64_t bv_totalcount; - /* - * Space on disk occupied by cloned blocks (without compression). - */ - uint64_t bv_usedspace; - /* - * How much additional space would be occupied without block cloning. - */ - uint64_t bv_savedspace; - /* - * brt_vdev_phys needs updating on disk. - */ - boolean_t bv_meta_dirty; - /* - * bv_entcount[] needs updating on disk. - */ - boolean_t bv_entcount_dirty; - /* - * bv_entcount[] potentially can be a bit too big to sychronize it all - * when we just changed few entcounts. The fields below allow us to - * track updates to bv_entcount[] array since the last sync. - * A single bit in the bv_bitmap represents as many entcounts as can - * fit into a single BRT_BLOCKSIZE. - * For example we have 65536 entcounts in the bv_entcount array - * (so the whole array is 128kB). We updated bv_entcount[2] and - * bv_entcount[5]. In that case only first bit in the bv_bitmap will - * be set and we will write only first BRT_BLOCKSIZE out of 128kB. - */ - ulong_t *bv_bitmap; - uint64_t bv_nblocks; -} brt_vdev_t; - -/* - * In-core brt - */ -typedef struct brt { - krwlock_t brt_lock; - spa_t *brt_spa; -#define brt_mos brt_spa->spa_meta_objset - uint64_t brt_rangesize; - uint64_t brt_usedspace; - uint64_t brt_savedspace; - avl_tree_t brt_pending_tree[TXG_SIZE]; - kmutex_t brt_pending_lock[TXG_SIZE]; - /* Sum of all entries across all bv_trees. */ - uint64_t brt_nentries; - brt_vdev_t *brt_vdevs; - uint64_t brt_nvdevs; -} brt_t; - -/* Size of bre_offset / sizeof (uint64_t). */ -#define BRT_KEY_WORDS (1) - -/* - * In-core brt entry. - * On-disk we use bre_offset as the key and bre_refcount as the value. 
- */ -typedef struct brt_entry { - uint64_t bre_offset; - uint64_t bre_refcount; - avl_node_t bre_node; -} brt_entry_t; - -typedef struct brt_pending_entry { - blkptr_t bpe_bp; - int bpe_count; - avl_node_t bpe_node; -} brt_pending_entry_t; - static kmem_cache_t *brt_entry_cache; static kmem_cache_t *brt_pending_entry_cache; From e96675a7b131c0049c7dd2b269ab0344f530999e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 18 Nov 2023 21:33:45 +1100 Subject: [PATCH 06/16] zdb: show BRT statistics and dump its contents Same idea as the dedup stats, but for block cloning. Reviewed-by: Alexander Motin Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15541 --- cmd/zdb/zdb.c | 90 ++++++++++++++++++- man/man8/zdb.8 | 11 ++- .../functional/cli_root/zdb/zdb_args_neg.ksh | 2 +- 3 files changed, 99 insertions(+), 4 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 005bf3f16590..201a443431cc 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -34,6 +34,7 @@ * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2023, Rob Norris */ #include @@ -80,6 +81,7 @@ #include #include #include +#include #include #include @@ -899,6 +901,8 @@ usage(void) "don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); + (void) fprintf(stderr, " -T --brt-stats " + "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " @@ -999,6 +1003,15 @@ zdb_nicenum(uint64_t num, char *buf, size_t buflen) nicenum(num, buf, buflen); } +static void +zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) +{ + if (dump_opt['P']) + (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); + else + zfs_nicebytes(bytes, buf, buflen); +} + static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; @@ -2081,6 +2094,76 @@ dump_all_ddts(spa_t *spa) dump_dedup_ratio(&dds_total); } +static void +dump_brt(spa_t *spa) +{ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: unsupported on this pool\n"); + return; + } + + if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + printf("BRT: empty\n"); + return; + } + + brt_t *brt = spa->spa_brt; + VERIFY(brt); + + char count[32], used[32], saved[32]; + zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); + zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); + uint64_t ratio = brt_get_ratio(spa); + printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, + (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); + + if (dump_opt['T'] < 2) + return; + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL) + continue; + + if (!brtvd->bv_initiated) { + printf("BRT: vdev %lu: empty\n", vdevid); + continue; + } + + zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); + zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); + zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); + printf("BRT: vdev %lu: refcnt %s; used %s; saved %s\n", + vdevid, count, used, saved); + } + + if (dump_opt['T'] < 3) + return; + + char dva[64]; + printf("\n%-16s %-10s\n", "DVA", "REFCNT"); + + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd == NULL || !brtvd->bv_initiated) + 
continue;
+
+		zap_cursor_t zc;
+		zap_attribute_t za;
+		for (zap_cursor_init(&zc, brt->brt_mos, brtvd->bv_mos_entries);
+		    zap_cursor_retrieve(&zc, &za) == 0;
+		    zap_cursor_advance(&zc)) {
+			uint64_t offset = *(uint64_t *)za.za_name;
+			uint64_t refcnt = za.za_first_integer;
+
+			snprintf(dva, sizeof (dva), "%lu:%llx", vdevid,
+			    (u_longlong_t)offset);
+			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
+		}
+		zap_cursor_fini(&zc);
+	}
+}
+
 static void
 dump_dtl_seg(void *arg, uint64_t start, uint64_t size)
 {
@@ -8093,6 +8176,9 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['D'])
 		dump_all_ddts(spa);
 
+	if (dump_opt['T'])
+		dump_brt(spa);
+
 	if (dump_opt['d'] > 2 || dump_opt['m'])
 		dump_metaslabs(spa);
 	if (dump_opt['M'])
@@ -8879,6 +8965,7 @@ main(int argc, char **argv)
 		{"io-stats",		no_argument,		NULL, 's'},
 		{"simulate-dedup",	no_argument,		NULL, 'S'},
 		{"txg",			required_argument,	NULL, 't'},
+		{"brt-stats",		no_argument,		NULL, 'T'},
 		{"uberblock",		no_argument,		NULL, 'u'},
 		{"cachefile",		required_argument,	NULL, 'U'},
 		{"verbose",		no_argument,		NULL, 'v'},
@@ -8892,7 +8979,7 @@ main(int argc, char **argv)
 	};
 
 	while ((c = getopt_long(argc, argv,
-	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
+	    "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ",
 	    long_options, NULL)) != -1) {
 		switch (c) {
 		case 'b':
@@ -8914,6 +9001,7 @@ main(int argc, char **argv)
 		case 'R':
 		case 's':
 		case 'S':
+		case 'T':
 		case 'u':
 		case 'y':
 		case 'Z':

diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index 52c8e452fa7c..d7f66d917ac7 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd June 27, 2023
+.Dd November 18, 2023
 .Dt ZDB 8
 .Os
 .
@@ -23,7 +23,7 @@
 .Nd display ZFS storage pool debugging and consistency information
 .Sh SYNOPSIS
 .Nm
-.Op Fl AbcdDFGhikLMNPsvXYy
+.Op Fl AbcdDFGhikLMNPsTvXYy
 .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns …
 .Op Fl I Ar inflight-I/O-ops
 .Oo Fl o Ar var Ns = Ns Ar value Oc Ns …
@@ -403,6 +403,13 @@ Display operation counts, bandwidth, and error counts of I/O to the pool from
 Simulate the effects of deduplication, constructing a DDT and then display
 that DDT as with
 .Fl DD .
+.It Fl T , -brt-stats
+Display block reference table (BRT) statistics, including the size of unique
+blocks cloned, the space saving as a result of cloning, and the saving ratio.
+.It Fl TT
+Display the per-vdev BRT statistics, including total references.
+.It Fl TTT
+Dump the contents of the block reference tables.
 .It Fl u , -uberblock
 Display the current uberblock.
 .El

diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
index 168e7c18c3a3..688d488ceb62 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
@@ -58,7 +58,7 @@ set -A args "create" "add" "destroy" "import fakepool" \
 	"setvprop" "blah blah" "-%" "--?" "-*" "-=" \
 	"-a" "-f" "-g" "-j" "-n" "-o" "-p" "-p /tmp" \
 	"-t" "-w" "-z" "-E" "-H" "-I" "-J" \
-	"-Q" "-R" "-T" "-W"
+	"-Q" "-R" "-W"
 
 log_assert "Execute zdb using invalid parameters."

From 41c4599cba75c5bb18018fdec022a907e5f14ffa Mon Sep 17 00:00:00 2001
From: Akash B
Date: Tue, 28 Nov 2023 03:11:58 +0530
Subject: [PATCH 07/16] zdb: Fix zdb '-O|-r' options with -e/exported zpool

zdb with '-e' (an exported zpool) doesn't work with the '-O' and '-r'
options, because we process those options before '-e' has been
processed.
The following errors are seen:

~> zdb -e pool-mds65/mdt65 -O oi.9/0x200000009:0x0:0x0
failed to hold dataset 'pool-mds65/mdt65': No such file or directory

~> zdb -e pool-oss0/ost0 -r file1 /tmp/filecopy1 -p.
failed to hold dataset 'pool-oss0/ost0': No such file or directory
zdb: internal error: No such file or directory

We need to make sure to process '-O|-r' options after the '-e' option
has been processed, which imports the pool to the namespace if it's
not in the cachefile.

Reviewed-by: Brian Behlendorf
Signed-off-by: Akash B
Closes #15532
---
 cmd/zdb/zdb.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 201a443431cc..e773b15091de 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -9164,22 +9164,6 @@ main(int argc, char **argv)
 	if (dump_opt['l'])
 		return (dump_label(argv[0]));
 
-	if (dump_opt['O']) {
-		if (argc != 2)
-			usage();
-		dump_opt['v'] = verbose + 3;
-		return (dump_path(argv[0], argv[1], NULL));
-	}
-	if (dump_opt['r']) {
-		target_is_spa = B_FALSE;
-		if (argc != 3)
-			usage();
-		dump_opt['v'] = verbose;
-		error = dump_path(argv[0], argv[1], &object);
-		if (error != 0)
-			fatal("internal error: %s", strerror(error));
-	}
-
 	if (dump_opt['X'] || dump_opt['F'])
 		rewind = ZPOOL_DO_REWIND |
 		    (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0);
@@ -9280,6 +9264,29 @@ main(int argc, char **argv)
 		searchdirs = NULL;
 	}
 
+	/*
+	 * We need to make sure to process -O option or call
+	 * dump_path after the -e option has been processed,
+	 * which imports the pool to the namespace if it's
+	 * not in the cachefile.
+	 */
+	if (dump_opt['O']) {
+		if (argc != 2)
+			usage();
+		dump_opt['v'] = verbose + 3;
+		return (dump_path(argv[0], argv[1], NULL));
+	}
+
+	if (dump_opt['r']) {
+		target_is_spa = B_FALSE;
+		if (argc != 3)
+			usage();
+		dump_opt['v'] = verbose;
+		error = dump_path(argv[0], argv[1], &object);
+		if (error != 0)
+			fatal("internal error: %s", strerror(error));
+	}
+
 	/*
 	 * import_checkpointed_state makes the assumption that the
 	 * target pool that we pass it is already part of the spa

From 2a953e0ac928563166ad7caa89e48b8d257724d4 Mon Sep 17 00:00:00 2001
From: Rob N
Date: Wed, 29 Nov 2023 04:53:04 +1100
Subject: [PATCH 08/16] dmu_buf_will_clone: fix race in transition back to
 NOFILL

Previously, dmu_buf_will_clone() would roll back any dirty record, but
would not clean out the modified data nor reset the state before
releasing the lock. That leaves the last-written data in db_data, but
the dbuf in the wrong state.

This is eventually corrected when the dbuf state is made NOFILL, and
dbuf_noread() is called (which clears out the old data), but at this
point it's too late, because the lock was already dropped with that
invalid state.

Any caller acquiring the lock before the call into
dmu_buf_will_not_fill() can find what appears to be a clean, readable
buffer, and would take the wrong state from it: it should be getting
the data from the cloned block, not from earlier (unwritten) dirty
data.

Even after the state was switched to NOFILL, the old data was still not
cleaned out until dbuf_noread(), which is another gap for a caller to
take the lock and read the wrong data.

This commit fixes all this by properly cleaning up the previous state
and then setting the new state before dropping the lock. The
DBUF_VERIFY() calls confirm that the dbuf is in a valid state when the
lock is dropped.

Sponsored-by: Klara, Inc.
Sponsored-By: OpenDrives Inc.
Reviewed-by: Brian Behlendorf Reviewed-by: Pawel Jakub Dawidek Signed-off-by: Rob Norris Closes #15566 Closes #15526 --- module/zfs/dbuf.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f2831a0e8abf..5a7fe42b602a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2700,15 +2700,23 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) * writes and clones into this block. */ mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); VERIFY(!dbuf_undirty(db, tx)); ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; + dbuf_clear_data(db); } + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + + DBUF_VERIFY(db); mutex_exit(&db->db_mtx); - dmu_buf_will_not_fill(db_fake, tx); + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void From 349fb77f1167c287e9d2bf8bab5f1934d8b2672c Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Mon, 27 Nov 2023 13:58:03 -0700 Subject: [PATCH 09/16] FreeBSD: Fix the build on FreeBSD 12 It was broken for several reasons: * VOP_UNLOCK lost an argument in 13.0. So OpenZFS should be using VOP_UNLOCK1, but a few direct calls to VOP_UNLOCK snuck in. * The location of the zlib header moved in 13.0 and 12.1. We can drop support for building on 12.0, which is EoL. * knlist_init lost an argument in 13.0. OpenZFS change 9d0887402ba assumed 13.0 or later. * FreeBSD 13.0 added copy_file_range, and OpenZFS change 67a1b037915 assumed 13.0 or later. Sponsored-by: Axcient Reviewed-by: Alexander Motin Signed-off-by: Alan Somers Closes #15551 --- README.md | 2 +- include/os/freebsd/spl/sys/vnode.h | 3 ++- module/os/freebsd/spl/spl_zlib.c | 8 -------- module/os/freebsd/zfs/event_os.c | 18 ++++++++++++++++++ module/os/freebsd/zfs/zfs_vnops_os.c | 4 ++++ 5 files changed, 25 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 331889560950..af244c1fff14 100644 --- a/README.md +++ b/README.md @@ -32,4 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` # Supported Kernels * The `META` file contains the officially recognized supported Linux kernel versions. - * Supported FreeBSD versions are any supported branches and releases starting from 12.2-RELEASE. + * Supported FreeBSD versions are any supported branches and releases starting from 12.4-RELEASE. 
diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h
index 0779e58e4953..75c32f221ffd 100644
--- a/include/os/freebsd/spl/sys/vnode.h
+++ b/include/os/freebsd/spl/sys/vnode.h
@@ -56,6 +56,7 @@ enum symfollow { NO_FOLLOW = NOFOLLOW };
 #ifndef IN_BASE
 #include_next
 #endif
+#include
 #include
 #include
 #include
@@ -104,7 +105,7 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync)
 		zfs_vmobject_wlock(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, flags);
 		zfs_vmobject_wunlock(vp->v_object);
-		VOP_UNLOCK(vp);
+		VOP_UNLOCK1(vp);
 	}
 }
 #endif

diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c
index 8bd3bdedf268..42c6d7f9df6c 100644
--- a/module/os/freebsd/spl/spl_zlib.c
+++ b/module/os/freebsd/spl/spl_zlib.c
@@ -32,11 +32,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
-#if __FreeBSD_version >= 1300041
 #include
-#else
-#include
-#endif
 #include
@@ -90,11 +86,7 @@ zlib_inflateInit(z_stream *stream)
 static int
 zlib_inflate(z_stream *stream, int finish)
 {
-#if __FreeBSD_version >= 1300024
 	return (inflate(stream, finish));
-#else
-	return (_zlib104_inflate(stream, finish));
-#endif
 }

diff --git a/module/os/freebsd/zfs/event_os.c b/module/os/freebsd/zfs/event_os.c
index 239d44d0cfe7..e774fbaaf867 100644
--- a/module/os/freebsd/zfs/event_os.c
+++ b/module/os/freebsd/zfs/event_os.c
@@ -46,6 +46,7 @@ knlist_sx_xunlock(void *arg)
 	sx_xunlock((struct sx *)arg);
 }
 
+#if __FreeBSD_version >= 1300128
 static void
 knlist_sx_assert_lock(void *arg, int what)
 {
@@ -55,11 +56,28 @@ knlist_sx_assert_lock(void *arg, int what)
 	else
 		sx_assert((struct sx *)arg, SX_UNLOCKED);
 }
+#else
+static void
+knlist_sx_assert_locked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_LOCKED);
+}
+
+static void
+knlist_sx_assert_unlocked(void *arg)
+{
+	sx_assert((struct sx *)arg, SX_UNLOCKED);
+}
+#endif
 
 void
 knlist_init_sx(struct knlist *knl, struct sx *lock)
 {
+#if __FreeBSD_version >= 1300128
 	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
 	    knlist_sx_assert_lock);
+#else
+	knlist_init(knl, lock, knlist_sx_xlock, knlist_sx_xunlock,
+	    knlist_sx_assert_locked, knlist_sx_assert_unlocked);
+#endif
 }

diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index f672deed34dd..05f28033be6a 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -6213,6 +6213,7 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 }
 #endif
 
+#if __FreeBSD_version >= 1300039
 #ifndef _SYS_SYSPROTO_H_
 struct vop_copy_file_range_args {
 	struct vnode *a_invp;
@@ -6319,6 +6320,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 	    ap->a_incred, ap->a_outcred, ap->a_fsizetd);
 	return (error);
 }
+#endif
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
@@ -6383,7 +6385,9 @@ struct vop_vector zfs_vnodeops = {
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount = vop_stdadd_writecount_nomsync,
 #endif
+#if __FreeBSD_version >= 1300039
 	.vop_copy_file_range = zfs_freebsd_copy_file_range,
+#endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);

From 3b267e72de6f98fb911af9477d2c48de3c36c6dd Mon Sep 17 00:00:00 2001
From: AllKind
Date: Mon, 27 Nov 2023 22:17:48 +0100
Subject: [PATCH 10/16] zfs-dkms: fix shell-init error message

If all zfs dkms modules have been removed, a shell-init error message
may appear, because /var/lib/dkms/zfs no longer exists.
Resolve this by leaving the directory earlier.
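A rough illustration of the failure mode (paths and version are
illustrative, not taken from the scriptlet): the scriptlet's working
directory sits inside /var/lib/dkms/zfs/2.2.1 while modules are being
removed; once the last module is gone, /var/lib/dkms/zfs is deleted out
from under it, and any shell started there then prints something like
"shell-init: error retrieving current directory: getcwd: cannot access
parent directories". The added `cd ${dkms_root}` steps back up to
/var/lib/dkms before the directory disappears, so no later command runs
from a deleted working directory.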
Reviewed-by: Brian Behlendorf
Signed-off-by: Mart Frauenlob
Closes #15576
---
 rpm/generic/zfs-dkms.spec.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in
index d56967d7a8b1..cd85dd28cf56 100644
--- a/rpm/generic/zfs-dkms.spec.in
+++ b/rpm/generic/zfs-dkms.spec.in
@@ -103,6 +103,7 @@ if [ -d ${dkms_root}/%{module} ]; then
 			fi
 		fi
 	done
+	cd ${dkms_root}
 fi
 
 # Uninstall this version of zfs dkms modules before installation of the package.

From d813aa85309a06310e7dda3be87fd139f8b6035c Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias
Date: Tue, 28 Nov 2023 20:34:40 +0100
Subject: [PATCH 11/16] Linux 6.6 compat: fix configure error with clang
 (#15558)

With Linux v6.6.x and clang 16, a configure step fails on a warning
that later results in an error while building, due to 'ts' being
uninitialized. Add a trivial initialization to silence the warning.

Signed-off-by: Jaron Kent-Dobias
Reviewed-by: Tony Hutter
---
 config/kernel-inode-times.m4 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4
index 412e13b47df5..aae95abf1720 100644
--- a/config/kernel-inode-times.m4
+++ b/config/kernel-inode-times.m4
@@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
 		#include
 	],[
 		struct inode ip;
-		struct timespec64 ts;
+		struct timespec64 ts = {0};
 
 		memset(&ip, 0, sizeof(ip));
 		inode_set_ctime_to_ts(&ip, ts);

From eb34de04d78bcc9caf77198dd4491d8e6d52e27d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Matu=C5=A1ka?=
Date: Wed, 29 Nov 2023 18:18:30 +0100
Subject: [PATCH 12/16] zdb: fix printf() length for uint64_t devid

Bug introduced in 213d6829673.

Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Warner Losh
Signed-off-by: Martin Matuska
Closes #15606
---
 cmd/zdb/zdb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index e773b15091de..3fc9fd2a9d81 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -2126,14 +2126,14 @@ dump_brt(spa_t *spa)
 			continue;
 
 		if (!brtvd->bv_initiated) {
-			printf("BRT: vdev %lu: empty\n", vdevid);
+			printf("BRT: vdev %" PRIu64 ": empty\n", vdevid);
 			continue;
 		}
 
 		zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count));
 		zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used));
 		zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved));
-		printf("BRT: vdev %lu: refcnt %s; used %s; saved %s\n",
+		printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n",
 		    vdevid, count, used, saved);
 	}
 
@@ -2156,7 +2156,7 @@ dump_brt(spa_t *spa)
 			uint64_t offset = *(uint64_t *)za.za_name;
 			uint64_t refcnt = za.za_first_integer;
 
-			snprintf(dva, sizeof (dva), "%lu:%llx", vdevid,
+			snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid,
 			    (u_longlong_t)offset);
 			printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt);
 		}

From a8c256046b0433861d6c5dafce54d750059b7c1b Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 29 Nov 2023 13:51:34 -0500
Subject: [PATCH 13/16] ZIL: Call brt_pending_add() replaying TX_CLONE_RANGE

zil_claim_clone_range() takes references on cloned blocks before ZIL
replay. Later zil_free_clone_range() drops them after replay or on
dataset destroy. The total balance is neutral. This means that on
actual replay we must take additional references, which will stay in
the BRT. Without this, blocks could be freed prematurely when either
the original file or its clone is destroyed.
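As an illustration, the intended reference ledger for a single cloned
block pointer looks like this (a reading of the scheme described above,
not tool output):

    zil_claim_clone_range()   +1   on pool import (ZIL claim)
    TX_CLONE_RANGE replay     +1   brt_pending_add() via dmu_brt_clone()
    zil_free_clone_range()    -1   on ZIL destroy after replay
    net                       +1   held for the replayed clone

    without replay (dataset destroyed instead):
    zil_claim_clone_range()   +1
    zil_free_clone_range()    -1
    net                        0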
I've observed BRT being emptied and the feature being deactivated after ZIL replay completion, which should not have happened. With the patch I see expected stats. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15603 --- include/sys/dmu.h | 3 +-- module/zfs/brt.c | 14 +++++++------- module/zfs/dmu.c | 6 ++---- module/zfs/zfs_vnops.c | 4 ++-- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 615ba8fe7496..06b4dc27dfea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1072,8 +1072,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, - boolean_t replay); + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); /* * Initial setup and final teardown. diff --git a/module/zfs/brt.c b/module/zfs/brt.c index b0529521ec76..759bc8d2e2b8 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -235,13 +235,13 @@ * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE - * entries, we will bump reference counters for their BPs in the BRT and then - * on mount and ZIL replay we will just attach BPs to the file without - * bumping reference counters. - * Note it is still possible that after zil_claim() we never mount the - * destination, so we never replay its ZIL and we destroy it. This way we would - * end up with leaked references in BRT. We address that too as ZFS gives us - * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. */ static kmem_cache_t *brt_entry_cache; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ddb29020b09b..3f626031de52 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2267,7 +2267,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; @@ -2341,10 +2341,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. - * Also, when replaying ZIL we don't want to bump references - * in the BRT as it was already done during ZIL claim. 
 */
-		if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}

diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 84e6b10ef37c..3a5fa75df2ea 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -1333,7 +1333,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
 		}
 
 		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
-		    bps, nbps, B_FALSE);
+		    bps, nbps);
 		if (error != 0) {
 			dmu_tx_commit(tx);
 			break;
@@ -1467,7 +1467,7 @@ zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
 	if (zp->z_blksz < blksz)
 		zfs_grow_blocksize(zp, blksz, tx);
 
-	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
 
 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

From 522414da3b283dada175161e49ea7f3fe46436ff Mon Sep 17 00:00:00 2001
From: rmacklem <64620010+rmacklem@users.noreply.github.com>
Date: Mon, 27 Nov 2023 16:31:03 -0800
Subject: [PATCH 14/16] FreeBSD: Fix ZFS so that snapshots under .zfs/snapshot
 are NFS visible

Call vfs_exjail_clone() for mounts created under .zfs/snapshot to fill
in the mnt_exjail field for the mount. If this is not done, the
snapshots under .zfs/snapshot will not be accessible over NFS.

This version has the argument name in vfs.h fixed to match that of the
name in spl_vfs.c, although it really does not matter.

External-issue: https://reviews.freebsd.org/D42672
Reviewed-by: Alexander Motin
Signed-off-by: Rick Macklem
Closes #15563
---
 include/os/freebsd/spl/sys/vfs.h   | 2 +-
 module/os/freebsd/spl/spl_vfs.c    | 9 ++++++++-
 module/os/freebsd/zfs/zfs_ctldir.c | 3 ++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h
index 7f163fcfdb1e..f2196da56bc8 100644
--- a/include/os/freebsd/spl/sys/vfs.h
+++ b/include/os/freebsd/spl/sys/vfs.h
@@ -101,7 +101,7 @@ void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
 void vfs_clearmntopt(vfs_t *vfsp, const char *name);
 int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp);
 int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype,
-    char *fspath, char *fspec, int fsflags);
+    char *fspath, char *fspec, int fsflags, vfs_t *parent_vfsp);
 
 typedef uint64_t vfs_feature_t;

diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c
index a07098afc5b4..3f33547216eb 100644
--- a/module/os/freebsd/spl/spl_vfs.c
+++ b/module/os/freebsd/spl/spl_vfs.c
@@ -120,7 +120,7 @@ vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
 
 int
 mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
-    char *fspec, int fsflags)
+    char *fspec, int fsflags, vfs_t *parent_vfsp)
 {
 	struct vfsconf *vfsp;
 	struct mount *mp;
@@ -220,6 +220,13 @@ mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
 	mp->mnt_opt = mp->mnt_optnew;
 	(void) VFS_STATFS(mp, &mp->mnt_stat);
 
+#ifdef VFS_SUPPORTS_EXJAIL_CLONE
+	/*
+	 * Clone the mnt_exjail credentials of the parent, as required.
+	 */
+	vfs_exjail_clone(parent_vfsp, mp);
+#endif
+
 	/*
 	 * Prevent external consumers of mount options from reading
 	 * mnt_optnew.
diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c
index 28445a18b809..a753e91da4fe 100644
--- a/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/module/os/freebsd/zfs/zfs_ctldir.c
@@ -1026,7 +1026,8 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
 	    "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
 	    dvp->v_vfsp->mnt_stat.f_mntonname, name);
 
-	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+	err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0,
+	    dvp->v_vfsp);
 	kmem_free(mountpoint, mountpoint_len);
 	if (err == 0) {
 		/*

From 494aaaed89cb9fe9f2da3b6c6f465a4bc9f6a7e1 Mon Sep 17 00:00:00 2001
From: Tony Hutter
Date: Tue, 28 Nov 2023 14:53:42 -0800
Subject: [PATCH 15/16] Tag zfs-2.2.2

META file and changelog updated.

Signed-off-by: Tony Hutter
---
 META | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/META b/META
index 5868838a26df..93045ec3abe8 100644
--- a/META
+++ b/META
@@ -1,7 +1,7 @@
 Meta:         1
 Name:         zfs
 Branch:       1.0
-Version:      2.2.1
+Version:      2.2.2
 Release:      1
 Release-Tags: relext
 License:      CDDL

From 443a2f2fed05f2291bf5d41bc48347072db38e7d Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 30 Nov 2023 17:23:27 -0500
Subject: [PATCH 16/16] ZIL: Do not clone blocks from the future

ZIL claim cannot handle block pointers cloned from the future, since
they are not yet allocated at that point. This can happen either if
the block was written just before it was cloned, or if the pool was
frozen or otherwise rewound on import.

Handle it from two sides: prevent cloning of blocks whose physical
birth time is in a not-yet-synced or frozen TXG, and abort ZIL claim
if we still detect such blocks due to a rewind or something else.

While there, assert that any cloned blocks we claim are really
allocated by calling metaslab_check_free().

Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
---
 module/zfs/dmu.c | 15 +++++++++++++++
 module/zfs/zil.c | 34 ++++++++++++++++++++++++++--------
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3f626031de52..63464d747428 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -2255,6 +2255,21 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
 			goto out;
 		}
 
+		/*
+		 * If the block was allocated in a transaction group that is
+		 * not yet synced, we could clone it, but we couldn't write
+		 * this operation into ZIL, or it may be impossible to replay,
+		 * since the block may appear not yet allocated at that point.
+		 */
+		if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+		if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
+			error = SET_ERROR(EAGAIN);
+			goto out;
+		}
+
 		bps[i] = *bp;
 	}

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index a11886136994..1ba3c01111c4 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -624,11 +624,12 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 }
 
 static int
-zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t first_txg)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
-	spa_t *spa;
+	spa_t *spa = zilog->zl_spa;
 	uint_t ii;
 
 	ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
@@ -641,19 +642,36 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 	 * XXX: Do we need to byteswap lr?
 	 */
-	spa = zilog->zl_spa;
-
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		/*
-		 * When data in embedded into BP there is no need to create
+		 * When data are embedded into BP there is no need to create
 		 * BRT entry as there is no data block. Just copy the BP as
 		 * it contains the data.
 		 */
-		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+			continue;
+
+		/*
+		 * We cannot handle block pointers from the future, since they
+		 * are not yet allocated. It should not normally happen, but
+		 * just in case let's be safe and just stop here now instead
+		 * of corrupting the pool.
+		 */
+		if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
+			return (SET_ERROR(ENOENT));
+
+		/*
+		 * Assert the block is really allocated before we reference it.
+		 */
+		metaslab_check_free(spa, bp);
+	}
+
+	for (ii = 0; ii < lr->lr_nbps; ii++) {
+		bp = &lr->lr_bps[ii];
+		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
 			brt_pending_add(spa, bp, tx);
-		}
 	}
 
 	return (0);
@@ -668,7 +686,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
 	case TX_WRITE:
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 	case TX_CLONE_RANGE:
-		return (zil_claim_clone_range(zilog, lrc, tx));
+		return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
 	default:
 		return (0);
 	}