Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync with upstream zfs-2.2-release branch #158

Merged
merged 21 commits into from
Aug 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
12f2b1f
zdb: include cloned blocks in block statistics
robn Aug 1, 2023
c47f0f4
linux/copy_file_range: properly request a fallback copy on Linux <5.3
robn Aug 1, 2023
b5e2456
Check the return value in clonefile test
oromenahar Aug 1, 2023
b3c1807
readmmap.c: fix building with MUSL libc
zdykstra Aug 1, 2023
bd1eab1
linux: zfs: ctldir: set [amc]time to snapshot's creation property
nabijaczleweli Aug 1, 2023
0ae7bfc
zpool_vdev_remove() should handle EALREADY error return
sdimitro Aug 1, 2023
5fe9a4e
Fix some typos
0mp Aug 7, 2023
39bf443
linux/spl/kmem_cache: undefine `kmem_cache_alloc` before defining it
RaitoBezarius Aug 7, 2023
c2995a8
libzfs: sendrecv: send_progress_thread: handle SIGINFO/SIGUSR1
nabijaczleweli Aug 8, 2023
dd577a7
zfs_clone_range should return a descriptive error codes
oromenahar Aug 8, 2023
72305b2
dracut: support mountpoint=legacy for root dataset
rkitover Aug 8, 2023
b8a85c8
Move zinject from openzfs-zfs-test to openzfs-zfsutils
usaleem-ix Aug 8, 2023
46e77ff
copy_file_range: fix fallback when source create on same txg
robn Aug 15, 2023
06924e6
zed: Add zedlet to power off slot when drive is faulted
tonyhutter Aug 24, 2023
02ce903
Avoid waiting in dmu_sync_late_arrival().
amotin Jul 27, 2023
ffaedf0
Remove fastwrite mechanism.
amotin Jul 28, 2023
c1801cb
ZIL: Avoid dbuf_read() before dmu_sync().
amotin Aug 11, 2023
bb31ded
ZIL: Replay blocks without next block pointer.
amotin Aug 11, 2023
df8c9f3
ZIL: Second attempt to reduce scope of zl_issuer_lock.
amotin Aug 25, 2023
eb35eed
Merge branch 'ozfs/zfs-2.2-release'
ixhamza Aug 25, 2023
4b09b19
Merge branch 'behelndorf/zfs-2.2-backports'
ixhamza Aug 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 76 additions & 1 deletion cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>

Expand Down Expand Up @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = {
#define ZB_TOTAL DN_MAX_LEVELS
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)

typedef struct zdb_brt_entry {
dva_t zbre_dva;
uint64_t zbre_refcount;
avl_node_t zbre_node;
} zdb_brt_entry_t;

typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_clone_asize;
uint64_t zcb_clone_blocks;
uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
Expand All @@ -5368,6 +5377,8 @@ typedef struct zdb_cb {
int zcb_haderrors;
spa_t *zcb_spa;
uint32_t **zcb_vd_obsolete_counts;
avl_tree_t zcb_brt;
boolean_t zcb_brt_is_active;
} zdb_cb_t;

/* test if two DVA offsets from same vdev are within the same metaslab */
Expand Down Expand Up @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);

if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if its there, we note it and its refcount then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;

zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;

zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}

uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}

if (dump_opt['L'])
return;

Expand Down Expand Up @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa)
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}

static int
zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
{
const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
int cmp;

cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
if (cmp == 0)
cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));

return (cmp);
}

static int
dump_block_stats(spa_t *spa)
{
Expand All @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa)

zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);

if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
sizeof (zdb_brt_entry_t),
offsetof(zdb_brt_entry_t, zbre_node));
zcb->zcb_brt_is_active = B_TRUE;
}

(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
(dump_opt['c'] == 1) ? "metadata " : "",
Expand Down Expand Up @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa)
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
total_found =
tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
zcb->zcb_removing_size + zcb->zcb_checkpoint_size;

if (total_found == total_alloc && !dump_opt['L']) {
Expand Down Expand Up @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa)
"bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
(u_longlong_t)zcb->zcb_dedup_blocks,
(double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
(void) printf("\t%-16s %14llu count: %6llu\n",
"bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
(u_longlong_t)zcb->zcb_clone_blocks);
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);

Expand Down
61 changes: 61 additions & 0 deletions cmd/zed/zed.d/statechange-slot_off.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/bin/sh
#
# Turn off disk's enclosure slot if it becomes FAULTED.
#
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
# as they flip between FAULTED and ONLINE. If
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# FAULTED, then power down the slot via sysfs:
#
# /sys/class/enclosure/<enclosure>/<slot>/power_status
#
# We assume the user will be responsible for turning the slot back on again.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver. The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Exit codes:
# 0: slot successfully powered off
# 1: enclosure not available
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
# 3: vdev was not FAULTED
# 4: The enclosure sysfs path passed from ZFS does not exist
# 5: Enclosure slot didn't actually turn off after we told it to

[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

if [ ! -d /sys/class/enclosure ] ; then
# No JBOD enclosure or NVMe slots
exit 1
fi

if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
exit 2
fi

if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then
exit 3
fi

if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
exit 4
fi

echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"

# Wait for sysfs for report that the slot is off. It can take ~400ms on some
# enclosures.
for i in $(seq 1 20) ; do
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
break
fi
sleep 0.1
done

if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then
exit 5
fi

zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"
5 changes: 5 additions & 0 deletions cmd/zed/zed.d/zed.rc
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
# Disabled by default, 1 to enable and 0 to disable.
#ZED_SYSLOG_DISPLAY_GUIDS=1

##
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
# help silence misbehaving drives. This assumes your drive enclosure fully
# supports slot power control via sysfs.
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
2 changes: 1 addition & 1 deletion cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;

ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);

ztest_object_lock(zd, object, RL_READER);
Expand Down Expand Up @@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
DMU_READ_NO_PREFETCH);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
size = doi.doi_data_block_size;
if (ISP2(size)) {
offset = P2ALIGN(offset, size);
Expand Down
11 changes: 6 additions & 5 deletions contrib/dracut/90zfs/zfs-env-bootfs.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@ ExecStart=/bin/sh -c '
decode_root_args || exit 0; \
[ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \
rootflags="$(getarg rootflags=)"; \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
[ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"'

[Install]
Expand Down
8 changes: 8 additions & 0 deletions include/os/linux/spl/sys/kmem_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
/*
* This is necessary to be compatible with other kernel modules
* or in-tree filesystem that may define kmem_cache_alloc,
* like bcachefs does it now.
*/
#ifdef kmem_cache_alloc
#undef kmem_cache_alloc
#endif
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)
Expand Down
1 change: 1 addition & 0 deletions include/sys/brt.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ extern "C" {
#endif

extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp);

extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);
Expand Down
4 changes: 4 additions & 0 deletions include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.
Expand Down
2 changes: 0 additions & 2 deletions include/sys/dmu_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,6 @@ typedef struct dmu_sendstatus {

void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
const void *, dmu_buf_t **);

#ifdef __cplusplus
}
Expand Down
3 changes: 0 additions & 3 deletions include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
#define METASLAB_MUST_RESERVE 0x20
#define METASLAB_FASTWRITE 0x40
#define METASLAB_ZIL 0x80

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
Expand All @@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);

void metaslab_stat_init(void);
void metaslab_stat_fini(void);
Expand Down
6 changes: 3 additions & 3 deletions include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ struct metaslab_group {
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
* allocated, the allocated segment are removed from the ms_allocatable and
* allocated, the allocated segments are removed from the ms_allocatable and
* added to a per txg allocation tree (ms_allocating). As blocks are
* freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
Expand Down Expand Up @@ -366,9 +366,9 @@ struct metaslab_group {
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* coordinate our allocations and frees [e.g., metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
* procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram
Expand Down
1 change: 0 additions & 1 deletion include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,6 @@ struct vdev {
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
Expand Down
Loading
Loading