diff --git a/cmd/mount_zfs.c b/cmd/mount_zfs.c index fc9220950647..283074daf717 100644 --- a/cmd/mount_zfs.c +++ b/cmd/mount_zfs.c @@ -269,8 +269,7 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - if (!zfsutil || sloppy || - libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + if (sloppy || libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); } @@ -337,7 +336,7 @@ main(int argc, char **argv) dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); if (!fake) { - if (zfsutil && !sloppy && + if (!remount && !sloppy && !libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { error = zfs_mount_at(zhp, mntopts, mntflags, mntpoint); if (error) { diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 41c2b6765585..8e3b6972ae04 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2045,7 +2045,7 @@ dump_all_ddts(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; @@ -2072,6 +2072,32 @@ dump_all_ddts(spa_t *spa) } dump_dedup_ratio(&dds_total); + + /* + * Dump a histogram of unique class entry age + */ + if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { + ddt_age_histo_t histogram; + + (void) printf("DDT walk unique, building age histogram...\n"); + ddt_prune_walk(spa, 0, &histogram); + + /* + * print out histogram for unique entry class birth + */ + if (histogram.dah_entries > 0) { + (void) printf("%5s %9s %4s\n", + "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", + "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9d %4d%%\n", 1 << i, + (int)histogram.dah_age_histo[i], + (int)((histogram.dah_age_histo[i] * 100) / + histogram.dah_entries)); + } + } + } } static void @@ -5749,12 +5775,17 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, 
const blkptr_t *bp, ddt_entry_t *dde = ddt_lookup(ddt, bp); /* - * ddt_lookup() can only return NULL if this block didn't exist + * ddt_lookup() can return NULL if this block didn't exist * in the DDT and creating it would take the DDT over its * quota. Since we got the block from disk, it must exist in - * the DDT, so this can't happen. + * the DDT, so this can't happen. However, when unique entries + * are pruned, the dedup bit can be set with no corresponding + * entry in the DDT. */ - VERIFY3P(dde, !=, NULL); + if (dde == NULL) { + ddt_exit(ddt); + goto skipped; + } /* Get the phys for this variant */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); @@ -5774,8 +5805,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ - VERIFY3U(ddt_phys_total_refcnt(ddt, dde->dde_phys), >, 0); - ddt_phys_decref(dde->dde_phys, v); + if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) + ddt_phys_decref(dde->dde_phys, v); /* * If this entry has a single flat phys, it may have been @@ -5864,6 +5895,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, } } +skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? 
type : ZDB_OT_TOTAL; @@ -8138,7 +8170,7 @@ dump_mos_leaks(spa_t *spa) for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - if (!ddt) + if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; /* DDT store objects */ @@ -8150,11 +8182,14 @@ dump_mos_leaks(spa_t *spa) } /* FDT container */ - mos_obj_refd(ddt->ddt_dir_object); + if (ddt->ddt_version == DDT_VERSION_FDT) + mos_obj_refd(ddt->ddt_dir_object); /* FDT log objects */ - mos_obj_refd(ddt->ddt_log[0].ddl_object); - mos_obj_refd(ddt->ddt_log[1].ddl_object); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + mos_obj_refd(ddt->ddt_log[0].ddl_object); + mos_obj_refd(ddt->ddt_log[1].ddl_object); + } } if (spa->spa_brt != NULL) { diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9cd26a8650ad..349c208c521b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -75,6 +75,7 @@ #include "zpool_util.h" #include "zfs_comutil.h" #include "zfeature_common.h" +#include "zfs_valstr.h" #include "statcommon.h" @@ -130,6 +131,8 @@ static int zpool_do_version(int, char **); static int zpool_do_wait(int, char **); +static int zpool_do_ddt_prune(int, char **); + static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( @@ -170,6 +173,7 @@ typedef enum { HELP_CLEAR, HELP_CREATE, HELP_CHECKPOINT, + HELP_DDT_PRUNE, HELP_DESTROY, HELP_DETACH, HELP_EXPORT, @@ -426,6 +430,8 @@ static zpool_command_t command_table[] = { { "sync", zpool_do_sync, HELP_SYNC }, { NULL }, { "wait", zpool_do_wait, HELP_WAIT }, + { NULL }, + { "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE }, }; #define NCOMMAND (ARRAY_SIZE(command_table)) @@ -545,6 +551,8 @@ get_usage(zpool_help_t idx) case HELP_WAIT: return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " " [interval]\n")); + case HELP_DDT_PRUNE: + return (gettext("\tddtprune -d|-p \n")); default: __builtin_unreachable(); } @@ -11929,6 +11937,7 @@ static void zpool_do_events_nvprint(nvlist_t *nvl, int depth) 
{ nvpair_t *nvp; + static char flagstr[256]; for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { @@ -11988,7 +11997,21 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) case DATA_TYPE_UINT32: (void) nvpair_value_uint32(nvp, &i32); - printf(gettext("0x%x"), i32); + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE) == 0) { + zfs_valstr_zio_stage(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY) == 0) { + zfs_valstr_zio_priority(i32, flagstr, + sizeof (flagstr)); + printf(gettext("0x%x [%s]"), i32, flagstr); + } else { + printf(gettext("0x%x"), i32); + } break; case DATA_TYPE_INT64: @@ -12009,6 +12032,12 @@ zpool_do_events_nvprint(nvlist_t *nvl, int depth) printf(gettext("\"%s\" (0x%llx)"), zpool_state_to_name(i64, VDEV_AUX_NONE), (u_longlong_t)i64); + } else if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS) == 0) { + zfs_valstr_zio_flag(i64, flagstr, + sizeof (flagstr)); + printf(gettext("0x%llx [%s]"), + (u_longlong_t)i64, flagstr); } else { printf(gettext("0x%llx"), (u_longlong_t)i64); } @@ -13342,6 +13371,88 @@ found:; return (error); } +/* + * zpool ddtprune -d|-p + * + * -d Prune entries old and older + * -p Prune amount of entries + * + * Prune single reference entries from DDT to satisfy the amount specified. 
+ */ +int +zpool_do_ddt_prune(int argc, char **argv) +{ + zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE; + uint64_t amount = 0; + zpool_handle_t *zhp; + char *endptr; + int c; + + while ((c = getopt(argc, argv, "d:p:")) != -1) { + switch (c) { + case 'd': + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with -p option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || amount == 0) { + (void) fprintf(stderr, + gettext("invalid days value\n")); + usage(B_FALSE); + } + amount *= 86400; /* convert days to seconds */ + unit = ZPOOL_DDT_PRUNE_AGE; + break; + case 'p': + if (unit == ZPOOL_DDT_PRUNE_AGE) { + (void) fprintf(stderr, gettext("-p cannot be " + "combined with -d option\n")); + usage(B_FALSE); + } + errno = 0; + amount = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0' || + amount == 0 || amount > 100) { + (void) fprintf(stderr, + gettext("invalid percentage value\n")); + usage(B_FALSE); + } + unit = ZPOOL_DDT_PRUNE_PERCENTAGE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (unit == ZPOOL_DDT_PRUNE_NONE) { + (void) fprintf(stderr, + gettext("missing amount option (-d|-p )\n")); + usage(B_FALSE); + } else if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } else if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + zhp = zpool_open(g_zfs, argv[0]); + if (zhp == NULL) + return (-1); + + int error = zpool_ddt_prune(zhp, unit, amount); + + zpool_close(zhp); + + return (error); +} + static int find_command_idx(const char *command, int *idx) { diff --git a/cmd/ztest.c b/cmd/ztest.c index 7c9db84d4ea4..ce031632e758 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -276,6 +276,8 @@ extern unsigned long 
zio_decompress_fail_fraction; extern unsigned long zfs_reconstruct_indirect_damage_fraction; extern uint64_t raidz_expand_max_reflow_bytes; extern uint_t raidz_expand_pause_point; +extern boolean_t ddt_prune_artificial_age; +extern boolean_t ddt_dump_prune_histogram; static ztest_shared_opts_t *ztest_shared_opts; @@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; ztest_func_t ztest_pool_prefetch_ddt; +ztest_func_t ztest_ddt_prune; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), + ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6211,13 +6215,14 @@ void ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; - nvlist_t *props = NULL; (void) pthread_rwlock_rdlock(&ztest_name_lock); (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); - VERIFY0(spa_prop_get(ztest_spa, &props)); + nvlist_t *props = fnvlist_alloc(); + + VERIFY0(spa_prop_get(ztest_spa, props)); if (ztest_opts.zo_verbose >= 6) dump_nvlist(props, 4); @@ -7288,6 +7293,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id) mutex_exit(&ztest_vdev_lock); } +void +ztest_ddt_prune(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + + spa_t *spa = ztest_spa; + uint64_t pct = ztest_random(15) + 1; + + (void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct); +} + /* * Verify pool integrity by running zdb. 
*/ @@ -7469,6 +7485,13 @@ ztest_resume_thread(void *arg) { spa_t *spa = arg; + /* + * Synthesize aged DDT entries for ddt prune testing + */ + ddt_prune_artificial_age = B_TRUE; + if (ztest_opts.zo_verbose >= 3) + ddt_dump_prune_histogram = B_TRUE; + while (!ztest_exiting) { if (spa_suspended(spa)) ztest_resume(spa); @@ -8587,6 +8610,12 @@ ztest_init(ztest_shared_t *zs) if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) continue; + /* + * split 50/50 between legacy and fast dedup + */ + if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0) + continue; + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", spa_feature_table[i].fi_uname)); fnvlist_add_uint64(props, buf, 0); diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 10083351abb5..d51e4ef003e6 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 +usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 diff --git a/include/Makefile.am b/include/Makefile.am index fa725c2e7a5f..f173064efc99 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -14,6 +14,7 @@ COMMON_H = \ zfs_fletcher.h \ zfs_namecheck.h \ zfs_prop.h \ + zfs_valstr.h \ \ sys/abd.h \ sys/abd_impl.h \ diff --git a/include/libzfs.h b/include/libzfs.h index 2412797541de..01d51999f4eb 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -305,6 +305,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *); _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *); +_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t, + uint64_t); + _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); 
diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 206e5e5c2bf6..b1d74fbbc8f5 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t, + uint64_t); + #ifdef __cplusplus } #endif diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 93abad85af44..4e5ccd46318e 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -405,6 +405,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); +extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount); + #ifdef __cplusplus } #endif diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 6f11cd90c1d8..4d3c0cae072e 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -35,8 +35,11 @@ extern "C" { #endif /* DDT version numbers */ -#define DDT_VERSION_LEGACY (0) -#define DDT_VERSION_FDT (1) +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) /* Names of interesting objects in the DDT root dir */ #define DDT_DIR_VERSION "version" @@ -187,8 +190,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu); extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe); -extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, - const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe); + +extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe); +extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, + const ddt_key_t *ddk); extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); @@ -211,6 
+217,44 @@ extern void ddt_log_fini(void); * them up. */ +/* + * We use a histogram to convert a percentage request into a + * cutoff value where entries older than the cutoff get pruned. + * + * The histogram bins represent hours in power-of-two increments. + * 16 bins covers up to four years. + */ +#define HIST_BINS 16 + +typedef struct ddt_age_histo { + uint64_t dah_entries; + uint64_t dah_age_histo[HIST_BINS]; +} ddt_age_histo_t; + +void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram); + +#if defined(_KERNEL) || !defined(ZFS_DEBUG) +#define ddt_dump_age_histogram(histo, cutoff) ((void)0) +#else +static inline void +ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff) +{ + if (histogram->dah_entries == 0) + return; + + (void) printf("DDT prune unique class age, %llu hour cutoff\n", + (u_longlong_t)(gethrestime_sec() - cutoff)/3600); + (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); + (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); + for (int i = 0; i < HIST_BINS; i++) { + (void) printf("%5d %9llu %4d%%\n", 1<dah_age_histo[i], + (int)((histogram->dah_age_histo[i] * 100) / + histogram->dah_entries)); + } +} +#endif + /* * Enough room to expand DMU_POOL_DDT format for all possible DDT * checksum/class/type combinations. diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 73d686a002ee..fc4f22cd5304 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1422,7 +1422,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 88/128 numbers reserved. + * Core features - 89/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1519,6 +1519,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ + ZFS_IOC_DDT_PRUNE, /* 0x5a59 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
@@ -1655,6 +1656,12 @@ typedef enum { ZPOOL_PREFETCH_DDT } zpool_prefetch_type_t; +typedef enum { + ZPOOL_DDT_PRUNE_NONE, + ZPOOL_DDT_PRUNE_AGE, /* in seconds */ + ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */ +} zpool_ddt_prune_unit_t; + /* * Bookmark name values. */ @@ -1753,6 +1760,12 @@ typedef enum { */ #define ZPOOL_PREFETCH_TYPE "prefetch_type" +/* + * The following are names used when invoking ZFS_IOC_DDT_PRUNE. + */ +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ diff --git a/include/sys/spa.h b/include/sys/spa.h index 93f381affd95..aa66d489ef1a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1201,9 +1201,9 @@ extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); -extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get_nvlist(spa_t *spa, char **props, - unsigned int n_props, nvlist_t **outnvl); + unsigned int n_props, nvlist_t *outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 4fc6f22fcb50..7811abbb9ce3 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -412,6 +412,7 @@ struct spa { uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ + boolean_t spa_active_ddt_prune; /* ddt prune process active */ struct brt *spa_brt; /* in-core BRT */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8ab..3a756949a422 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -167,6 +167,9 @@ typedef enum zio_suspend_reason { * This was originally an enum 
type. However, those are 32-bit and there is no * way to make a 64-bit enum type. Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ typedef uint64_t zio_flag_t; /* diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2b026d48675a..2c846a5d41f6 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -120,6 +120,9 @@ extern "C" { /* * zio pipeline stage definitions + * + * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * FLAG. */ enum zio_stage { ZIO_STAGE_OPEN = 1 << 0, /* RWFCXT */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h index 2d8e7fc36bae..bdf5f9b8ff35 100644 --- a/include/sys/zio_priority.h +++ b/include/sys/zio_priority.h @@ -22,6 +22,10 @@ extern "C" { #endif +/* + * NOTE: PLEASE UPDATE THE ENUM STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER + * VALUE. + */ typedef enum zio_priority { ZIO_PRIORITY_SYNC_READ, ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ diff --git a/include/zfs_valstr.h b/include/zfs_valstr.h new file mode 100644 index 000000000000..77c26ce1ae7d --- /dev/null +++ b/include/zfs_valstr.h @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. + */ + +#ifndef _ZFS_VALSTR_H +#define _ZFS_VALSTR_H extern __attribute__((visibility("default"))) + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * These macros create function prototypes for pretty-printing or stringifying + * certain kinds of numeric types. + * + * _ZFS_VALSTR_DECLARE_BITFIELD(name) creates: + * + * size_t zfs_valstr_<name>_bits(uint64_t bits, char *out, size_t outlen); + * expands single char for each set bit, and space for each clear bit + * + * size_t zfs_valstr_<name>_pairs(uint64_t bits, char *out, size_t outlen); + * expands two-char mnemonic for each bit set in `bits`, separated by `|` + * + * size_t zfs_valstr_<name>(uint64_t bits, char *out, size_t outlen); + * expands full name of each bit set in `bits`, separated by spaces + * + * _ZFS_VALSTR_DECLARE_ENUM(name) creates: + * + * size_t zfs_valstr_<name>(int v, char *out, size_t outlen); + * expands full name of enum value + * + * Each _ZFS_VALSTR_DECLARE_xxx needs a corresponding _VALSTR_xxx_IMPL string + * table in zfs_valstr.c. 
+ */ + +#define _ZFS_VALSTR_DECLARE_BITFIELD(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _bits( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name ## _pairs( \ + uint64_t bits, char *out, size_t outlen); \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + uint64_t bits, char *out, size_t outlen); \ + +#define _ZFS_VALSTR_DECLARE_ENUM(name) \ + _ZFS_VALSTR_H size_t zfs_valstr_ ## name( \ + int v, char *out, size_t outlen); \ + +_ZFS_VALSTR_DECLARE_BITFIELD(zio_flag) +_ZFS_VALSTR_DECLARE_BITFIELD(zio_stage) + +_ZFS_VALSTR_DECLARE_ENUM(zio_priority) + +#undef _ZFS_VALSTR_DECLARE_BITFIELD +#undef _ZFS_VALSTR_DECLARE_ENUM + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_VALSTR_H */ diff --git a/lib/libspl/include/sys/types32.h b/lib/libspl/include/sys/types32.h index eadc67c7122a..d065ebed03b7 100644 --- a/lib/libspl/include/sys/types32.h +++ b/lib/libspl/include/sys/types32.h @@ -65,11 +65,6 @@ typedef int32_t ssize32_t; typedef int32_t time32_t; typedef int32_t clock32_t; -struct timeval32 { - time32_t tv_sec; /* seconds */ - int32_t tv_usec; /* and microseconds */ -}; - typedef struct timespec32 { time32_t tv_sec; /* seconds */ int32_t tv_nsec; /* and nanoseconds */ diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 5e74d908de3d..a976faaf9913 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -47,6 +47,7 @@ nodist_libzfs_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 87c5c4380be3..51b29643ee0c 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -183,8 +183,8 @@ - + @@ -454,6 +454,13 @@ + + + + + + + @@ -466,7 +473,9 @@ + + @@ -485,8 +494,8 @@ - + @@ -529,7 +538,6 @@ - @@ -5929,6 +5937,7 @@ + @@ -5963,6 +5972,13 @@ + + + + + + + @@ -6139,6 
+6155,12 @@ + + + + + + @@ -6798,6 +6820,12 @@ + + + + + + @@ -7837,7 +7865,7 @@ - + @@ -7856,6 +7884,9 @@ + + + @@ -7865,6 +7896,15 @@ + + + + + + + + + @@ -7968,6 +8008,11 @@ + + + + + @@ -8075,6 +8120,11 @@ + + + + + @@ -8093,6 +8143,11 @@ + + + + + @@ -8292,12 +8347,12 @@ - - - + + + @@ -8802,11 +8857,6 @@ - - - - - @@ -9788,6 +9838,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index d27c7954e63c..0379d1f52cb7 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -5652,3 +5652,31 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname, return (ret); } + +/* + * Prune older entries from the DDT to reclaim space under the quota + */ +int +zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + int error = lzc_ddt_prune(zhp->zpool_name, unit, amount); + if (error != 0) { + libzfs_handle_t *hdl = zhp->zpool_hdl; + char errbuf[ERRBUFLEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot prune dedup table on '%s'"), zhp->zpool_name); + + if (error == EALREADY) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a prune operation is already in progress")); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + } else { + (void) zpool_standard_error(hdl, errno, errbuf); + } + return (-1); + } + + return (0); +} diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 1062a6b52dff..5ee6b8e09d6d 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -162,6 +162,7 @@ + @@ -1444,6 +1445,7 @@ + @@ -1484,6 +1486,13 @@ + + + + + + + @@ -3015,6 +3024,12 @@ + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index ec8b0ff4f61c..d07fca6cebad 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1927,3 +1927,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl) { 
return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); } + +/* + * Prune the specified amount from the pool's dedup table. + */ +int +lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit); + fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount); + + error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 81949bf9e5b8..ff30af7d2b9f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -64,6 +64,7 @@ nodist_libzpool_la_SOURCES = \ module/zcommon/zfs_fletcher_superscalar4.c \ module/zcommon/zfs_namecheck.c \ module/zcommon/zfs_prop.c \ + module/zcommon/zfs_valstr.c \ module/zcommon/zpool_prop.c \ module/zcommon/zprop_common.c \ \ diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am index 49bfb328a6f7..856175137906 100644 --- a/lib/libzstd/Makefile.am +++ b/lib/libzstd/Makefile.am @@ -1,4 +1,6 @@ libzstd_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libzstd_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) + # -fno-tree-vectorize is set for gcc in zstd/common/compiler.h # Set it for other compilers, too. 
libzstd_la_CFLAGS += -fno-tree-vectorize diff --git a/man/Makefile.am b/man/Makefile.am index 194bb4721619..fde704933764 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -72,6 +72,7 @@ dist_man_MANS = \ %D%/man8/zpool-create.8 \ %D%/man8/zpool-destroy.8 \ %D%/man8/zpool-detach.8 \ + %D%/man8/zpool-ddtprune.8 \ %D%/man8/zpool-events.8 \ %D%/man8/zpool-export.8 \ %D%/man8/zpool-get.8 \ diff --git a/man/man8/zpool-ddtprune.8 b/man/man8/zpool-ddtprune.8 new file mode 100644 index 000000000000..1ab7d3982c3e --- /dev/null +++ b/man/man8/zpool-ddtprune.8 @@ -0,0 +1,48 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2024, Klara Inc. +.\" +.Dd June 17, 2024 +.Dt ZPOOL-DDTPRUNE 8 +.Os +. +.Sh NAME +.Nm zpool-ddtprune +.Nd Prunes the oldest entries from the single reference dedup table(s) +.Sh SYNOPSIS +.Nm zpool +.Cm ddtprune +.Fl d Ar days | Fl p Ar percentage +.Ar pool +.Sh DESCRIPTION +This command prunes older unique entries from the dedup table. +As a complement to the dedup quota feature, +.Sy ddtprune +allows removal of older non-duplicate entries to make room for +newer duplicate entries. 
+.Pp +The amount to prune can be based on a target percentage of the unique entries +or based on the age (i.e., every unique entry older than N days). +. +.Sh SEE ALSO +.Xr zdb 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index c55644d9ecea..02a258f66708 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -592,6 +592,7 @@ don't wait. .Xr zpool-checkpoint 8 , .Xr zpool-clear 8 , .Xr zpool-create 8 , +.Xr zpool-ddtprune 8 , .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-events 8 , diff --git a/module/Kbuild.in b/module/Kbuild.in index 901905eb2826..c14fa4ec1bce 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -241,6 +241,7 @@ ZCOMMON_OBJS := \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ + zfs_valstr.o \ zpool_prop.o \ zprop_common.o diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 534f3257132a..9161204c99d3 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -233,6 +233,7 @@ SRCS+= cityhash.c \ zfs_fletcher_superscalar.c \ zfs_namecheck.c \ zfs_prop.c \ + zfs_valstr.c \ zpool_prop.c \ zprop_common.c diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 30983b13f7d1..c84cb7407a9c 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -124,7 +124,6 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); -SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 54ed70d0394f..e042116333fb 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -1101,8 +1101,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) zfsvfs_t 
*snap_zfsvfs; zfs_snapentry_t *se; char *full_name, *full_path; - char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, - NULL }; + char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n", + NULL, NULL, NULL }; char *envp[] = { NULL }; int error; struct path spath; @@ -1153,8 +1153,8 @@ zfsctl_snapshot_mount(struct path *path, int flags) * value from call_usermodehelper() will be (exitcode << 8 + signal). */ dprintf("mount; name=%s path=%s\n", full_name, full_path); - argv[5] = full_name; - argv[6] = full_path; + argv[6] = full_name; + argv[7] = full_path; error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); if (error) { if (!(error & MOUNT_BUSY << 8)) { diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c new file mode 100644 index 000000000000..e2d4d1aefefb --- /dev/null +++ b/module/zcommon/zfs_valstr.c @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024, Klara Inc. 
+ */ + +#include +#include +#include +#include +#include +#include "zfs_valstr.h" + +/* + * Each bit in a bitfield has three possible string representations: + * - single char + * - two-char pair + * - full name + */ +typedef struct { + const char vb_bit; + const char vb_pair[2]; + const char *vb_name; +} valstr_bit_t; + +/* + * Emits a character for each bit in `bits`, up to the number of elements + * in the table. Set bits get the character in vb_bit, clear bits get a + * space. This results in all strings having the same width, for easier + * visual comparison. + */ +static size_t +valstr_bitfield_bits(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + out[n++] = (bits & mask) ? table[b].vb_bit : ' '; + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits a two-char pair for each bit set in `bits`, taken from vb_pair, and + * separated by a `|` character. This gives a concise representation of the + * whole value. + */ +static size_t +valstr_bitfield_pairs(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = (n > 0) ? 3 : 2; + if (n > outlen-len) + break; + if (n > 0) + out[n++] = '|'; + out[n++] = table[b].vb_pair[0]; + out[n++] = table[b].vb_pair[1]; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the full name for each bit set in `bits`, taken from vb_name, and + * separated by a space. This unambiguously shows the entire set of bits, but + * can get very long. 
+ */ +static size_t +valstr_bitfield_str(const valstr_bit_t *table, const size_t nelems, + uint64_t bits, char *out, size_t outlen) +{ + ASSERT(out); + size_t n = 0; + for (int b = 0; b < nelems; b++) { + ASSERT3U(n, <=, outlen); + if (n == outlen) + break; + uint64_t mask = (1ULL << b); + if (bits & mask) { + size_t len = strlen(table[b].vb_name); + if (n > 0) + len++; + if (n > outlen-len) + break; + if (n > 0) { + out[n++] = ' '; + len--; + } + memcpy(&out[n], table[b].vb_name, len); + n += len; + } + } + if (n < outlen) + out[n++] = '\0'; + return (n); +} + +/* + * Emits the name of the given enum value in the table. + */ +static size_t +valstr_enum_str(const char **table, const size_t nelems, + int v, char *out, size_t outlen) +{ + ASSERT(out); + ASSERT3U(v, <, nelems); + if (v >= nelems) + return (0); + return (MIN(strlcpy(out, table[v], outlen), outlen)); +} + +/* + * These macros create the string tables for the given name, and implement + * the public functions described in zfs_valstr.h. + */ +#define _VALSTR_BITFIELD_IMPL(name, ...) \ +static const valstr_bit_t valstr_ ## name ## _table[] = { __VA_ARGS__ };\ +size_t \ +zfs_valstr_ ## name ## _bits(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_bits(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name ## _pairs(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_pairs(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + \ +size_t \ +zfs_valstr_ ## name(uint64_t bits, char *out, size_t outlen) \ +{ \ + return (valstr_bitfield_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), bits, out, outlen)); \ +} \ + +#define _VALSTR_ENUM_IMPL(name, ...) 
\ +static const char *valstr_ ## name ## _table[] = { __VA_ARGS__ }; \ +size_t \ +zfs_valstr_ ## name(int v, char *out, size_t outlen) \ +{ \ + return (valstr_enum_str(valstr_ ## name ## _table, \ + ARRAY_SIZE(valstr_ ## name ## _table), v, out, outlen)); \ +} \ + + +/* String tables */ + +/* ZIO flags: zio_flag_t, typically zio->io_flags */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_flag, + { '.', "DA", "DONT_AGGREGATE" }, + { '.', "RP", "IO_REPAIR" }, + { '.', "SH", "SELF_HEAL" }, + { '.', "RS", "RESILVER" }, + { '.', "SC", "SCRUB" }, + { '.', "ST", "SCAN_THREAD" }, + { '.', "PH", "PHYSICAL" }, + { '.', "CF", "CANFAIL" }, + { '.', "SP", "SPECULATIVE" }, + { '.', "CW", "CONFIG_WRITER" }, + { '.', "DR", "DONT_RETRY" }, + { '?', "??", "[UNUSED 11]" }, + { '.', "ND", "NODATA" }, + { '.', "ID", "INDUCE_DAMAGE" }, + { '.', "AL", "IO_ALLOCATING" }, + { '.', "RE", "IO_RETRY" }, + { '.', "PR", "PROBE" }, + { '.', "TH", "TRYHARD" }, + { '.', "OP", "OPTIONAL" }, + { '.', "DQ", "DONT_QUEUE" }, + { '.', "DP", "DONT_PROPAGATE" }, + { '.', "BY", "IO_BYPASS" }, + { '.', "RW", "IO_REWRITE" }, + { '.', "CM", "RAW_COMPRESS" }, + { '.', "EN", "RAW_ENCRYPT" }, + { '.', "GG", "GANG_CHILD" }, + { '.', "DD", "DDT_CHILD" }, + { '.', "GF", "GODFATHER" }, + { '.', "NP", "NOPWRITE" }, + { '.', "EX", "REEXECUTED" }, + { '.', "DG", "DELEGATED" }, +) +/* END CSTYLED */ + +/* + * ZIO pipeline stage(s): enum zio_stage, typically zio->io_stage or + * zio->io_pipeline. 
+ */ +/* BEGIN CSTYLED */ +_VALSTR_BITFIELD_IMPL(zio_stage, + { 'O', "O ", "OPEN" }, + { 'I', "RI", "READ_BP_INIT" }, + { 'I', "WI", "WRITE_BP_INIT" }, + { 'I', "FI", "FREE_BP_INIT" }, + { 'A', "IA", "ISSUE_ASYNC" }, + { 'W', "WC", "WRITE_COMPRESS" }, + { 'E', "EN", "ENCRYPT" }, + { 'C', "CG", "CHECKSUM_GENERATE" }, + { 'N', "NW", "NOP_WRITE" }, + { 'B', "BF", "BRT_FREE" }, + { 'd', "dS", "DDT_READ_START" }, + { 'd', "dD", "DDT_READ_DONE" }, + { 'd', "dW", "DDT_WRITE" }, + { 'd', "dF", "DDT_FREE" }, + { 'G', "GA", "GANG_ASSEMBLE" }, + { 'G', "GI", "GANG_ISSUE" }, + { 'D', "DT", "DVA_THROTTLE" }, + { 'D', "DA", "DVA_ALLOCATE" }, + { 'D', "DF", "DVA_FREE" }, + { 'D', "DC", "DVA_CLAIM" }, + { 'R', "R ", "READY" }, + { 'V', "VS", "VDEV_IO_START" }, + { 'V', "VD", "VDEV_IO_DONE" }, + { 'V', "VA", "VDEV_IO_ASSESS" }, + { 'C', "CV", "CHECKSUM_VERIFY" }, + { 'X', "X ", "DONE" }, +) +/* END CSTYLED */ + +/* ZIO priority: zio_priority_t, typically zio->io_priority */ +/* BEGIN CSTYLED */ +_VALSTR_ENUM_IMPL(zio_priority, + "SYNC_READ", + "SYNC_WRITE", + "ASYNC_READ", + "ASYNC_WRITE", + "SCRUB", + "REMOVAL", + "INITIALIZING", + "TRIM", + "REBUILD", + "[NUM_QUEUEABLE]", + "NOW", +) +/* END CSTYLED */ + +#undef _VALSTR_BITFIELD_IMPL +#undef _VALSTR_ENUM_IMPL diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index bd1941f43adf..0e12e7e49828 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -125,6 +125,13 @@ * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. * + * ## Dedup table pruning + * + * As a complement to the dedup quota feature, ddtprune allows removal of older + * non-duplicate entries to make room for newer duplicate entries. The amount + * to prune can be based on a target percentage of the unique entries or based + * on the age (i.e., prune unique entry older than N days). 
+ * * ## Dedup log * * Historically, all entries modified on a txg were written back to dedup @@ -228,6 +235,19 @@ int zfs_dedup_prefetch = 0; */ uint_t dedup_class_wait_txgs = 5; +/* + * How many DDT prune entries to add to the DDT sync AVL tree. + * Note these addtional entries have a memory footprint of a + * ddt_entry_t (216 bytes). + */ +static uint32_t zfs_ddt_prunes_per_txg = 50000; + +/* + * For testing, synthesize aged DDT entries + * (in global scope for ztest) + */ +boolean_t ddt_prune_artificial_age = B_FALSE; +boolean_t ddt_dump_prune_histogram = B_FALSE; /* * Don't do more than this many incremental flush passes per txg. @@ -268,10 +288,6 @@ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; -/* Dummy version to signal that configure is still necessary */ -#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) - -#ifdef _KERNEL /* per-DDT kstats */ typedef struct { /* total lookups and whether they returned new or existing entries */ @@ -324,6 +340,7 @@ static const ddt_kstats_t ddt_kstats_template = { { "log_flush_time_rate", KSTAT_DATA_UINT32 }, }; +#ifdef _KERNEL #define _DDT_KSTAT_STAT(ddt, stat) \ &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 #define DDT_KSTAT_BUMP(ddt, stat) \ @@ -343,6 +360,7 @@ static const ddt_kstats_t ddt_kstats_template = { #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) #endif /* _KERNEL */ + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -715,6 +733,30 @@ ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } +static uint64_t +ddt_class_start(void) +{ + uint64_t start = gethrestime_sec(); + + if (ddt_prune_artificial_age) { + /* + * debug aide -- simulate a wider distribution + * so we don't have to wait for an aged DDT + * to test prune. 
+ */ + int range = 1 << 21; + int percent = random_in_range(100); + if (percent < 50) { + range = range >> 4; + } else if (percent > 75) { + range /= 2; + } + start -= random_in_range(range); + } + + return (start); +} + void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { @@ -789,6 +831,9 @@ ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { + if (dde == NULL) + return (DDT_PHYS_NONE); + const ddt_univ_phys_t *ddp = dde->dde_phys; if (ddt->ddt_flags & DDT_FLAG_FLAT) { @@ -1019,6 +1064,47 @@ ddt_prefetch_all(spa_t *spa) static int ddt_configure(ddt_t *ddt, boolean_t new); +/* + * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them + * to the ones in the entry. If they're different, then the passed-in BP is + * from a previous generation of this entry (ie was previously pruned) and we + * have to act like the entry doesn't exist at all. + * + * This should only happen during a lookup to free the block (zio_ddt_free()). + * + * XXX this is similar in spirit to ddt_phys_select(), maybe can combine + * -- robn, 2024-02-09 + */ +static boolean_t +ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) +{ + /* If the BP has no DVAs, then this entry is good */ + uint_t ndvas = BP_GET_NDVAS(bp); + if (ndvas == 0) + return (B_TRUE); + + /* + * Only checking the phys for the copies. For flat, there's only one; + * for trad it'll be the one that has the matching set of DVAs. + */ + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + dde->dde_phys->ddp_flat.ddp_dva : + dde->dde_phys->ddp_trad[ndvas].ddp_dva; + + /* + * Compare entry DVAs with the BP. They should all be there, but + * there's not really anything we can do if its only partial anyway, + * that's an error somewhere else, maybe long ago. 
+ */ + uint_t d; + for (d = 0; d < ndvas; d++) + if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) + return (B_FALSE); + ASSERT3U(d, ==, ndvas); + + return (B_TRUE); +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp) { @@ -1054,8 +1140,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* If it's already loaded, we can just return it. */ DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); - if (dde->dde_flags & DDE_FLAG_LOADED) - return (dde); + if (dde->dde_flags & DDE_FLAG_LOADED) { + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); + } /* Someone else is loading it, wait for it. */ dde->dde_waiters++; @@ -1074,7 +1163,11 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) } DDT_KSTAT_BUMP(ddt, dds_lookup_existing); - return (dde); + + /* Make sure the loaded entry matches the BP */ + if (ddt_entry_lookup_is_valid(ddt, bp, dde)) + return (dde); + return (NULL); } else DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); @@ -1083,32 +1176,42 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Record the time this class was created (used by ddt prune) */ if (ddt->ddt_flags & DDT_FLAG_FLAT) - dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); avl_insert(&ddt->ddt_tree, dde, where); /* If its in the log tree, we can "load" it from there */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; - boolean_t found = B_FALSE; - - if (ddt_log_take_key(ddt, ddt->ddt_log_active, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); - found = B_TRUE; - } else if (ddt_log_take_key(ddt, ddt->ddt_log_flushing, - &search, &ddlwe)) { - DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); - found = B_TRUE; - } - - if (found) { - dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; + if (ddt_log_find_key(ddt, &search, &ddlwe)) { + /* + * See if we have the key first, and if so, set up + * the entry. 
+ */ dde->dde_type = ddlwe.ddlwe_type; dde->dde_class = ddlwe.ddlwe_class; memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); + /* Whatever we found isn't valid for this BP, eject */ + if (!ddt_entry_lookup_is_valid(ddt, bp, dde)) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + + /* Remove it and count it */ + if (ddt_log_remove_key(ddt, + ddt->ddt_log_active, &search)) { + DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); + } else { + VERIFY(ddt_log_remove_key(ddt, + ddt->ddt_log_flushing, &search)); + DDT_KSTAT_BUMP(ddt, + dds_lookup_log_flushing_hit); + } + + dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); @@ -1147,6 +1250,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + boolean_t valid = B_TRUE; + if (dde->dde_type == DDT_TYPES && dde->dde_class == DDT_CLASSES && ddt_over_quota(spa)) { @@ -1160,6 +1265,24 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { + /* + * If what we loaded is no good for this BP and there's no one + * waiting for it, we can just remove it and get out. If its no + * good but there are waiters, we have to leave it, because we + * don't know what they want. If its not needed we'll end up + * taking an entry log/sync, but it can only happen if more + * than one previous version of this block is being deleted at + * the same time. This is extremely unlikely to happen and not + * worth the effort to deal with without taking an entry + * update. 
+ */ + valid = ddt_entry_lookup_is_valid(ddt, bp, dde); + if (!valid && dde->dde_waiters == 0) { + avl_remove(&ddt->ddt_tree, dde); + ddt_free(ddt, dde); + return (NULL); + } + DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); @@ -1188,7 +1311,10 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp) dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); - return (dde->dde_flags & DDE_FLAG_OVERQUOTA ? NULL : dde); + if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) + return (NULL); + + return (dde); } void @@ -1417,7 +1543,6 @@ ddt_configure(ddt_t *ddt, boolean_t new) static void ddt_table_alloc_kstats(ddt_t *ddt) { -#ifdef _KERNEL char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); char *name = kmem_asprintf("ddt_stats_%s", zio_checksum_table[ddt->ddt_checksum].ci_name); @@ -1433,9 +1558,6 @@ ddt_table_alloc_kstats(ddt_t *ddt) kmem_strfree(name); kmem_strfree(mod); -#else - (void) ddt; -#endif /* _KERNEL */ } static ddt_t * @@ -1465,13 +1587,11 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { -#ifdef _KERNEL if (ddt->ddt_ksp != NULL) { kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = NULL; kstat_delete(ddt->ddt_ksp); } -#endif /* _KERNEL */ ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); @@ -1811,7 +1931,7 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { - ASSERT3U(phys_refcnt, ==, 0); + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { @@ -2285,8 +2405,9 @@ ddt_walk_ready(spa_t *spa) return (B_TRUE); } -int -ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +static int +ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, + uint64_t flags, boolean_t wait) { do { do { @@ -2295,7 +2416,11 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) if 
(ddt == NULL) continue; - if (ddt->ddt_flush_force_txg > 0) + if (flags != 0 && + (ddt->ddt_flags & flags) != flags) + continue; + + if (wait && ddt->ddt_flush_force_txg > 0) return (EAGAIN); int error = ENOENT; @@ -2319,13 +2444,19 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) return (SET_ERROR(ENOENT)); } +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) +{ + return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); +} + /* * This function is used by Block Cloning (brt.c) to increase reference * counter for the DDT entry if the block is already in DDT. * * Return false if the block, despite having the D bit set, is not present - * in the DDT. Currently this is not possible but might be in the future. - * See the comment below. + * in the DDT. This is possible when the DDT has been pruned by an admin + * or by the DDT quota mechanism. */ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) @@ -2356,28 +2487,13 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - /* - * This entry already existed (dde_type is real), so it must - * have refcnt >0 at the start of this txg. We are called from - * brt_pending_apply(), before frees are issued, so the refcnt - * can't be lowered yet. Therefore, it must be >0. We assert - * this because if the order of BRT and DDT interactions were - * ever to change and the refcnt was ever zero here, then - * likely further action is required to fill out the DDT entry, - * and this is a place that is likely to be missed in testing. - */ - ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* - * At the time of implementating this if the block has the - * DEDUP flag set it must exist in the DEDUP table, but - * there are many advocates that want ability to remove - * entries from DDT with refcnt=1. 
If this will happen, - * we may have a block with the DEDUP set, but which doesn't - * have a corresponding entry in the DDT. Be ready. + * If the block has the DEDUP flag set it still might not + * exist in the DEDUP table due to DDT pruning of entries + * where refcnt=1. */ ddt_remove(ddt, dde); result = B_FALSE; @@ -2389,6 +2505,261 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (result); } +typedef struct ddt_prune_entry { + ddt_t *dpe_ddt; + ddt_key_t dpe_key; + list_node_t dpe_node; + ddt_univ_phys_t dpe_phys[]; +} ddt_prune_entry_t; + +typedef struct ddt_prune_info { + spa_t *dpi_spa; + uint64_t dpi_txg_syncs; + uint64_t dpi_pruned; + list_t dpi_candidates; +} ddt_prune_info_t; + +/* + * Add prune candidates for ddt_sync during spa_sync + */ +static void +prune_candidates_sync(void *arg, dmu_tx_t *tx) +{ + (void) tx; + ddt_prune_info_t *dpi = arg; + ddt_prune_entry_t *dpe; + + spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); + + /* Process the prune candidates collected so far */ + while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { + blkptr_t blk; + ddt_t *ddt = dpe->dpe_ddt; + + ddt_enter(ddt); + + /* + * If it's on the live list, then it was loaded for update + * this txg and is no longer stale; skip it. + */ + if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + continue; + } + + ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, + dpe->dpe_phys, DDT_PHYS_FLAT, &blk); + + ddt_entry_t *dde = ddt_lookup(ddt, &blk); + if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Zero the physical, so we don't try to free DVAs + * at flush nor try to reuse this entry. 
+ */ + ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); + + dpi->dpi_pruned++; + } + + ddt_exit(ddt); + kmem_free(dpe, sizeof (*dpe)); + } + + spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); + dpi->dpi_txg_syncs++; +} + +/* + * Prune candidates are collected in open context and processed + * in sync context as part of ddt_sync_table(). + */ +static void +ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp) +{ + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + + size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; + ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); + + dpe->dpe_ddt = ddt; + dpe->dpe_key = *ddk; + memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); + list_insert_head(list, dpe); +} + +/* + * Interate over all the entries in the DDT unique class. + * The walk will perform one of the following operations: + * (a) build a histogram than can be used when pruning + * (b) prune entries older than the cutoff + * + * Also called by zdb(8) to dump the age histogram + */ +void +ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) +{ + ddt_bookmark_t ddb = { + .ddb_class = DDT_CLASS_UNIQUE, + .ddb_type = 0, + .ddb_checksum = 0, + .ddb_cursor = 0 + }; + ddt_lightweight_entry_t ddlwe = {0}; + int error; + int total = 0, valid = 0; + int candidates = 0; + uint64_t now = gethrestime_sec(); + ddt_prune_info_t dpi; + boolean_t pruning = (cutoff != 0); + + if (pruning) { + dpi.dpi_txg_syncs = 0; + dpi.dpi_pruned = 0; + dpi.dpi_spa = spa; + list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), + offsetof(ddt_prune_entry_t, dpe_node)); + } + + if (histogram != NULL) + memset(histogram, 0, sizeof (ddt_age_histo_t)); + + while ((error = + ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); + + if (spa_shutting_down(spa) || issig()) + break; + total++; + + ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); + 
ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); + + uint64_t class_start = + ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; + + /* + * If this entry is on the log, then the stored entry is stale + * and we should skip it. + */ + if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) + continue; + + /* prune older entries */ + if (pruning && class_start < cutoff) { + if (candidates++ >= zfs_ddt_prunes_per_txg) { + /* sync prune candidates in batches */ + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, prune_candidates_sync, + &dpi, 0, ZFS_SPACE_CHECK_NONE)); + candidates = 1; + } + ddt_prune_entry(&dpi.dpi_candidates, ddt, + &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); + } + + /* build a histogram */ + if (histogram != NULL) { + uint64_t age = MAX(1, (now - class_start) / 3600); + int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); + histogram->dah_entries++; + histogram->dah_age_histo[bin]++; + } + + valid++; + } + + if (pruning && valid > 0) { + if (!list_is_empty(&dpi.dpi_candidates)) { + /* sync out final batch of prune candidates */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + prune_candidates_sync, &dpi, 0, + ZFS_SPACE_CHECK_NONE)); + } + list_destroy(&dpi.dpi_candidates); + + zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", + (u_longlong_t)dpi.dpi_pruned, + (int)((dpi.dpi_pruned * 100) / valid), + (u_longlong_t)dpi.dpi_txg_syncs); + } +} + +static uint64_t +ddt_total_entries(spa_t *spa) +{ + ddt_object_t ddo; + ddt_get_dedup_object_stats(spa, &ddo); + + return (ddo.ddo_count); +} + +int +ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + uint64_t cutoff; + uint64_t start_time = gethrtime(); + + if (spa->spa_active_ddt_prune) + return (SET_ERROR(EALREADY)); + if (ddt_total_entries(spa) == 0) + return (0); + + spa->spa_active_ddt_prune = B_TRUE; + + zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, + unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? 
"%" : "seconds old or older"); + + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + ddt_age_histo_t histogram; + uint64_t oldest = 0; + + /* Make a pass over DDT to build a histogram */ + ddt_prune_walk(spa, 0, &histogram); + + int target = (histogram.dah_entries * amount) / 100; + + /* + * Figure out our cutoff date + * (i.e., which bins to prune from) + */ + for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { + if (histogram.dah_age_histo[i] != 0) { + /* less than this bucket remaining */ + if (target < histogram.dah_age_histo[i]) { + oldest = MAX(1, (1< 0 && !spa_shutting_down(spa) && !issig()) { + /* Traverse DDT to prune entries older that our cuttoff */ + ddt_prune_walk(spa, cutoff, NULL); + } + + zfs_dbgmsg("%s: prune completed in %llu ms", + spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + + spa->spa_active_ddt_prune = B_FALSE; + return (0); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); diff --git a/module/zfs/ddt_log.c b/module/zfs/ddt_log.c index a367d0cd02f8..3aa07dc25b91 100644 --- a/module/zfs/ddt_log.c +++ b/module/zfs/ddt_log.c @@ -353,16 +353,15 @@ ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) } boolean_t -ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, - ddt_lightweight_entry_t *ddlwe) +ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) { ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); if (ddle == NULL) return (B_FALSE); - DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); - - ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); + ddt_lightweight_entry_t ddlwe; + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); + ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
@@ -371,6 +370,21 @@ ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk, return (B_TRUE); } +boolean_t +ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, + ddt_lightweight_entry_t *ddlwe) +{ + ddt_log_entry_t *ddle = + avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); + if (!ddle) + ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); + if (!ddle) + return (B_FALSE); + if (ddlwe) + DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); + return (B_TRUE); +} + void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d51cc4fcd09a..1a68a0953565 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -366,21 +366,15 @@ spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) int spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, - nvlist_t **outnvl) + nvlist_t *outnvl) { int err = 0; if (props == NULL) return (0); - if (*outnvl == NULL) { - err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); - } - for (unsigned int i = 0; i < n_props && err == 0; i++) { - err = spa_prop_add(spa, props[i], *outnvl); + err = spa_prop_add(spa, props[i], outnvl); } return (err); @@ -406,7 +400,7 @@ spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, * Get property values from the spa configuration. 
*/ static void -spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +spa_prop_get_config(spa_t *spa, nvlist_t *nv) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; @@ -428,48 +422,48 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, size - alloc, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, spa->spa_checkpoint_info.sci_dspace, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == SPA_MODE_READ), src); cap = (size == 0) ? 
0 : (alloc * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, brt_get_used(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, brt_get_saved(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, brt_get_ratio(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, ddt_get_ddt_dsize(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_DEFAULT); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, version, ZPROP_SRC_LOCAL); } - spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, + spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, NULL, spa_load_guid(spa), src); } @@ -479,62 +473,62 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) * when opening pools before this version freedir will be NULL. 
*/ if (pool->dp_free_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 0, src); } } - spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_compatibility != NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, + spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) - spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, DNODE_MAX_SIZE, ZPROP_SRC_NONE); } else { - spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, 
NULL, DNODE_MIN_SIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { - spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } @@ -544,19 +538,13 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) * Get zpool property values. */ int -spa_prop_get(spa_t *spa, nvlist_t **nvp) +spa_prop_get(spa_t *spa, nvlist_t *nv) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; dsl_pool_t *dp; - int err; - - if (*nvp == NULL) { - err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); - } + int err = 0; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); @@ -565,7 +553,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) /* * Get properties from the spa config. */ - spa_prop_get_config(spa, nvp); + spa_prop_get_config(spa, nv); /* If no pool property object, no more prop to get. 
*/ if (mos == NULL || spa->spa_pool_props_object == 0) @@ -610,7 +598,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) intval = za.za_first_integer; } - spa_prop_add_list(*nvp, prop, strval, intval, src); + spa_prop_add_list(nv, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); @@ -627,10 +615,10 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) break; } if (prop != ZPOOL_PROP_INVAL) { - spa_prop_add_list(*nvp, prop, strval, 0, src); + spa_prop_add_list(nv, prop, strval, 0, src); } else { src = ZPROP_SRC_LOCAL; - spa_prop_add_user(*nvp, za.za_name, strval, + spa_prop_add_user(nv, za.za_name, strval, src); } kmem_free(strval, za.za_num_integers); @@ -644,11 +632,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) out: mutex_exit(&spa->spa_props_lock); dsl_pool_config_exit(dp, FTAG); - if (err && err != ENOENT) { - nvlist_free(*nvp); - *nvp = NULL; + + if (err && err != ENOENT) return (err); - } return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 7ce2d919610f..53366ad49781 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3050,7 +3050,6 @@ static const zfs_ioc_key_t zfs_keys_get_props[] = { static int zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { - nvlist_t *nvp = outnvl; spa_t *spa; char **props = NULL; unsigned int n_props = 0; @@ -3069,16 +3068,17 @@ zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) != NULL) { - error = spa_prop_get(spa, &nvp); + error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, - &nvp); + outnvl); } mutex_exit(&spa_namespace_lock); } else { - error = spa_prop_get(spa, &nvp); + error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) - error = spa_prop_get_nvlist(spa, props, n_props, &nvp); + error = spa_prop_get_nvlist(spa, props, n_props, + outnvl); spa_close(spa, FTAG); } 
@@ -4342,6 +4342,51 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } +#define DDT_PRUNE_UNIT "ddt_prune_unit" +#define DDT_PRUNE_AMOUNT "ddt_prune_amount" + +/* + * innvl: { + * "ddt_prune_unit" -> int32_t (cast to zpool_ddt_prune_unit_t) + * "ddt_prune_amount" -> uint64_t + * } + * + * outnvl: unused; the handler returns only an error code + */ +static const zfs_ioc_key_t zfs_keys_ddt_prune[] = { + {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0}, + {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t unit; + uint64_t amount; + + if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 || + nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) { + return (SET_ERROR(EINVAL)); + } + + spa_t *spa; + int error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit, + amount); + + spa_close(spa, FTAG); + + return (error); +} + /* * This ioctl waits for activity of a particular type to complete. 
If there is * no activity of that type in progress, it returns immediately, and the @@ -7430,6 +7475,11 @@ zfs_ioctl_init(void) POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE, + zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a841e0a79107..53992931e049 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2553,7 +2553,7 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) if (reason != ZIO_SUSPEND_MMP) { cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " - "I/O failure and has been suspended.\n", spa_name(spa)); + "I/O failure and has been suspended.", spa_name(spa)); } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, @@ -2589,6 +2589,10 @@ zio_resume(spa_t *spa) * Reexecute all previously suspended i/o. */ mutex_enter(&spa->spa_suspend_lock); + if (spa->spa_suspended != ZIO_SUSPEND_NONE) + cmn_err(CE_WARN, "Pool '%s' was suspended and is being " + "resumed. Failed I/O will be retried.", + spa_name(spa)); spa->spa_suspended = ZIO_SUSPEND_NONE; cv_broadcast(&spa->spa_suspend_cv); pio = spa->spa_suspend_zio_root; @@ -3859,6 +3863,16 @@ zio_ddt_free(zio_t *zio) } ddt_exit(ddt); + /* + * When no entry was found, it must have been pruned, + * so we can free it now instead of decrementing the + * refcount in the DDT. 
+ */ + if (!dde) { + BP_SET_DEDUP(bp, 0); + zio->io_pipeline |= ZIO_STAGE_DVA_FREE; + } + return (zio); } diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 1177e80e1a75..6db10b91de05 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -389,7 +389,7 @@ if os.environ.get('CI') == 'true': def process_results(pathname): try: - f = open(pathname) + f = open(pathname, errors='replace') except IOError as e: print('Error opening file:', e) sys.exit(1)