From 617bac1d86f5c3d6351746d7000582a2ce5f9efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 29 May 2024 14:04:00 +0300 Subject: [PATCH] MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems When using the default innodb_log_buffer_size=2m, mariadb-backup --backup would spend a lot of time re-reading and re-parsing the log. For reads, it would be beneficial to memory-map the entire ib_logfile0 to the address space (typically 48 bits or 256 TiB) and read it from there, both during --backup and --prepare. That is what we will be doing by default. We can also enable memory-mapped log writes in case the new parameter innodb_log_file_mmap is set to ON. This could speed up I/O and allow the log data to be shared between mariadbd and mariadb-backup --backup in the RAM buffer. Memory-mapped regular files differ from log_sys.is_pmem() in the way that an open file handle to ib_logfile0 will be retained. That allows log_t::set_mmap() to enable or disable the interface with fewer operations. On Linux, on log checkpoint we will invoke fallocate() with FALLOC_FL_ZERO_RANGE and madvise() with MADV_DONTNEED in order to reduce the memory pressure. Without the fallocate() call, using MADV_DONTNEED could lead to reads of old garbage contents of the circular log file when a page fault occurs while writing a record. Most references to HAVE_PMEM or log_sys.is_pmem() are replaced with HAVE_INNODB_MMAP or log_sys.is_mmap(). The main difference is that PMEM skips the use of write_lock and flush_lock and uses pmem_persist(), while the memory-mapped interface will use a combination of msync() and fdatasync(). 
--- extra/mariabackup/xtrabackup.cc | 142 +++++- .../innodb/r/log_file_size_online.result | 8 + .../suite/innodb/t/log_file_size_online.test | 17 + .../suite/sys_vars/r/sysvars_innodb.result | 1 + .../suite/sys_vars/t/sysvars_innodb.test | 1 + storage/innobase/buf/buf0flu.cc | 95 +++- storage/innobase/handler/ha_innodb.cc | 20 + storage/innobase/include/log0log.h | 66 +-- storage/innobase/include/log0recv.h | 9 +- storage/innobase/include/univ.i | 5 +- storage/innobase/log/log0log.cc | 474 +++++++++++++----- storage/innobase/log/log0recv.cc | 51 +- storage/innobase/mtr/mtr0mtr.cc | 56 +-- storage/innobase/srv/srv0start.cc | 14 +- 14 files changed, 684 insertions(+), 275 deletions(-) diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index c50267eba1ab0..37258366408d2 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start; lsn_t checkpoint_no_start; /** whether log_copying_thread() is active; protected by recv_sys.mutex */ static bool log_copying_running; +/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */ +lsn_t metadata_to_lsn; uint xtrabackup_parallel; @@ -236,7 +238,6 @@ my_bool opt_encrypted_backup; #define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints" char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/ static lsn_t metadata_from_lsn; -lsn_t metadata_to_lsn; static lsn_t metadata_last_lsn; static ds_file_t* dst_log_file; @@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE; */ ulong xtrabackup_innodb_force_recovery = 0; -/* The flushed lsn which is read from data files */ -lsn_t flushed_lsn= 0; - ulong xb_open_files_limit= 0; char *xb_plugin_dir; char *xb_plugin_load; @@ -1329,6 +1327,9 @@ enum options_xtrabackup OPT_INNODB_BUFFER_POOL_FILENAME, OPT_INNODB_LOCK_WAIT_TIMEOUT, OPT_INNODB_LOG_BUFFER_SIZE, +#ifdef HAVE_INNODB_MMAP + OPT_INNODB_LOG_FILE_MMAP, +#endif #if defined __linux__ || 
defined _WIN32 OPT_INNODB_LOG_FILE_BUFFERING, #endif @@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] = (G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0, GET_UINT, REQUIRED_ARG, 2U << 20, 2U << 20, log_sys.buf_size_max, 0, 4096, 0}, +#ifdef HAVE_INNODB_MMAP + {"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP, + "Whether ib_logfile0 should be memory-mapped", + (G_PTR*) &log_sys.log_mmap, + (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG, + TRUE, 0, 0, 0, 0, 0}, +#endif #if defined __linux__ || defined _WIN32 {"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING, "Whether the file system cache for ib_logfile0 is enabled during --backup", @@ -3360,8 +3368,108 @@ static my_bool xtrabackup_copy_datafile(ds_ctxt *ds_data, return(FALSE); } +#ifdef HAVE_INNODB_MMAP +static int +xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end) +{ + if (UNIV_UNLIKELY(start > end)) + { + if (int r= ds_write(ds, start, log_sys.buf + log_sys.file_size - start)) + return r; + start= log_sys.buf + log_sys.START_OFFSET; + } + return ds_write(ds, start, end - start); +} + +/** Copy memory-mapped log until the end of the log is reached +or the log_copying_stop signal is received +@return whether the operation failed */ +static bool xtrabackup_copy_mmap_logfile() +{ + mysql_mutex_assert_owner(&recv_sys.mutex); + recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); + recv_sys.len= size_t(log_sys.file_size); + const size_t seq_offset{log_sys.is_encrypted() ? 
8U + 5U : 5U}; + const char one{'\1'}; + + for (unsigned retry_count{0};;) + { + recv_sys_t::parse_mtr_result r; + const byte *start= &log_sys.buf[recv_sys.offset]; + + if (recv_sys.parse_mmap(false) == recv_sys_t::OK) + { + const byte *end; + + do + { + /* Set the sequence bit (the backed-up log will not wrap around) */ + size_t seqo= recv_sys.offset - seq_offset; + if (seqo < log_sys.START_OFFSET) + seqo+= log_sys.file_size - log_sys.START_OFFSET; + const byte *seq= &log_sys.buf[seqo]; + ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset)); + if (!*seq) + { + if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) || + ds_write(dst_log_file, &one, 1)) + goto write_error; + start = seq + 1; + } + } + while ((r= recv_sys.parse_mmap(false)) == recv_sys_t::OK); + + end= &log_sys.buf[recv_sys.offset]; + + if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end)) + { + write_error: + msg("Error: write to ib_logfile0 failed"); + return true; + } + + start= end; + + pthread_cond_broadcast(&scanned_lsn_cond); + + if (r == recv_sys_t::GOT_EOF) + break; + + retry_count= 0; + } + else + { + if (metadata_to_lsn) + { + if (metadata_to_lsn <= recv_sys.lsn) + return false; + } + else if (xtrabackup_throttle && io_ticket-- < 0) + mysql_cond_wait(&wait_throttle, &recv_sys.mutex); + + if (!retry_count++) + msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn); + else if (retry_count == 100) + break; + else + { + timespec abstime; + set_timespec_nsec(abstime, 1000000ULL /* 1 ms */); + if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex, + &abstime)) + return true; + } + } + } + + if (verbose) + msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn); + return false; +} +#endif + /** Copy redo log until the current end of the log is reached -@return whether the operation failed */ +@return whether the operation failed */ static bool xtrabackup_copy_logfile() { mysql_mutex_assert_owner(&recv_sys.mutex); @@ -3369,16 +3477,17 @@ static bool 
xtrabackup_copy_logfile() ut_a(dst_log_file); ut_ad(recv_sys.is_initialised()); + +#ifdef HAVE_INNODB_MMAP + if (log_sys.is_mmap()) + return xtrabackup_copy_mmap_logfile(); +#endif const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U}; const size_t block_size_1{log_sys.get_block_size() - 1}; - ut_ad(!log_sys.is_pmem()); - - { - recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & - block_size_1; - recv_sys.len= 0; - } + recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & + block_size_1; + recv_sys.len= 0; for (unsigned retry_count{0};;) { @@ -5299,9 +5408,8 @@ static bool xtrabackup_backup_func() goto fail; } - if (!log_sys.create()) { - goto fail; - } + log_sys.create(); + /* get current checkpoint_lsn */ { log_sys.latch.wr_lock(SRW_LOCK_CALL); @@ -6658,9 +6766,7 @@ static bool xtrabackup_prepare_func(char** argv) } recv_sys.create(); - if (!log_sys.create()) { - goto error; - } + log_sys.create(); recv_sys.recovery_on = true; xb_fil_io_init(); diff --git a/mysql-test/suite/innodb/r/log_file_size_online.result b/mysql-test/suite/innodb/r/log_file_size_online.result index 1db2fdde5762c..cc1d2b740f9ee 100644 --- a/mysql-test/suite/innodb/r/log_file_size_online.result +++ b/mysql-test/suite/innodb/r/log_file_size_online.result @@ -20,6 +20,14 @@ Variable_name Value innodb_log_file_size 4194304 FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err UPDATE t SET b='' WHERE a<10; +SET @save=@@GLOBAL.innodb_log_file_buffering; +SET GLOBAL innodb_log_file_buffering=OFF; +SET GLOBAL innodb_log_file_buffering=ON; +SET GLOBAL innodb_log_file_buffering=@save; +SET @save=@@GLOBAL.innodb_log_file_mmap; +SET GLOBAL innodb_log_file_mmap=OFF; +SET GLOBAL innodb_log_file_mmap=ON; +SET GLOBAL innodb_log_file_mmap=@save; SET GLOBAL innodb_log_file_size=5242880; SHOW VARIABLES LIKE 'innodb_log_file_size'; Variable_name Value diff --git a/mysql-test/suite/innodb/t/log_file_size_online.test 
b/mysql-test/suite/innodb/t/log_file_size_online.test index 65551f13dbccf..f4628a4279dd5 100644 --- a/mysql-test/suite/innodb/t/log_file_size_online.test +++ b/mysql-test/suite/innodb/t/log_file_size_online.test @@ -27,6 +27,23 @@ let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB; UPDATE t SET b='' WHERE a<10; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @save=@@GLOBAL.innodb_log_file_buffering; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=OFF; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=ON; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_buffering=@save; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @save=@@GLOBAL.innodb_log_file_mmap; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_mmap=OFF; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_mmap=ON; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_log_file_mmap=@save; + SET GLOBAL innodb_log_file_size=5242880; SHOW VARIABLES LIKE 'innodb_log_file_size'; SELECT global_value FROM information_schema.system_variables diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index 6a7e184f68ece..d0547be0f9b4e 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -4,6 +4,7 @@ variable_name not in ( 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS +'innodb_log_file_mmap', # only available on 64-bit 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing order by variable_name; diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test index 2680e442da4a7..86f5ffddf1c5a 100644 --- 
a/mysql-test/suite/sys_vars/t/sysvars_innodb.test +++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test @@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP 'innodb_numa_interleave', # only available WITH_NUMA 'innodb_evict_tables_on_commit_debug', # one may want to override this 'innodb_use_native_aio', # default value depends on OS + 'innodb_log_file_mmap', # only available on 64-bit 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing order by variable_name; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index d364be3124f07..d8fd979369696 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1754,7 +1754,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); byte* c= my_assume_aligned - (is_pmem() ? buf + offset : checkpoint_buf); + (is_mmap() ? 
buf + offset : checkpoint_buf); memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); @@ -1773,6 +1773,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept header_write(resize_buf, resizing, is_encrypted()); pmem_persist(resize_buf, resize_target); } + pmem_persist(c, 64); } else @@ -1783,16 +1784,33 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept latch.wr_unlock(); log_write_and_flush_prepare(); resizing= resize_lsn.load(std::memory_order_relaxed); - /* FIXME: issue an asynchronous write */ - log.write(offset, {c, get_block_size()}); - if (resizing > 1 && resizing <= next_checkpoint_lsn) +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) { - byte *buf= static_cast(aligned_malloc(4096, 4096)); - memset_aligned<4096>(buf, 0, 4096); - header_write(buf, resizing, is_encrypted()); - resize_log.write(0, {buf, 4096}); - aligned_free(buf); - resize_log.write(CHECKPOINT_1, {c, get_block_size()}); + IF_WIN(FlushViewOfFile(c, 64), msync(c, my_system_page_size, MS_ASYNC)); + + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64); + header_write(resize_buf, resizing, is_encrypted()); + IF_WIN(FlushViewOfFile(resize_buf, resize_target), + msync(resize_buf, resize_target, MS_ASYNC)); + } + } + else +#endif + { + log.write(offset, {c, get_block_size()}); + + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + byte *buf= static_cast(aligned_malloc(4096, 4096)); + memset_aligned<4096>(buf, 0, 4096); + header_write(buf, resizing, is_encrypted()); + resize_log.write(0, {buf, 4096}); + aligned_free(buf); + resize_log.write(CHECKPOINT_1, {c, get_block_size()}); + } } if (srv_file_flush_method != SRV_O_DSYNC) @@ -1823,7 +1841,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (resizing > 1 && resizing <= checkpoint_lsn) { - ut_ad(is_pmem() == !resize_flush_buf); + 
ut_ad(is_mmap() == !resize_flush_buf); if (!is_pmem()) { @@ -1834,9 +1852,9 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (resize_rename()) { - /* Resizing failed. Discard the log_sys.resize_log. */ -#ifdef HAVE_PMEM - if (is_pmem()) + /* Resizing failed. Discard the ib_logfile101. */ +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) my_munmap(resize_buf, resize_target); else #endif @@ -1857,23 +1875,35 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept else { /* Adopt the resized log. */ -#ifdef HAVE_PMEM - if (is_pmem()) +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) { my_munmap(buf, file_size); buf= resize_buf; set_buf_free(START_OFFSET + (get_lsn() - resizing)); + +# ifdef HAVE_PMEM + if (!log.is_opened()) + { + resize_log.close(); + goto swap_done; + } +# endif } else #endif { - IF_WIN(,log.close()); - std::swap(log, resize_log); ut_free_dodump(buf, buf_size); ut_free_dodump(flush_buf, buf_size); buf= resize_buf; flush_buf= resize_flush_buf; } + + IF_WIN(,log.close()); + std::swap(log, resize_log); +# ifdef HAVE_PMEM + swap_done: +# endif srv_log_file_size= resizing_completed= file_size= resize_target; first_lsn= resizing; set_capacity(); @@ -1885,6 +1915,35 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept resize_lsn.store(0, std::memory_order_relaxed); } +#if defined HAVE_INNODB_MMAP && defined FALLOC_FL_ZERO_RANGE + if (is_mmap()) + { + const size_t ps{my_system_page_size}; + ut_ad(buf_free == calc_lsn_offset(get_lsn())); + size_t offset{buf_free & ~(ps - 1)}; + const size_t start_offset{calc_lsn_offset(checkpoint_lsn) & ~(ps - 1)}; + if (offset == start_offset); + else if (offset < start_offset) + { + madvise(buf + offset, start_offset - offset, MADV_DONTNEED); + fallocate(log.m_file, FALLOC_FL_ZERO_RANGE, offset, + start_offset - offset); + } + else + { + madvise(buf + START_OFFSET, start_offset - START_OFFSET, MADV_DONTNEED); + fallocate(log.m_file, FALLOC_FL_ZERO_RANGE, START_OFFSET, + start_offset - START_OFFSET); + 
offset+= ps; + if (size_t end_size= file_size - offset) + { + madvise(buf + offset, end_size, MADV_DONTNEED); + fallocate(log.m_file, FALLOC_FL_ZERO_RANGE, offset, end_size); + } + } + } +#endif + log_resize_release(); if (UNIV_LIKELY(resizing <= 1)); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index dfe034ec2a5a0..3db4c1084c2a7 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -18443,6 +18443,16 @@ buffer_pool_load_abort( } } +#ifdef HAVE_INNODB_MMAP +static void innodb_log_file_mmap_update(THD *thd, st_mysql_sys_var*, + void *, const void *save) +{ + mysql_mutex_unlock(&LOCK_global_system_variables); + log_sys.set_mmap(*static_cast(save)); + mysql_mutex_lock(&LOCK_global_system_variables); +} +#endif + #if defined __linux__ || defined _WIN32 static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*, void *, const void *save) @@ -19334,6 +19344,13 @@ static MYSQL_SYSVAR_UINT(log_buffer_size, log_sys.buf_size, "Redo log buffer size in bytes.", NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096); +#ifdef HAVE_INNODB_MMAP +static MYSQL_SYSVAR_BOOL(log_file_mmap, log_sys.log_mmap, + PLUGIN_VAR_OPCMDARG, + "Whether ib_logfile0 should be memory-mapped", + nullptr, innodb_log_file_mmap_update, FALSE); +#endif + #if defined __linux__ || defined _WIN32 static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, PLUGIN_VAR_OPCMDARG, @@ -19808,6 +19825,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(deadlock_report), MYSQL_SYSVAR(page_size), MYSQL_SYSVAR(log_buffer_size), +#ifdef HAVE_INNODB_MMAP + MYSQL_SYSVAR(log_file_mmap), +#endif #if defined __linux__ || defined _WIN32 MYSQL_SYSVAR(log_file_buffering), #endif diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index cef0dcae1b095..6a93b21034a62 100644 --- a/storage/innobase/include/log0log.h +++ 
b/storage/innobase/include/log0log.h @@ -124,9 +124,6 @@ class log_file_t dberr_t read(os_offset_t offset, span buf) noexcept; void write(os_offset_t offset, span buf) noexcept; bool flush() const noexcept { return os_file_flush(m_file); } -#ifdef HAVE_PMEM - byte *mmap(bool read_only, const struct stat &st) noexcept; -#endif }; /** Redo log buffer */ @@ -189,7 +186,7 @@ struct log_t public: /** number of append_prepare_wait(); protected by lock_lsn() or lsn_lock */ size_t waits; - /** innodb_log_buffer_size (size of buf,flush_buf if !is_pmem(), in bytes) */ + /** innodb_log_buffer_size (size of buf,flush_buf if !is_mmap(), in bytes) */ unsigned buf_size; /** log file size in bytes, including the header */ lsn_t file_size; @@ -231,7 +228,7 @@ struct log_t /** Last written LSN */ lsn_t write_lsn; - /** buffer for writing data to ib_logfile0, or nullptr if is_pmem() + /** buffer for writing data to ib_logfile0, or nullptr if is_mmap() In write_buf(), buf and flush_buf are swapped */ byte *flush_buf; /** set when there may be need to initiate a log checkpoint. 
@@ -281,6 +278,10 @@ struct log_t public: /** format of the redo log: e.g., FORMAT_10_8 */ uint32_t format; +#ifdef HAVE_INNODB_MMAP + /** whether the memory-mapped interface is enabled for the log */ + my_bool log_mmap; +#endif #if defined __linux__ || defined _WIN32 /** whether file system caching is enabled for the log */ my_bool log_buffered; @@ -323,7 +324,7 @@ struct log_t /** whether there is capacity in the log buffer */ bool buf_free_ok() const noexcept { - ut_ad(!is_pmem()); + ut_ad(!is_mmap()); return (buf_free.load(std::memory_order_relaxed) & ~buf_free_LOCK) < max_buf_free; } @@ -331,8 +332,14 @@ struct log_t void set_buf_free(size_t f) noexcept { ut_ad(f < buf_free_LOCK); buf_free.store(f, std::memory_order_relaxed); } +#ifdef HAVE_INNODB_MMAP + bool is_mmap() const noexcept { return !flush_buf; } +#else + static constexpr bool is_mmap() { return false; } +#endif + #ifdef HAVE_PMEM - bool is_pmem() const noexcept { return !flush_buf; } + bool is_pmem() const noexcept { return is_mmap() && !is_opened(); } #else static constexpr bool is_pmem() { return false; } #endif @@ -376,40 +383,32 @@ struct log_t @return whether an error occurred */ static bool resize_rename() noexcept; -#ifdef HAVE_PMEM /** @return pointer for writing to resize_buf - @retval nullptr if no PMEM based resizing is active */ + @retval nullptr if no is_mmap() based resizing is active */ inline byte *resize_buf_begin(lsn_t lsn) const noexcept; /** @return end of resize_buf */ inline const byte *resize_buf_end() const noexcept { return resize_buf + resize_target; } /** Initialise the redo log subsystem. */ - void create_low(); - /** Initialise the redo log subsystem. - @return whether the initialisation succeeded */ - bool create() { create_low(); return true; } + void create(); /** Attach a log file. 
@return whether the memory allocation succeeded */ bool attach(log_file_t file, os_offset_t size); + +#ifdef HAVE_INNODB_MMAP + /** Try to enable or disable memory-mapped access (update log_mmap) */ + void set_mmap(bool mmap); + void close_file(bool really_close= true); #else - /** Initialise the redo log subsystem. - @return whether the initialisation succeeded */ - bool create(); - /** Attach a log file. */ - void attach_low(log_file_t file, os_offset_t size); - bool attach(log_file_t file, os_offset_t size) - { attach_low(file, size); return true; } + void close_file(); #endif - #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ void set_buffered(bool buffered); #endif - void close_file(); - /** Calculate the checkpoint safety margins. */ static void set_capacity(); @@ -445,10 +444,17 @@ struct log_t flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); } -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP +# ifdef HAVE_PMEM /** Persist the log. @param lsn desired new value of flushed_to_disk_lsn */ - inline void persist(lsn_t lsn) noexcept; + void persist(lsn_t lsn) noexcept; +# endif + enum durability { WRITE, DURABLE, SYNC }; + /** Write or persist memory-mapped log. + @param durable whether this must be durable + @return pending write LSN */ + lsn_t write_mmap(durability durable) noexcept; #endif bool check_for_checkpoint() const @@ -467,14 +473,16 @@ struct log_t void close(); #if defined __linux__ || defined _WIN32 + static constexpr size_t max_block_size= 4096; /** @return the physical block size of the storage */ size_t get_block_size() const noexcept { ut_ad(block_size); return block_size; } /** Set the log block size for file I/O. 
*/ void set_block_size(uint32_t size) noexcept { block_size= size; } #else + static constexpr size_t max_block_size= 512; /** @return the physical block size of the storage */ - static size_t get_block_size() { return 512; } + static size_t get_block_size() { return max_block_size; } #endif private: @@ -490,11 +498,11 @@ struct log_t public: /** Reserve space in the log buffer for appending data. @tparam spin whether to use the spin-only lock_lsn() - @tparam pmem log_sys.is_pmem() + @tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. @@ -504,7 +512,7 @@ struct log_t void append(byte *&d, const void *s, size_t size) noexcept { ut_ad(latch_have_any()); - ut_ad(d + size <= buf + (is_pmem() ? file_size : buf_size)); + ut_ad(d + size <= buf + (is_mmap() ? file_size : buf_size)); memcpy(d, s, size); d+= size; } diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index a73b727991ccc..6cf79c857e45e 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -408,19 +408,18 @@ struct recv_sys_t ATTRIBUTE_COLD void report_progress() const; public: /** Parse and register one log_t::FORMAT_10_8 mini-transaction, - handling log_sys.is_pmem() buffer wrap-around. + without handling any log_sys.is_mmap() buffer wrap-around. @tparam store whether to store the records @param if_exists if store: whether to check if the tablespace exists */ template static parse_mtr_result parse_mtr(bool if_exists) noexcept; - /** Parse and register one log_t::FORMAT_10_8 mini-transaction, - handling log_sys.is_pmem() buffer wrap-around. + handling log_sys.is_mmap() buffer wrap-around. 
@tparam store whether to store the records @param if_exists if store: whether to check if the tablespace exists */ template - static parse_mtr_result parse_pmem(bool if_exists) noexcept -#ifdef HAVE_PMEM + static parse_mtr_result parse_mmap(bool if_exists) noexcept +#ifdef HAVE_INNODB_MMAP ; #else { return parse_mtr(if_exists); } diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 1b4f70b683b67..70a136273ee28 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -170,6 +170,9 @@ using the call command. */ #define UNIV_INLINE static inline #define UNIV_WORD_SIZE SIZEOF_SIZE_T +#if SIZEOF_SIZE_T == 8 +# define HAVE_INNODB_MMAP +#endif /** The following alignment is used in memory allocations in memory heap management to ensure correct alignment for doubles etc. */ @@ -199,7 +202,7 @@ and 2 bits for flags. This limits the uncompressed page size to 16k. /* Define the Min, Max, Default page sizes. */ /** Minimum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MIN 12U -/** log2 of largest page size (1<<16 == 64436 bytes). */ +/** log2 of largest page size (1<<16 == 65536 bytes). */ /** Maximum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MAX 16U /** log2 of default page size (1<<14 == 16384 bytes). 
*/ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index ea717de226a9a..f03dea23d9698 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -86,11 +86,7 @@ void log_t::set_capacity() log_sys.max_checkpoint_age = margin; } -#ifdef HAVE_PMEM -void log_t::create_low() -#else -bool log_t::create() -#endif +void log_t::create() { ut_ad(this == &log_sys); ut_ad(!is_initialised()); @@ -101,35 +97,10 @@ bool log_t::create() need_checkpoint.store(true, std::memory_order_relaxed); write_lsn= FIRST_LSN; -#ifndef HAVE_PMEM - buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); - if (!buf) - { - alloc_fail: - sql_print_error("InnoDB: Cannot allocate memory;" - " too large innodb_log_buffer_size?"); - return false; - } - flush_buf= static_cast(ut_malloc_dontdump(buf_size, - PSI_INSTRUMENT_ME)); - if (!flush_buf) - { - ut_free_dodump(buf, buf_size); - buf= nullptr; - goto alloc_fail; - } - - TRASH_ALLOC(buf, buf_size); - TRASH_ALLOC(flush_buf, buf_size); - checkpoint_buf= static_cast(aligned_malloc(4096, 4096)); - memset_aligned<4096>(checkpoint_buf, 0, 4096); - max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; -#else ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); max_buf_free= 1; -#endif latch.SRW_LOCK_INIT(log_latch_key); @@ -143,9 +114,6 @@ bool log_t::create() set_buf_free(0); ut_ad(is_initialised()); -#ifndef HAVE_PMEM - return true; -#endif } dberr_t log_file_t::close() noexcept @@ -177,22 +145,72 @@ void log_file_t::write(os_offset_t offset, span buf) noexcept << IF_WIN(GetLastError(), errno) << "."; } -#ifdef HAVE_PMEM -# include "cache.h" +#ifdef HAVE_INNODB_MMAP +# ifdef HAVE_PMEM +# include "cache.h" +# endif /** Attempt to memory map a file. 
@param file log file handle @param size file size @return pointer to memory mapping @retval MAP_FAILED if the memory cannot be mapped */ -static void *log_mmap(os_file_t file, os_offset_t size) +static void *log_mmap(os_file_t file, +# ifdef HAVE_PMEM + bool &is_pmem, /*!< whether the file is on pmem */ +# endif + os_offset_t size) { + if (my_system_page_size > 4096) + return MAP_FAILED; +# ifndef HAVE_PMEM + if (!log_sys.log_mmap) + return MAP_FAILED; +# endif + + const bool read_only= + srv_read_only_mode || srv_operation >= SRV_OPERATION_BACKUP; + +# ifdef _WIN32 + void *ptr= MAP_FAILED; + if (HANDLE h= CreateFileMappingA(file, nullptr, PAGE_READWRITE, + DWORD(size >> 32), DWORD(size), nullptr)) + { + ptr= MapViewOfFileEx(h, read_only ? FILE_MAP_READ : FILE_MAP_WRITE, + 0, 0, size, nullptr); + CloseHandle(h); + if (!ptr) + ptr= MAP_FAILED; + else + log_sys.set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); + } +# else + int prot= PROT_READ, flags= MAP_SHARED; + + if (!read_only) +# ifdef HAVE_PMEM + flags= MAP_SHARED_VALIDATE | MAP_SYNC, +# endif + prot= PROT_READ | PROT_WRITE; + void *ptr= - my_mmap(0, size_t(size), - srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE, - MAP_SHARED_VALIDATE | MAP_SYNC, file, 0); -#ifdef __linux__ - if (ptr == MAP_FAILED) + my_mmap(0, size_t(size), prot, flags, file, 0); + +# ifdef HAVE_PMEM + is_pmem= ptr != MAP_FAILED; +# endif + + if (ptr != MAP_FAILED) + { +# ifdef HAVE_PMEM + log_sys.set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); +# endif + return ptr; + } + +# ifdef HAVE_PMEM +# ifdef __linux__ /* On Linux, we pretend that /dev/shm is PMEM */ + if (srv_operation < SRV_OPERATION_BACKUP) { struct stat st; if (!fstat(file, &st)) @@ -202,49 +220,82 @@ static void *log_mmap(os_file_t file, os_offset_t size) if (!stat("/dev/shm", &st)) { MSAN_STAT_WORKAROUND(&st); - if (st.st_dev == st_dev) - ptr= my_mmap(0, size_t(size), - srv_read_only_mode ? 
PROT_READ : PROT_READ | PROT_WRITE, - MAP_SHARED, file, 0); + is_pmem= st.st_dev == st_dev; } } } -#endif /* __linux__ */ + if (!is_pmem) +# endif + if (!log_sys.log_mmap) + return ptr; + ptr= my_mmap(0, size_t(size), prot, MAP_SHARED, file, 0); +# endif /* HAVE_PMEM */ +# endif return ptr; } #endif -#ifdef HAVE_PMEM -bool log_t::attach(log_file_t file, os_offset_t size) -#else -void log_t::attach_low(log_file_t file, os_offset_t size) +#if defined __linux__ || defined _WIN32 +/** Display a message about opening the log */ +ATTRIBUTE_COLD static void log_file_message() +{ + sql_print_information("InnoDB: %s (block size=%u bytes)", +# ifdef HAVE_INNODB_MMAP + log_sys.log_mmap + ? (log_sys.log_buffered + ? "Memory-mapped log" + : "Memory-mapped unbuffered log") + : +# endif + log_sys.log_buffered + ? "Buffered log writes" + : "File system buffers for log disabled", + log_sys.get_block_size()); +} #endif + +bool log_t::attach(log_file_t file, os_offset_t size) { log= file; ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); file_size= size; -#ifdef HAVE_PMEM ut_ad(!buf); ut_ad(!flush_buf); - if (size && !(size_t(size) & 4095) && srv_operation != SRV_OPERATION_BACKUP) +#ifdef HAVE_INNODB_MMAP + if (size) { - void *ptr= log_mmap(log.m_file, size); +#ifdef HAVE_PMEM + bool is_pmem; +#endif + void *ptr= ::log_mmap(log.m_file, +#ifdef HAVE_PMEM + is_pmem, +#endif + size); if (ptr != MAP_FAILED) { - log.close(); - mprotect(ptr, size_t(size), PROT_READ); +# ifdef HAVE_PMEM + if (is_pmem) + { + log.close(); + log_buffered= false; + log_maybe_unbuffered= true; + } +# endif + IF_WIN(,mprotect(ptr, size_t(size), PROT_READ)); buf= static_cast(ptr); max_buf_free= 1; -# if defined __linux__ || defined _WIN32 - set_block_size(CPU_LEVEL1_DCACHE_LINESIZE); -# endif - log_maybe_unbuffered= true; - log_buffered= false; mtr_t::finisher_update(); +# ifdef HAVE_PMEM + if (!is_pmem) +# endif + goto func_exit; return true; } } + log_mmap= false; +#endif buf= 
static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); if (!buf) { @@ -258,30 +309,34 @@ void log_t::attach_low(log_file_t file, os_offset_t size) PSI_INSTRUMENT_ME)); if (!flush_buf) { + alloc_fail2: ut_free_dodump(buf, buf_size); buf= nullptr; goto alloc_fail; } + checkpoint_buf= static_cast(aligned_malloc(get_block_size(), + get_block_size())); + if (!checkpoint_buf) + { + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + goto alloc_fail2; + } + TRASH_ALLOC(buf, buf_size); TRASH_ALLOC(flush_buf, buf_size); max_buf_free= buf_size / LOG_BUF_FLUSH_RATIO - LOG_BUF_FLUSH_MARGIN; -#endif + mtr_t::finisher_update(); + memset_aligned<64>(checkpoint_buf, 0, get_block_size()); +#ifdef HAVE_INNODB_MMAP + func_exit: +#endif #if defined __linux__ || defined _WIN32 - sql_print_information("InnoDB: %s (block size=%u bytes)", - log_buffered - ? "Buffered log writes" - : "File system buffers for log disabled", - block_size); + log_file_message(); #endif - - mtr_t::finisher_update(); -#ifdef HAVE_PMEM - checkpoint_buf= static_cast(aligned_malloc(block_size, block_size)); - memset_aligned<64>(checkpoint_buf, 0, block_size); return true; -#endif } /** Write a log file header. 
@@ -324,12 +379,29 @@ void log_t::create(lsn_t lsn) noexcept last_checkpoint_lsn= 0; -#ifdef HAVE_PMEM - if (is_pmem()) + DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); + +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) { - mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + IF_WIN(,mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE)); memset_aligned<4096>(buf, 0, 4096); + log_sys.header_write(buf, lsn, is_encrypted()); set_buf_free(START_OFFSET); + +# ifdef HAVE_PMEM + if (is_pmem()) + pmem_persist(buf, 512); + else +# endif + { +# ifdef _WIN32 + FlushViewOfFile(buf, 512); + log.flush(); +# else + msync(buf, my_system_page_size, MS_SYNC); +# endif + } } else #endif @@ -337,47 +409,53 @@ void log_t::create(lsn_t lsn) noexcept set_buf_free(0); memset_aligned<4096>(flush_buf, 0, buf_size); memset_aligned<4096>(buf, 0, buf_size); - } - - log_sys.header_write(buf, lsn, is_encrypted()); - DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); - -#ifdef HAVE_PMEM - if (is_pmem()) - pmem_persist(buf, 512); - else -#endif - { + log_sys.header_write(buf, lsn, is_encrypted()); log.write(0, {buf, 4096}); memset_aligned<512>(buf, 0, 512); } } +#ifdef HAVE_INNODB_MMAP +void log_t::close_file(bool really_close) +#else void log_t::close_file() +#endif { -#ifdef HAVE_PMEM - if (is_pmem()) +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) { - ut_ad(!is_opened()); ut_ad(!checkpoint_buf); + ut_ad(!flush_buf); if (buf) { my_munmap(buf, file_size); buf= nullptr; } - return; + } + else +#endif + { + ut_ad(!buf == !flush_buf); + ut_ad(!buf == !checkpoint_buf); +#ifdef SAFEMALLOC + if (buf) +#endif + { + ut_free_dodump(buf, buf_size); + buf= nullptr; + ut_free_dodump(flush_buf, buf_size); + flush_buf= nullptr; + } + aligned_free(checkpoint_buf); + checkpoint_buf= nullptr; } - ut_free_dodump(buf, buf_size); - buf= nullptr; - ut_free_dodump(flush_buf, buf_size); - flush_buf= nullptr; - aligned_free(checkpoint_buf); - checkpoint_buf= nullptr; +#ifdef HAVE_INNODB_MMAP + if (really_close) 
#endif - if (is_opened()) - if (const dberr_t err= log.close()) - ib::fatal() << "closing ib_logfile0 failed: " << err; + if (is_opened()) + if (const dberr_t err= log.close()) + ib::fatal() << "closing ib_logfile0 failed: " << err; } /** Acquire all latches that protect the log. */ @@ -426,11 +504,7 @@ void log_t::set_buffered(bool buffered) OS_FILE_OPEN, OS_FILE_NORMAL, OS_LOG_FILE, false, &success); ut_a(log.m_file != OS_FILE_CLOSED); - sql_print_information("InnoDB: %s (block size=%u bytes)", - log_buffered - ? "Buffered log writes" - : "File system buffers for log disabled", - block_size); + log_file_message(); } log_resize_release(); } @@ -449,6 +523,9 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept resize_start_status status= RESIZE_NO_CHANGE; lsn_t start_lsn{0}; +#ifdef HAVE_PMEM + bool is_pmem{false}; +#endif if (resize_in_progress()) status= RESIZE_IN_PROGRESS; @@ -472,10 +549,15 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept void *ptr= nullptr, *ptr2= nullptr; success= os_file_set_size(path.c_str(), resize_log.m_file, size); if (!success); -#ifdef HAVE_PMEM - else if (is_pmem()) +#ifdef HAVE_INNODB_MMAP + else if (is_mmap()) { - ptr= log_mmap(resize_log.m_file, size); + ptr= ::log_mmap(resize_log.m_file, +#ifdef HAVE_PMEM + is_pmem, +#endif + size); + if (ptr == MAP_FAILED) goto alloc_fail; } @@ -513,12 +595,11 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size) noexcept resize_target= size; resize_buf= static_cast(ptr); resize_flush_buf= static_cast(ptr2); - if (is_pmem()) - { - resize_log.close(); +#ifdef HAVE_INNODB_MMAP + if (is_mmap()) start_lsn= get_lsn(); - } else +#endif { memcpy_aligned<16>(resize_buf, buf, (buf_free + 15) & ~15); start_lsn= first_lsn + @@ -552,14 +633,13 @@ void log_t::resize_abort() noexcept if (resize_in_progress() > 1) { - if (!is_pmem()) + if (!is_mmap()) { - resize_log.close(); ut_free_dodump(resize_buf, buf_size); ut_free_dodump(resize_flush_buf, 
buf_size); resize_flush_buf= nullptr; } -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP else { ut_ad(!resize_log.is_opened()); @@ -568,6 +648,8 @@ void log_t::resize_abort() noexcept my_munmap(resize_buf, resize_target); } #endif + if (resize_log.is_opened()) + resize_log.close(); resize_buf= nullptr; resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); @@ -732,9 +814,7 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #endif #ifdef HAVE_PMEM -/** Persist the log. -@param lsn desired new value of flushed_to_disk_lsn */ -inline void log_t::persist(lsn_t lsn) noexcept +void log_t::persist(lsn_t lsn) noexcept { ut_ad(is_pmem()); ut_ad(!write_lock.is_owner()); @@ -753,12 +833,11 @@ inline void log_t::persist(lsn_t lsn) noexcept if (UNIV_UNLIKELY(end < start)) { - pmem_persist(log_sys.buf + start, log_sys.file_size - start); - pmem_persist(log_sys.buf + log_sys.START_OFFSET, - end - log_sys.START_OFFSET); + pmem_persist(buf + start, file_size - start); + pmem_persist(buf + START_OFFSET, end - START_OFFSET); } else - pmem_persist(log_sys.buf + start, end - start); + pmem_persist(buf + start, end - start); old= flushed_to_disk_lsn.load(std::memory_order_relaxed); @@ -778,6 +857,62 @@ inline void log_t::persist(lsn_t lsn) noexcept } #endif +#ifdef HAVE_INNODB_MMAP +lsn_t log_t::write_mmap(durability durable) noexcept +{ + ut_ad(write_lock.is_owner()); + ut_ad(!durable || flush_lock.is_owner()); + ut_ad(latch_have_wr()); + + const lsn_t end_lsn{get_lsn(std::memory_order_relaxed)}; + if (UNIV_LIKELY(durable != SYNC)) + latch.wr_unlock(); + write_lock.set_pending(end_lsn); + const size_t ps_1{my_system_page_size - 1}; + const size_t start{size_t(calc_lsn_offset(write_lsn)) & ~ps_1}; + const size_t end{(size_t(calc_lsn_offset(end_lsn)) + ps_1) & ~ps_1}; + +# ifndef _WIN32 + const int flags= durable != WRITE ? 
MS_SYNC : MS_ASYNC; + if (UNIV_UNLIKELY(end < start)) + { + msync(buf + start, file_size - start, flags); + msync(buf + START_OFFSET, end - START_OFFSET, flags); + } + else if (UNIV_LIKELY(end != start)) + msync(buf + start, end - start, flags); +# else + if (UNIV_UNLIKELY(end < start)) + { + FlushViewOfFile(buf + start, file_size - start); + FlushViewOfFile(buf + START_OFFSET, end - START_OFFSET); + } + else if (UNIV_LIKELY(end != start)) + FlushViewOfFile(log_sys.buf + start, end - start); +# endif + + write_lsn= end_lsn; + + const lsn_t pending_write_lsn{write_lock.release(end_lsn)}; + switch (durable) { + case WRITE: + break; + case DURABLE: +# ifdef _WIN32 + break; +# endif + case SYNC: +# ifdef _WIN32 + flush(end_lsn); +# endif + flushed_to_disk_lsn.store(end_lsn, std::memory_order_release); + log_flush_notify(end_lsn); + flush_lock.release(end_lsn); + } + return pending_write_lsn; +} +#endif + /** Write resize_buf to resize_log. @param length the used length of resize_buf */ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept @@ -811,7 +946,7 @@ ATTRIBUTE_COLD void log_t::resize_write_buf(size_t length) noexcept template inline lsn_t log_t::write_buf() noexcept { ut_ad(latch_have_wr()); - ut_ad(!is_pmem()); + ut_ad(!is_mmap()); ut_ad(!srv_read_only_mode); const lsn_t lsn{get_lsn(std::memory_order_relaxed)}; @@ -955,7 +1090,6 @@ void log_write_up_to(lsn_t lsn, bool durable, { if (flush_lock.acquire(lsn, callback) != group_commit_lock::ACQUIRED) return; - flush_lock.set_pending(log_sys.get_lsn()); } lsn_t pending_write_lsn= 0, pending_flush_lsn= 0; @@ -964,16 +1098,31 @@ void log_write_up_to(lsn_t lsn, bool durable, group_commit_lock::ACQUIRED) { log_sys.latch.wr_lock(SRW_LOCK_CALL); - pending_write_lsn= write_lock.release(log_sys.write_buf()); + +#ifdef HAVE_INNODB_MMAP + if (log_sys.is_mmap()) + { + pending_write_lsn= + log_sys.write_mmap(durable ? 
log_sys.DURABLE : log_sys.WRITE); +# ifndef _WIN32 + if (!pending_write_lsn) + return; + goto repeat_loop; +# endif + } + else +#endif + pending_write_lsn= write_lock.release(log_sys.write_buf()); } if (durable) - { pending_flush_lsn= log_flush(write_lock.value()); - } if (pending_write_lsn || pending_flush_lsn) { +#if defined HAVE_INNODB_MMAP && !defined _WIN32 + repeat_loop: +#endif /* There is no new group commit lead; some async waiters could stall. */ callback= &dummy_callback; lsn= std::max(pending_write_lsn, pending_flush_lsn); @@ -1000,20 +1149,80 @@ ATTRIBUTE_COLD void log_write_and_flush_prepare() group_commit_lock::ACQUIRED); } +#ifdef HAVE_INNODB_MMAP +/** Try to enable or disable memory-mapped access (update log_mmap) */ +ATTRIBUTE_COLD void log_t::set_mmap(bool mmap) +{ + if (is_pmem() || high_level_read_only) + return; + log_resize_acquire(); + if (!resize_in_progress() && is_opened() && bool(log_mmap) != mmap) + { + alignas(16) byte log_block[max_block_size]; + const size_t bs{get_block_size()}; + const size_t bf{buf_free.load(std::memory_order_relaxed)}; + lsn_t end_lsn; + { + byte *const b= buf; + if (is_mmap()) + { + IF_WIN(FlushViewOfFile(b, file_size), msync(b, file_size, MS_ASYNC)); + write_lsn= end_lsn= get_lsn(std::memory_order_relaxed); + } + else + { + ut_ad(bf < max_buf_free); + end_lsn= write_buf(); + } + + memcpy_aligned<16>(log_block, b + (bf & ~(bs - 1)), bs); + } + + flush(end_lsn); + flushed_to_disk_lsn.store(end_lsn, std::memory_order_release); + log_flush_notify(end_lsn); + + close_file(false); + log_mmap= mmap; + ut_a(attach(log, file_size)); + ut_ad(is_mmap() == bool(log_mmap)); + + if (log_mmap) + { + const size_t new_buf_free{calc_lsn_offset(end_lsn)}; + set_buf_free(new_buf_free); + ut_ad(!memcmp_aligned<16>(log_sys.buf + (new_buf_free & ~(bs - 1)), + log_block, new_buf_free & (bs - 1) & ~15)); + IF_WIN(,mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE)); + } + else + { + set_buf_free(bf & (bs - 1)); + 
memcpy_aligned<16>(log_sys.buf, log_block, bs); + } + } + log_resize_release(); +} +#endif + /** Durably write the log up to log_sys.get_lsn(). */ ATTRIBUTE_COLD void log_write_and_flush() { ut_ad(!srv_read_only_mode); - if (!log_sys.is_pmem()) + if (!log_sys.is_mmap()) { const lsn_t lsn{log_sys.write_buf()}; write_lock.release(lsn); log_flush(lsn); } #ifdef HAVE_PMEM - else + else if (log_sys.is_pmem()) log_sys.persist(log_sys.get_lsn()); #endif +#ifdef HAVE_INNODB_MMAP + else + log_sys.write_mmap(log_sys.SYNC); +#endif } /****************************************************************//** @@ -1291,18 +1500,9 @@ void log_t::close() if (!is_initialised()) return; close_file(); -#ifndef HAVE_PMEM - ut_free_dodump(buf, buf_size); - buf= nullptr; - ut_free_dodump(flush_buf, buf_size); - flush_buf= nullptr; - aligned_free(checkpoint_buf); - checkpoint_buf= nullptr; -#else ut_ad(!checkpoint_buf); ut_ad(!buf); ut_ad(!flush_buf); -#endif latch.destroy(); diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 6b6a686823cb4..40ece54454094 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -1632,7 +1632,7 @@ ATTRIBUTE_COLD static dberr_t recv_log_recover_pre_10_2() byte *buf= const_cast(field_ref_zero); - if (source_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + if (source_offset < (log_sys.is_mmap() ? log_sys.file_size : 4096)) memcpy_aligned<512>(buf, &log_sys.buf[source_offset & ~511], 512); else if (dberr_t err= recv_sys.read(source_offset & ~511, {buf, 512})) @@ -1671,7 +1671,7 @@ static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) { byte *buf= const_cast(field_ref_zero); - if (lsn_offset < (log_sys.is_pmem() ? log_sys.file_size : 4096)) + if (lsn_offset < (log_sys.is_mmap() ? 
log_sys.file_size : 4096)) memcpy_aligned<512>(buf, &log_sys.buf[lsn_offset & ~511], 512); else { @@ -1772,7 +1772,7 @@ dberr_t recv_sys_t::find_checkpoint() log_sys.next_checkpoint_lsn= 0; lsn= 0; buf= my_assume_aligned<4096>(log_sys.buf); - if (!log_sys.is_pmem()) + if (!log_sys.is_mmap()) if (dberr_t err= log_sys.log.read(0, {buf, 4096})) return err; /* Check the header page checksum. There was no @@ -1842,7 +1842,7 @@ dberr_t recv_sys_t::find_checkpoint() for (size_t field= log_t::CHECKPOINT_1; field <= log_t::CHECKPOINT_2; field+= log_t::CHECKPOINT_2 - log_t::CHECKPOINT_1) { - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) buf= log_sys.buf + field; else if (dberr_t err= log_sys.log.read(field, @@ -2215,7 +2215,7 @@ static void store_freed_or_init_rec(page_id_t page_id, bool freed) /** Wrapper for log_sys.buf[] between recv_sys.offset and recv_sys.len */ struct recv_buf { - bool is_pmem() const noexcept { return log_sys.is_pmem(); } + bool is_mmap() const noexcept { return log_sys.is_mmap(); } const byte *ptr; @@ -2306,11 +2306,11 @@ struct recv_buf } }; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP /** Ring buffer wrapper for log_sys.buf[]; recv_sys.len == log_sys.file_size */ struct recv_ring : public recv_buf { - static constexpr bool is_pmem() { return true; } + static constexpr bool is_mmap() { return true; } constexpr recv_ring(const byte *ptr) : recv_buf(ptr) {} @@ -2603,7 +2603,7 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse(source &l, bool if_exists) ut_d(const source el{l}); lsn+= l - begin; offset= l.ptr - log_sys.buf; - if (!l.is_pmem()); + if (!l.is_mmap()); else if (offset == log_sys.file_size) offset= log_sys.START_OFFSET; else @@ -3116,12 +3116,12 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) noexcept template recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool) noexcept; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP template -recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept 
+recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool if_exists) noexcept { recv_sys_t::parse_mtr_result r{parse_mtr(if_exists)}; - if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_pmem()) + if (UNIV_LIKELY(r != PREMATURE_EOF) || !log_sys.is_mmap()) return r; ut_ad(recv_sys.len == log_sys.file_size); ut_ad(recv_sys.offset >= log_sys.START_OFFSET); @@ -3132,6 +3132,10 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_pmem(bool if_exists) noexcept : &log_sys.buf[recv_sys.offset]}; return recv_sys.parse(s, if_exists); } + +/** for mariadb-backup; @see xtrabackup_copy_mmap_logfile() */ +template +recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool) noexcept; #endif /** Apply the hashed log records to the page, if the page lsn is less than the @@ -4001,8 +4005,8 @@ void recv_sys_t::apply(bool last_batch) in ascending order of buf_page_t::oldest_modification. */ log_sort_flush_list(); -#ifdef HAVE_PMEM - if (last_batch && log_sys.is_pmem()) +#if defined HAVE_INNODB_MMAP && !defined _WIN32 + if (last_batch && log_sys.is_mmap()) mprotect(log_sys.buf, len, PROT_READ | PROT_WRITE); #endif @@ -4030,15 +4034,13 @@ static bool recv_scan_log(bool last_phase) bool store{recv_sys.file_checkpoint != 0}; size_t buf_size= log_sys.buf_size; -#ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) { recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn)); buf_size= size_t(log_sys.file_size); recv_sys.len= size_t(log_sys.file_size); } else -#endif { recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) & block_size_1; @@ -4100,7 +4102,7 @@ static bool recv_scan_log(bool last_phase) for (;;) { const byte& b{log_sys.buf[recv_sys.offset]}; - r= recv_sys.parse_pmem(false); + r= recv_sys.parse_mmap(false); switch (r) { case recv_sys_t::PREMATURE_EOF: goto read_more; @@ -4130,7 +4132,7 @@ static bool recv_scan_log(bool last_phase) else { ut_ad(recv_sys.file_checkpoint != 0); - switch ((r= recv_sys.parse_pmem(false))) { + switch ((r= 
recv_sys.parse_mmap(false))) { case recv_sys_t::PREMATURE_EOF: goto read_more; case recv_sys_t::GOT_EOF: @@ -4152,11 +4154,11 @@ static bool recv_scan_log(bool last_phase) if (!store) skip_the_rest: - while ((r= recv_sys.parse_pmem(false)) == recv_sys_t::OK); + while ((r= recv_sys.parse_mmap(false)) == recv_sys_t::OK); else { uint16_t count= 0; - while ((r= recv_sys.parse_pmem(last_phase)) == recv_sys_t::OK) + while ((r= recv_sys.parse_mmap(last_phase)) == recv_sys_t::OK) if (!++count && recv_sys.report(time(nullptr))) { const size_t n= recv_sys.pages.size(); @@ -4195,10 +4197,9 @@ static bool recv_scan_log(bool last_phase) } read_more: -#ifdef HAVE_PMEM - if (log_sys.is_pmem()) + if (log_sys.is_mmap()) break; -#endif + if (recv_sys.is_corrupt_log()) break; @@ -4713,14 +4714,14 @@ dberr_t recv_recovery_from_checkpoint_start() if (!srv_read_only_mode && log_sys.is_latest()) { ut_ad(log_sys.get_flushed_lsn() == log_sys.get_lsn()); ut_ad(recv_sys.lsn == log_sys.get_lsn()); - if (!log_sys.is_pmem()) { + if (!log_sys.is_mmap()) { const size_t bs_1{log_sys.get_block_size() - 1}; const size_t ro{recv_sys.offset}; recv_sys.offset &= bs_1; memmove_aligned<64>(log_sys.buf, log_sys.buf + (ro & ~bs_1), log_sys.get_block_size()); -#ifdef HAVE_PMEM +#ifndef _WIN32 } else { mprotect(log_sys.buf, size_t(log_sys.file_size), PROT_READ | PROT_WRITE); diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 74d3adb28717f..f287ade16ead3 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -39,17 +39,16 @@ Created 11/26/1995 Heikki Tuuri #include "mariadb_stats.h" #include "my_cpu.h" -#ifdef HAVE_PMEM void (*mtr_t::commit_logger)(mtr_t *, std::pair); -#endif + std::pair (*mtr_t::finisher)(mtr_t *, size_t); unsigned mtr_t::spin_wait_delay; void mtr_t::finisher_update() { ut_ad(log_sys.latch_have_wr()); -#ifdef HAVE_PMEM - if (log_sys.is_pmem()) + + if (log_sys.is_mmap()) { commit_logger= mtr_t::commit_log; finisher= 
spin_wait_delay @@ -57,7 +56,7 @@ void mtr_t::finisher_update() return; } commit_logger= mtr_t::commit_log; -#endif + finisher= (spin_wait_delay ? mtr_t::finish_writer : mtr_t::finish_writer); @@ -351,11 +350,11 @@ inline lsn_t log_t::get_write_target() const return write_lsn + max_buf_free / 2; } -template +template void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) { size_t modified= 0; - const lsn_t write_lsn= pmem ? 0 : log_sys.get_write_target(); + const lsn_t write_lsn= mmap ? 0 : log_sys.get_write_target(); if (mtr->m_made_dirty) { @@ -475,7 +474,7 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) if (UNIV_UNLIKELY(lsns.second != PAGE_FLUSH_NO)) buf_flush_ahead(mtr->m_commit_lsn, lsns.second == PAGE_FLUSH_SYNC); - if (!pmem && UNIV_UNLIKELY(write_lsn != 0)) + if (!mmap && UNIV_UNLIKELY(write_lsn != 0)) log_write_up_to(write_lsn, false); } @@ -501,11 +500,7 @@ void mtr_t::commit() ut_ad(!srv_read_only_mode); std::pair lsns{do_write()}; process_freed_pages(); -#ifdef HAVE_PMEM commit_logger(this, lsns); -#else - commit_log(this, lsns); -#endif } else { @@ -963,7 +958,6 @@ void log_t::lsn_lock_bts() noexcept # endif } -inline #else ATTRIBUTE_NOINLINE #endif @@ -1011,7 +1005,7 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) else latch.rd_unlock(); - log_write_up_to(lsn, is_pmem()); + log_write_up_to(lsn, is_mmap()); if (ex) latch.wr_lock(SRW_LOCK_CALL); @@ -1027,16 +1021,16 @@ ATTRIBUTE_COLD size_t log_t::append_prepare_wait(size_t b, bool ex, lsn_t lsn) /** Reserve space in the log buffer for appending data. @tparam spin whether to use the spin-only lock_lsn() -@tparam pmem log_sys.is_pmem() +@tparam mmap log_sys.is_mmap() @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? 
latch_have_wr() : latch_have_rd()); - ut_ad(pmem == is_pmem()); + ut_ad(mmap == is_mmap()); if (!spin) lsn_lock.wr_lock(); size_t b{spin ? lock_lsn() : buf_free.load(std::memory_order_relaxed)}; @@ -1044,14 +1038,14 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept const lsn_t l{lsn.load(std::memory_order_relaxed)}, end_lsn{l + size}; - if (UNIV_UNLIKELY(pmem + if (UNIV_UNLIKELY(mmap ? (end_lsn - get_flushed_lsn(std::memory_order_relaxed)) > capacity() : b + size >= buf_size)) b= append_prepare_wait(b, ex, l); size_t new_buf_free= b + size; - if (pmem && new_buf_free >= file_size) + if (mmap && new_buf_free >= file_size) new_buf_free-= size_t(capacity()); lsn.store(end_lsn, std::memory_order_relaxed); @@ -1207,10 +1201,10 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, end-= len; size_t s; -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP if (!resize_flush_buf) { - ut_ad(is_pmem()); + ut_ad(is_mmap()); const size_t resize_capacity{resize_target - START_OFFSET}; const lsn_t resizing{resize_in_progress()}; if (UNIV_UNLIKELY(lsn < resizing)) @@ -1238,7 +1232,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, /* The destination buffer (log_sys.resize_buf) did not wrap around */ memcpy(resize_buf + s, end + capacity(), l); memcpy(resize_buf + s + l, &buf[START_OFFSET], len - l); - goto pmem_nowrap; + goto mmap_nowrap; } else { @@ -1260,7 +1254,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, memcpy(resize_buf + START_OFFSET + (l - rl), &buf[START_OFFSET], len - l); } - goto pmem_wrap; + goto mmap_wrap; } } else @@ -1270,7 +1264,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, if (UNIV_LIKELY(s + len <= resize_target)) { memcpy(resize_buf + s, end, len); - pmem_nowrap: + mmap_nowrap: s+= len - seq; } else @@ -1279,7 +1273,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, memcpy(resize_buf + s, end, resize_target - s); 
memcpy(resize_buf + START_OFFSET, end + (resize_target - s), len - (resize_target - s)); - pmem_wrap: + mmap_wrap: s+= len - seq; if (s >= resize_target) s-= resize_capacity; @@ -1304,7 +1298,7 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, } } -template +template std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) { @@ -1315,16 +1309,14 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) const size_t size{mtr->m_commit_lsn ? 5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); - if (!pmem) + if (!mmap) { mtr->m_log.for_each_block([&start](const mtr_buf_t::block_t *b) { log_sys.append(start.second, b->begin(), b->used()); return true; }); -#ifdef HAVE_PMEM write_trailer: -#endif *start.second++= log_sys.get_sequence_bit(start.first + len - size); if (mtr->m_commit_lsn) { @@ -1335,7 +1327,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) mach_write_to_4(start.second, mtr->m_crc); start.second+= 4; } -#ifdef HAVE_PMEM else { if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) @@ -1383,9 +1374,6 @@ mtr_t::finish_writer(mtr_t *mtr, size_t len) ((size >= size_left) ? 
log_sys.START_OFFSET : log_sys.file_size) + (size - size_left); } -#else - static_assert(!pmem, ""); -#endif log_sys.resize_write(start.first, start.second, len, size); diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 45cad964da8f1..6ef348a7179c2 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1098,7 +1098,8 @@ static lsn_t srv_prepare_to_delete_redo_log_file() log_sys.latch.wr_unlock(); - log_write_up_to(flushed_lsn, false); + if (latest_format) + log_write_up_to(flushed_lsn, false); ut_ad(flushed_lsn == log_sys.get_lsn()); ut_ad(!os_aio_pending_reads()); @@ -1294,10 +1295,7 @@ dberr_t srv_start(bool create_new_db) } #endif /* UNIV_DEBUG */ - if (!log_sys.create()) { - return srv_init_abort(DB_ERROR); - } - + log_sys.create(); recv_sys.create(); lock_sys.create(srv_lock_table_size); @@ -1860,13 +1858,13 @@ dberr_t srv_start(bool create_new_db) if (srv_print_verbose_log) { sql_print_information("InnoDB: " "log sequence number " LSN_PF -#ifdef HAVE_PMEM +#ifdef HAVE_INNODB_MMAP "%s" #endif "; transaction id " TRX_ID_FMT, recv_sys.lsn, -#ifdef HAVE_PMEM - log_sys.is_pmem() +#ifdef HAVE_INNODB_MMAP + log_sys.is_mmap() ? " (memory-mapped)" : "", #endif trx_sys.get_max_trx_id());