map_handle cache now looks to be working well.
ned14 committed Aug 21, 2021
1 parent 46f0760 commit 2917dc7
Showing 5 changed files with 101 additions and 33 deletions.
6 changes: 3 additions & 3 deletions include/llfio/revision.hpp
@@ -1,4 +1,4 @@
// Note the second line of this file must ALWAYS be the git SHA, third line ALWAYS the git SHA update time
#define LLFIO_PREVIOUS_COMMIT_REF d3ff87fd8b91f06d4f7bd240860ef68f15a03621
#define LLFIO_PREVIOUS_COMMIT_DATE "2021-08-20 20:17:59 +00:00"
#define LLFIO_PREVIOUS_COMMIT_UNIQUE d3ff87fd
#define LLFIO_PREVIOUS_COMMIT_REF 46f0760e7ed2b80c46acd91bacc130049620bee6
#define LLFIO_PREVIOUS_COMMIT_DATE "2021-08-21 13:31:38 +00:00"
#define LLFIO_PREVIOUS_COMMIT_UNIQUE 46f0760e
34 changes: 23 additions & 11 deletions include/llfio/v2.0/detail/impl/map_handle.ipp
@@ -57,7 +57,7 @@ namespace detail
size_t trie_count{0};
map_handle_cache_item_t *trie_children[8 * sizeof(size_t)];
bool trie_nobbledir{0};
size_t bytes_in_cache{0};
size_t bytes_in_cache{0}, hits{0}, misses{0};
};
static const size_t page_size_shift = [] { return QUICKCPPLIB_NAMESPACE::algorithm::bitwise_trie::detail::bitscanr(utils::page_size()); }();
class map_handle_cache_t : protected QUICKCPPLIB_NAMESPACE::algorithm::bitwise_trie::bitwise_trie<map_handle_cache_base_t, map_handle_cache_item_t>
@@ -66,7 +66,11 @@ namespace detail
using _lock_guard = std::unique_lock<std::mutex>;

public:
#ifdef __linux__
std::atomic<unsigned> do_not_store_failed_count{0};
#endif

~map_handle_cache_t() { trim_cache(std::chrono::steady_clock::now(), (size_t)-1); }

using _base::size;
void *get(size_t bytes, size_t page_size)
@@ -79,8 +83,10 @@
}
if(it == _base::end() || page_size != it->page_size || _bytes != it->trie_key)
{
misses++;
return nullptr;
}
hits++;
auto *p = *it;
_base::erase(it);
_base::bytes_in_cache -= bytes;
@@ -98,43 +104,49 @@
_base::insert(p);
_base::bytes_in_cache += bytes;
}
map_handle::cache_statistics trim_cache(std::chrono::steady_clock::time_point older_than)
map_handle::cache_statistics trim_cache(std::chrono::steady_clock::time_point older_than, size_t max_items)
{
_lock_guard g(lock);
map_handle::cache_statistics ret;
if(older_than != std::chrono::steady_clock::time_point())

if(older_than != std::chrono::steady_clock::time_point() && max_items > 0)
{
for(auto it = _base::begin(); it != _base::end();)
// Prefer bigger items to trim than smaller ones
for(auto it = --_base::end(); it != _base::end() && max_items > 0;)
{
if(it->when_added <= older_than)
{
auto *p = *it;
it = _base::erase(it);
_base::erase(it--);
const auto _bytes = p->trie_key << page_size_shift;
#ifdef _WIN32
if(!win32_release_nonfile_allocations((byte *) p->addr, _bytes, MEM_RELEASE))
#else
if(-1 == ::munmap(p->addr, _bytes))
#endif
{
LLFIO_LOG_FATAL(nullptr, "map_handle cache failed to trim a map! If on Linux, you may have exceeded the "
"64k VMA process limit, set the LLFIO_DEBUG_LINUX_MUNMAP macro at the top of posix/map_handle.ipp to cause dumping of VMAs to "
"/tmp/llfio_unmap_debug_smaps.txt, and combine with strace to figure it out.");
LLFIO_LOG_FATAL(nullptr,
"map_handle cache failed to trim a map! If on Linux, you may have exceeded the "
"64k VMA process limit, set the LLFIO_DEBUG_LINUX_MUNMAP macro at the top of posix/map_handle.ipp to cause dumping of VMAs to "
"/tmp/llfio_unmap_debug_smaps.txt, and combine with strace to figure it out.");
abort();
}
_base::bytes_in_cache -= _bytes;
ret.bytes_just_trimmed += _bytes;
ret.items_just_trimmed++;
max_items--;
delete p;
}
else
{
++it;
--it;
}
}
}
ret.items_in_cache = _base::size();
ret.bytes_in_cache = _base::bytes_in_cache;
ret.hits = _base::hits;
ret.misses = _base::misses;
return ret;
}
};
@@ -249,9 +261,9 @@ bool map_handle::_recycle_map() noexcept
}
}

map_handle::cache_statistics map_handle::trim_cache(std::chrono::steady_clock::time_point older_than) noexcept
map_handle::cache_statistics map_handle::trim_cache(std::chrono::steady_clock::time_point older_than, size_t max_items) noexcept
{
return detail::map_handle_cache().trim_cache(older_than);
return detail::map_handle_cache().trim_cache(older_than, max_items);
}


41 changes: 40 additions & 1 deletion include/llfio/v2.0/map_handle.hpp
@@ -345,6 +345,9 @@ guaranteed that writing into it will not fail. Note that memory mapped files hav
their file contents, so except for pages written into and not yet flushed to storage, memory mapped files
usually do not contribute more than a few pages each to commit charge.
\note You can determine the virtual memory accounting model for your system using `map_handle::memory_accounting()`.
This caches the result of interrogating the system, so it is fast after its first call.
The system commit limit can be easily exceeded if programs commit a lot of memory that they never use.
To avoid this, for large allocations you should *reserve* pages which you don't expect to use immediately,
and *later* explicitly commit and decommit them. You can request pages not accounted against the system
@@ -360,6 +363,40 @@ modified pages. This makes sense, given the prevalence of code which commits mem
however it also leads to anti-social outcomes, such as Linux distributions enabling pathological
workarounds like overcommit and specialised OOM killers.
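To make the reserve-then-commit advice above concrete, here is a minimal sketch. It assumes LLFIO's public `map_handle` API: `map()` accepting `section_handle::flag::nocommit` to reserve address space only, plus the `commit()` and `decommit()` member functions taking a buffer range; exact flag and member spellings should be checked against the headers in this commit.

```c++
#include "llfio/llfio.hpp"

namespace llfio = LLFIO_V2_NAMESPACE;

int main()
{
  // Reserve 1Gb of address space up front. With the (assumed) nocommit flag
  // these pages are not counted against the system commit limit.
  auto mh = llfio::map_handle::map(1024ULL * 1024 * 1024, false,
                                   llfio::section_handle::flag::readwrite |
                                     llfio::section_handle::flag::nocommit)
              .value();

  // Commit only the first 64Kb once it is actually needed; from this point
  // those pages count towards commit charge.
  mh.commit({mh.address(), 65536}).value();

  // ... use the committed region ...

  // Return the pages to reserved-only state, releasing the commit charge.
  mh.decommit({mh.address(), 65536}).value();
  return 0;
}
```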
## Map handle caching
Repeatedly freeing and allocating virtual memory is particularly expensive because page contents must
be cleared by the system before they can be handed out again. Most kernels clear pages using an idle
loop, but if the system is busy then a surprising amount of CPU time can get consumed wiping pages.
Most users of page allocated memory can tolerate receiving dirty pages, so `map_handle` implements
a process-local cache of previously allocated page regions which have since been `close()`d. If a
new `map_handle::map()` asks for virtual memory and there is a region in the cache, that region is
returned instead of a new region.
Before a region is added to the cache, it is decommitted (except on Linux when overcommit is enabled,
see below). It therefore only consumes virtual address space in your process, and does not otherwise
consume any resources apart from a VMA entry in the kernel. In particular, it does not appear in
your process' RAM consumption (except on Linux). When a region is removed from the cache,
it is committed, thus adding it to your process' RAM consumption. During this decommit-recommit
process the kernel **may** choose to scavenge the memory, in which case fresh pages will be restored.
However, there is a good chance that whatever the pages contained before decommit will still be there
after recommit.
Linux has a famously messed up virtual memory implementation. LLFIO implements a strict memory
accounting model, and ordinarily we tell Linux what pages are to be counted towards commit charge
or not, so you don't have to. If overcommit is disabled in the system, you get the same strict
memory accounting as on every other OS.
If, however, overcommit is enabled, we don't decommit pages, but rather mark them `LazyFree`. This is
to avoid inhibiting VMA coalescing, which is super important on Linux because of its ridiculously
low per-process VMA limit, typically 64k regions on most installs. Therefore, if you do disable
overcommit, you will also need to substantially raise the maximum per process VMA limit as now LLFIO
will strictly decommit memory, which prevents VMA coalescing and thus generates lots more VMAs.
The process-local map handle cache does not self-trim over time, so if you wish to reclaim virtual
address space you need to manually call `map_handle::trim_cache()` from time to time, as sketched below.
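Because the cache never self-trims, a long-running process can call `trim_cache()` periodically using the two-argument overload this commit introduces. A minimal sketch, using only the `cache_statistics` fields visible in this diff; the one-minute age threshold and 64-item cap are arbitrary illustration values.

```c++
#include <chrono>
#include <iostream>
#include "llfio/llfio.hpp"

namespace llfio = LLFIO_V2_NAMESPACE;

void trim_map_cache_periodically()
{
  // Trim cached maps idle for over a minute, at most 64 per call; per the
  // comment in this commit, bigger regions are preferred for trimming.
  auto stats = llfio::map_handle::trim_cache(
    std::chrono::steady_clock::now() - std::chrono::minutes(1), 64);
  std::cout << stats.items_just_trimmed << " maps just trimmed ("
            << stats.bytes_just_trimmed << " bytes); " << stats.items_in_cache
            << " items (" << stats.bytes_in_cache << " bytes) remain cached; "
            << stats.hits << " cache hits, " << stats.misses
            << " misses so far." << std::endl;
}
```

Note that calling `trim_cache()` with its default arguments trims nothing (the guard in this diff skips the loop for a default-constructed `older_than`), so it simply snapshots the statistics.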
## Barriers:
`map_handle`, because it implements `io_handle`, implements `barrier()` in a very conservative way
@@ -665,10 +702,12 @@ class LLFIO_DECL map_handle : public lockable_io_handle
size_t bytes_in_cache{0};
size_t items_just_trimmed{0};
size_t bytes_just_trimmed{0};
size_t hits{0}, misses{0};
};
/*! Get statistics about the map handle cache, optionally trimming the least recently used maps.
*/
static LLFIO_HEADERS_ONLY_MEMFUNC_SPEC cache_statistics trim_cache(std::chrono::steady_clock::time_point older_than = {}) noexcept;
static LLFIO_HEADERS_ONLY_MEMFUNC_SPEC cache_statistics trim_cache(std::chrono::steady_clock::time_point older_than = {},
size_t max_items = (size_t) -1) noexcept;

//! The memory section this handle is using
section_handle *section() const noexcept { return _section; }
41 changes: 26 additions & 15 deletions test/tests/map_handle_cache.cpp
@@ -27,18 +27,20 @@ Distributed under the Boost Software License, Version 1.0.
#include <deque>
#include <list>

inline QUICKCPPLIB_NOINLINE void fault(LLFIO_V2_NAMESPACE::map_handle &mh)
{
for(auto *p = (volatile char *) mh.address(); p < (volatile char *) mh.address() + mh.length(); p += mh.page_size())
{
*p = 1;
}
};

static inline void TestMapHandleCache()
{
static constexpr size_t ITEMS_COUNT = 10000;
namespace llfio = LLFIO_V2_NAMESPACE;
bool free_cache_immediately = false;
bool free_cache_immediately = true;
auto test = [&] {
auto fault = [](llfio::map_handle &mh) {
for(auto *p = (volatile char *) mh.address(); p < (volatile char *) mh.address() + mh.length(); p += mh.page_size())
{
*p = 1;
}
};
QUICKCPPLIB_NAMESPACE::algorithm::small_prng::small_prng rand;
std::vector<llfio::map_handle> maps;
for(size_t n = 0; n < ITEMS_COUNT; n++)
@@ -59,7 +61,8 @@
BOOST_REQUIRE(stats.items_in_cache == 0);
}
auto begin = std::chrono::steady_clock::now();
for(size_t n = 0; n < ITEMS_COUNT * 10; n++)
size_t ops = 0;
for(size_t n = 0; n < ITEMS_COUNT * 100; n++)
{
auto v = rand();
auto toallocate = (v >> 2) & (128 * 1024 - 1);
@@ -70,10 +73,12 @@
if(v & 1)
{
maps[n % ITEMS_COUNT].close().value();
ops++;
}
else
{
fault((maps[n % ITEMS_COUNT] = llfio::map_handle::map(toallocate, false).value()));
ops += 2;
}
if(free_cache_immediately)
{
@@ -85,23 +90,29 @@
auto end = std::chrono::steady_clock::now();
{
auto stats = llfio::map_handle::trim_cache();
std::cout << "\nIn the map_handle cache after churn there are " << stats.bytes_in_cache << " bytes in the cache in " << stats.items_in_cache << " items."
<< std::endl;
auto usage = llfio::utils::current_process_memory_usage().value();
std::cout << "\n\nIn the map_handle cache after churn there are " << (stats.bytes_in_cache / 1024.0 / 1024.0) << " Mb in the cache in "
<< stats.items_in_cache << " items. There were " << stats.hits << " hits and " << stats.misses
<< " misses. Process virtual address space used is " << (usage.total_address_space_in_use / 1024.0 / 1024.0 / 1024.0)
<< " Gb and commit charge is " << (usage.private_committed / 1024.0 / 1024.0) << " Mb." << std::endl;
}
for(auto &i : maps)
{
i.close().value();
}
{
auto stats = llfio::map_handle::trim_cache();
std::cout << "\nIn the map_handle cache after releasing everything there are " << stats.bytes_in_cache << " bytes in the cache in "
<< stats.items_in_cache << " items." << std::endl;
auto usage = llfio::utils::current_process_memory_usage().value();
std::cout << "\nIn the map_handle cache after releasing everything there are " << (stats.bytes_in_cache / 1024.0 / 1024.0) << " Mb in the cache in "
<< stats.items_in_cache << " items. Process virtual address space used is " << (usage.total_address_space_in_use / 1024.0 / 1024.0 / 1024.0)
<< " Gb and commit charge is " << (usage.private_committed / 1024.0 / 1024.0) << " Mb." << std::endl;
}
std::cout << "With free_cache_immediately = " << free_cache_immediately << " it took "
<< (std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() / 1000.0 / ITEMS_COUNT) << " us per allocation-free." << std::endl;
std::cout << "\nWith free_cache_immediately = " << free_cache_immediately << " it took "
<< (std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count() / 1000.0 / ops) << " us per allocation-free."
<< std::endl;
};
test();
free_cache_immediately = true;
free_cache_immediately = false;
test();
}

12 changes: 9 additions & 3 deletions test/tests/utils.cpp
@@ -172,14 +172,20 @@ static inline void TestCurrentProcessMemoryUsage()
BOOST_CHECK(within(before_anything, after_fault, 1024, 1024, 1024, 1024));
BOOST_CHECK(within(before_anything, after_decommit, 1024, 0, 0, 0));
#ifdef _WIN32
BOOST_CHECK(within(before_anything, after_zero, 1024, 0, 1024, 0)); // may not evict faulted set on POSIX
BOOST_CHECK(within(before_anything, after_zero, 1024, 0, 1024, 0));
BOOST_CHECK(within(before_anything, after_do_not_store, 1024, 0, 1024, 0)); // do_not_store() decreases RSS but not commit on Windows
#else
(void) after_zero;
(void) after_zero; // may not evict faulted set on POSIX
BOOST_CHECK(within(before_anything, after_do_not_store, 1024, 1024, 0, 1024)); // do_not_store() decreases commit but does not RSS on POSIX
#endif
BOOST_CHECK(within(before_anything, after_do_not_store, 1024, 0, 1024, 0));
#endif
}
std::cout << "\nFor file mapping:\n";
{
auto stats = llfio::map_handle::trim_cache(std::chrono::steady_clock::now());
BOOST_REQUIRE(stats.bytes_in_cache == 0);
BOOST_REQUIRE(stats.items_in_cache == 0);
}
{
auto sectionh = llfio::section_handle::section(1024 * 1024 * 1024).value();
llfio::utils::process_memory_usage before_anything, after_reserve, after_commit, after_fault, after_decommit, after_zero, after_do_not_store;
