From 9daa4303cdc03f6b90b72c369e6377c6beb75c39 Mon Sep 17 00:00:00 2001 From: Alex A Yermoshenko Date: Sun, 17 Dec 2023 19:21:21 +0100 Subject: [PATCH] Update taskflow to 3.6.0 (#93) Fixed potential data-race issues induced by incorrect memory order. Co-authored-by: Alex A. Yermoshenko --- lib/taskflow/algorithm/critical.hpp | 78 + lib/taskflow/algorithm/data_pipeline.hpp | 637 ++++++ lib/taskflow/algorithm/find.hpp | 547 +++++ lib/taskflow/algorithm/for_each.hpp | 173 ++ lib/taskflow/algorithm/launch.hpp | 58 + lib/taskflow/algorithm/partitioner.hpp | 543 +++++ lib/taskflow/algorithm/pipeline.hpp | 1663 ++++++++++++++ lib/taskflow/algorithm/reduce.hpp | 295 +++ lib/taskflow/algorithm/scan.hpp | 614 +++++ lib/taskflow/algorithm/sort.hpp | 648 ++++++ lib/taskflow/algorithm/transform.hpp | 199 ++ lib/taskflow/core/async.hpp | 396 ++++ lib/taskflow/core/async_task.hpp | 125 + lib/taskflow/core/declarations.hpp | 30 +- lib/taskflow/core/executor-module-opt.hpp | 2025 +++++++++++++++++ lib/taskflow/core/executor.hpp | 2501 +++++++++++++++------ lib/taskflow/core/flow_builder.hpp | 1891 +++++++++------- lib/taskflow/core/graph.hpp | 862 +++++-- lib/taskflow/core/notifier.hpp | 4 +- lib/taskflow/core/observer.hpp | 377 +++- lib/taskflow/core/semaphore.hpp | 39 +- lib/taskflow/core/task.hpp | 284 ++- lib/taskflow/core/taskflow.hpp | 365 ++- lib/taskflow/core/topology.hpp | 19 +- lib/taskflow/core/tsq.hpp | 380 +++- lib/taskflow/core/worker.hpp | 175 +- lib/taskflow/taskflow.hpp | 37 +- lib/taskflow/utility/iterator.hpp | 16 +- lib/taskflow/utility/macros.hpp | 17 + lib/taskflow/utility/math.hpp | 28 +- lib/taskflow/utility/object_pool.hpp | 145 +- lib/taskflow/utility/os.hpp | 62 +- lib/taskflow/utility/serializer.hpp | 609 ++--- lib/taskflow/utility/singleton.hpp | 2 +- lib/taskflow/utility/small_vector.hpp | 1048 +++++++++ lib/taskflow/utility/stream.hpp | 2 +- lib/taskflow/utility/traits.hpp | 339 ++- lib/taskflow/utility/uuid.hpp | 60 +- 38 files changed, 14599 insertions(+), 2694 deletions(-) create mode 100644 lib/taskflow/algorithm/critical.hpp create mode 100644 lib/taskflow/algorithm/data_pipeline.hpp create mode 100644 lib/taskflow/algorithm/find.hpp create mode 100644 lib/taskflow/algorithm/for_each.hpp create mode 100644 lib/taskflow/algorithm/launch.hpp create mode 100644 lib/taskflow/algorithm/partitioner.hpp create mode 100644 lib/taskflow/algorithm/pipeline.hpp create mode 100644 lib/taskflow/algorithm/reduce.hpp create mode 100644 lib/taskflow/algorithm/scan.hpp create mode 100644 lib/taskflow/algorithm/sort.hpp create mode 100644 lib/taskflow/algorithm/transform.hpp create mode 100644 lib/taskflow/core/async.hpp create mode 100644 lib/taskflow/core/async_task.hpp create mode 100644 lib/taskflow/core/executor-module-opt.hpp create mode 100644 lib/taskflow/utility/macros.hpp create mode 100644 lib/taskflow/utility/small_vector.hpp diff --git a/lib/taskflow/algorithm/critical.hpp b/lib/taskflow/algorithm/critical.hpp new file mode 100644 index 0000000..c781d28 --- /dev/null +++ b/lib/taskflow/algorithm/critical.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "../core/task.hpp" + +/** +@file critical.hpp +@brief critical include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// CriticalSection +// ---------------------------------------------------------------------------- + +/** +@class CriticalSection + +@brief class to create a critical region of limited workers to run tasks + +tf::CriticalSection is a warpper over 
tf::Semaphore and is specialized for +limiting the maximum concurrency over a set of tasks. +A critical section starts with an initial count representing that limit. +When a task is added to the critical section, +the task acquires and releases the semaphore internal to the critical section. +This design avoids explicit call of tf::Task::acquire and tf::Task::release. +The following example creates a critical section of one worker and adds +the five tasks to the critical section. + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +// create a critical section of 1 worker +tf::CriticalSection critical_section(1); + +tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); +tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); +tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); +tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); +tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); + +critical_section.add(A, B, C, D, E); + +executor.run(taskflow).wait(); +@endcode + +*/ +class CriticalSection : public Semaphore { + + public: + + /** + @brief constructs a critical region of a limited number of workers + */ + explicit CriticalSection(size_t max_workers = 1); + + /** + @brief adds a task into the critical region + */ + template + void add(Tasks...tasks); +}; + +inline CriticalSection::CriticalSection(size_t max_workers) : + Semaphore {max_workers} { +} + +template +void CriticalSection::add(Tasks... tasks) { + (tasks.acquire(*this), ...); + (tasks.release(*this), ...); +} + + +} // end of namespace tf. --------------------------------------------------- + + diff --git a/lib/taskflow/algorithm/data_pipeline.hpp b/lib/taskflow/algorithm/data_pipeline.hpp new file mode 100644 index 0000000..0393548 --- /dev/null +++ b/lib/taskflow/algorithm/data_pipeline.hpp @@ -0,0 +1,637 @@ +#pragma once + +#include "pipeline.hpp" + + +namespace tf { + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipe +// ---------------------------------------------------------------------------- + +/** +@class DataPipe + +@brief class to create a stage in a data-parallel pipeline + +A data pipe represents a stage of a data-parallel pipeline. +A data pipe can be either @em parallel direction or @em serial direction +(specified by tf::PipeType) and is associated with a callable to invoke +by the pipeline scheduler. + +You need to use the template function, tf::make_data_pipe, to create +a data pipe. The input and output types of a tf::DataPipe should be decayed types +(though the library will always decay them for you using `std::decay`) +to allow internal storage to work. +The data will be passed by reference to your callable, at which you can take +it by copy or reference. 
+ +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input) {return std::to_string(input + 100);} +); +@endcode + +In addition to the data, you callable can take an additional reference +of tf::Pipeflow in the second argument to probe the runtime information +for a stage task, such as its line number and token number: + +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template +class DataPipe { + + template + friend class DataPipeline; + + public: + + /** + @brief callable type of the data pipe + */ + using callable_t = C; + + /** + @brief input type of the data pipe + */ + using input_t = Input; + + /** + @brief output type of the data pipe + */ + using output_t = Output; + + /** + @brief default constructor + */ + DataPipe() = default; + + /** + @brief constructs a data pipe + + You should use the helper function, tf::make_data_pipe, + to create a DataPipe object, especially when you need tf::DataPipe + to automatically deduct the lambda type. + */ + DataPipe(PipeType d, callable_t&& callable) : + _type{d}, _callable{std::forward(callable)} { + } + + /** + @brief queries the type of the data pipe + + A data pipe can be either parallel (tf::PipeType::PARALLEL) or serial + (tf::PipeType::SERIAL). + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the data pipe + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the data pipe + + @tparam U callable type + @param callable a callable object constructible from the callable type + of this data pipe + + Assigns a new callable to the pipe using universal forwarding. + */ + template + void callable(U&& callable) { + _callable = std::forward(callable); + } + + private: + + PipeType _type; + + callable_t _callable; +}; + +/** +@brief function to construct a data pipe (tf::DataPipe) + +@tparam Input input data type +@tparam Output output data type +@tparam C callable type + +tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) +in a data-parallel pipeline (tf::DataPipeline). +The first argument specifies the direction of the data pipe, +either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, +and the second argument is a callable to invoke by the pipeline scheduler. +Input and output data types are specified via template parameters, +which will always be decayed by the library to its original form +for storage purpose. +The callable must take the input data type in its first argument +and returns a value of the output data type. + +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input) { + return std::to_string(input + 100); + } +); +@endcode + +The callable can additionally take a reference of tf::Pipeflow, +which allows you to query the runtime information of a stage task, +such as its line number and token number. 
+ +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template +auto make_data_pipe(PipeType d, C&& callable) { + return DataPipe(d, std::forward(callable)); +} + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipeline +// ---------------------------------------------------------------------------- + +/** +@class DataPipeline + +@brief class to create a data-parallel pipeline scheduling framework + +@tparam Ps data pipe types + +Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object +for users to create a data-parallel pipeline scheduling framework +using a module task in a taskflow. +The only difference is that tf::DataPipeline provides a data abstraction +for users to quickly express dataflow in a pipeline. +The following example creates a data-parallel pipeline of three stages +that generate dataflow from `void` to `int`, `std::string`, `float`, and `void`. + +@code{.cpp} +#include +#include + +int main() { + + // data flow => void -> int -> std::string -> float -> void + tf::Taskflow taskflow("pipeline"); + tf::Executor executor; + + const size_t num_lines = 4; + + tf::DataPipeline pl(num_lines, + tf::make_data_pipe(tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) -> int{ + if(pf.token() == 5) { + pf.stop(); + return 0; + } + else { + return pf.token(); + } + }), + tf::make_data_pipe(tf::PipeType::SERIAL, [](int& input) { + return std::to_string(input + 100); + }), + tf::make_data_pipe(tf::PipeType::SERIAL, [](std::string& input) { + std::cout << input << std::endl; + }) + ); + + // build the pipeline graph using composition + taskflow.composed_of(pl).name("pipeline"); + + // dump the pipeline graph structure (with composition) + taskflow.dump(std::cout); + + // run the pipeline + executor.run(taskflow).wait(); + + return 0; +} +@endcode + +The pipeline schedules five tokens over four parallel lines in a circular fashion, +as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode +*/ +template +class DataPipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + + public: + + /** + @brief internal storage type for each data token (default std::variant) + */ + using data_t = unique_variant_t, + std::monostate, + std::decay_t>... + >>; + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + DataPipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes stored in a std::tuple. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. 
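+
+  A minimal sketch of this constructor (the two pipes below are illustrative
+  placeholders, not part of the library; class template argument deduction is
+  assumed to pick up the pipe types from the tuple):
+
+  @code{.cpp}
+  auto pipes = std::make_tuple(
+    tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
+      if(pf.token() == 5) {
+        pf.stop();
+        return 0;
+      }
+      return static_cast<int>(pf.token());
+    }),
+    tf::make_data_pipe<int, void>(tf::PipeType::SERIAL, [](int& i) {
+      std::cout << i << '\n';
+    })
+  );
+
+  // construct a data-parallel pipeline of four lines from the tuple of pipes
+  tf::DataPipeline pl(4, std::move(pipes));
+  @endcode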
+ */ + DataPipeline(size_t num_lines, std::tuple&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. + */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of this pipeline. + */ + Graph& graph(); + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple _pipes; + std::array _meta; + std::vector> _lines; + std::vector _tasks; + std::vector _pipeflows; + std::vector> _buffer; + + template + auto _gen_meta(std::tuple&&, std::index_sequence); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); +}; + +// constructor +template +DataPipeline::DataPipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template +DataPipeline::DataPipeline(size_t num_lines, std::tuple&& ps) : + _pipes {std::forward>(ps)}, + _meta {_gen_meta( + std::forward>(ps), std::make_index_sequence{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template +template +auto DataPipeline::_gen_meta(std::tuple&& ps, std::index_sequence) { + return std::array{PipeMeta{std::get(ps).type()}...}; +} + +// Function: num_lines +template +size_t DataPipeline::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template +constexpr size_t DataPipeline::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template +size_t DataPipeline::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template +Graph& DataPipeline::graph() { + return _graph; +} + +// Function: reset +template +void DataPipeline::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template +void DataPipeline::_on_pipe(Pipeflow& pf, Runtime&) { + + visit_tuple([&](auto&& pipe){ + + using data_pipe_t = std::decay_t; + using callable_t = typename 
data_pipe_t::callable_t; + using input_t = std::decay_t; + using output_t = std::decay_t; + + // first pipe + if constexpr (std::is_invocable_v) { + // [](tf::Pipeflow&) -> void {}, i.e., we only have one pipe + if constexpr (std::is_void_v) { + pipe._callable(pf); + // [](tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable(pf); + } + } + // other pipes without pipeflow in the second argument + else if constexpr (std::is_invocable_v >) { + // [](input_t&) -> void {}, i.e., the last pipe + if constexpr (std::is_void_v) { + pipe._callable(std::get(_buffer[pf._line].data)); + // [](input_t&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get(_buffer[pf._line].data) + ); + } + } + // other pipes with pipeflow in the second argument + else if constexpr (std::is_invocable_v) { + // [](input_t&, tf::Pipeflow&) -> void {} + if constexpr (std::is_void_v) { + pipe._callable(std::get(_buffer[pf._line].data), pf); + // [](input_t&, tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get(_buffer[pf._line].data), pf + ); + } + } + //else if constexpr(std::is_invocable_v) { + // pipe._callable(pf, rt); + //} + else { + static_assert(dependent_false_v, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _build +template +void DataPipeline::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _lines[pf->_line][pf->_pipe].join_counter.store( + static_cast(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + if (pf->_pipe == 0) { + pf->_token = _num_tokens; + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + ++_num_tokens; + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/lib/taskflow/algorithm/find.hpp b/lib/taskflow/algorithm/find.hpp new file mode 100644 index 
0000000..ab0d801 --- /dev/null +++ b/lib/taskflow/algorithm/find.hpp @@ -0,0 +1,547 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: find_if_loop +template +TF_FORCE_INLINE bool find_if_loop( + std::atomic& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE bool find_if_not_loop( + std::atomic& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE auto make_find_if_task( + B first, E last, T& result, UOP predicate, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic offset(N); + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, &offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_find_if_not_task +template +TF_FORCE_INLINE auto make_find_if_not_task( + B first, E last, T& result, UOP predicate, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if_not(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic offset(N); + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, &offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_not_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_min_element_task +template +TF_FORCE_INLINE auto make_min_element_task( + B first, E last, T& result, C comp, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::min_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T smallest = comp(*beg1, *beg2) ? beg1 : beg2; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T smallest = comp(*beg1, *beg2) ? beg1 : beg2; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + } + ); + } + }; +} + +// Function: make_max_element_task +template +TF_FORCE_INLINE auto make_max_element_task(B first, E last, T& result, C comp, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::max_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T largest = comp(*beg1, *beg2) ? beg2 : beg1; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T largest = comp(*beg1, *beg2) ? beg2 : beg1; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + } + ); + } + }; +} + +} // namespace detail -------------------------------------------------------- + +// Function: find_if +template +Task tf::FlowBuilder::find_if(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(detail::make_find_if_task( + first, last, result, predicate, std::forward
<P>
(part) + )); +} + +// Function: find_if_not +template +Task tf::FlowBuilder::find_if_not(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(detail::make_find_if_not_task( + first, last, result, predicate, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// min_element +// ---------------------------------------------------------------------------- + +// Function: min_element +template +Task FlowBuilder::min_element(B first, E last, T& result, C comp, P&& part) { + return emplace(detail::make_min_element_task( + first, last, result, comp, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// max_element +// ---------------------------------------------------------------------------- + +// Function: max_element +template +Task FlowBuilder::max_element(B first, E last, T& result, C comp, P&& part) { + return emplace(detail::make_max_element_task( + first, last, result, comp, std::forward
<P>
(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/for_each.hpp b/lib/taskflow/algorithm/for_each.hpp new file mode 100644 index 0000000..d15958a --- /dev/null +++ b/lib/taskflow/algorithm/for_each.hpp @@ -0,0 +1,173 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: make_for_each_task +template +TF_FORCE_INLINE auto make_for_each_task(B beg, E end, C c, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return [b=beg, e=end, c, part=std::forward
<P>
(part)] (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE auto make_for_each_index_task(B beg, E end, S inc, C c, P&& part){ + + using namespace std::string_literals; + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using S_t = std::decay_t>; + + return [b=beg, e=end, a=inc, c, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + S_t inc = a; + + size_t W = rt.executor().num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(size_t x=0; x, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w(curr_b) * inc + beg; + for(size_t x=curr_b; x next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&](size_t curr_b, size_t curr_e) { + auto idx = static_cast(curr_b) * inc + beg; + for(size_t x=curr_b; x +Task FlowBuilder::for_each(B beg, E end, C c, P&& part) { + return emplace( + detail::make_for_each_task(beg, end, c, std::forward
<P>
(part)) + ); +} + +// ---------------------------------------------------------------------------- +// for_each_index +// ---------------------------------------------------------------------------- + +// Function: for_each_index +template +Task FlowBuilder::for_each_index(B beg, E end, S inc, C c, P&& part){ + return emplace( + detail::make_for_each_index_task(beg, end, inc, c, std::forward
<P>
(part)) + ); +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/lib/taskflow/algorithm/launch.hpp b/lib/taskflow/algorithm/launch.hpp new file mode 100644 index 0000000..363223e --- /dev/null +++ b/lib/taskflow/algorithm/launch.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf { + +// Function: launch_loop +template +TF_FORCE_INLINE void launch_loop( + size_t N, + size_t W, + Runtime& rt, + std::atomic& next, + P&& part, + Loop&& loop +) { + + //static_assert(std::is_lvalue_reference_v, ""); + + using namespace std::string_literals; + + for(size_t w=0; w +TF_FORCE_INLINE void launch_loop( + size_t W, + size_t w, + Runtime& rt, + Loop&& loop +) { + using namespace std::string_literals; + if(w == W-1) { + loop(); + } + else { + rt.silent_async_unchecked("loop-"s + std::to_string(w), loop); + } +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/partitioner.hpp b/lib/taskflow/algorithm/partitioner.hpp new file mode 100644 index 0000000..4a253fa --- /dev/null +++ b/lib/taskflow/algorithm/partitioner.hpp @@ -0,0 +1,543 @@ +// reference: +// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c +// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp + +#pragma once + +/** +@file partitioner.hpp +@brief partitioner include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Partitioner Base +// ---------------------------------------------------------------------------- + +/** +@class PartitionerBase + +@brief class to derive a partitioner for scheduling parallel algorithms + +The class provides base methods to derive a partitioner that can be used +to schedule parallel iterations (e.g., tf::Taskflow::for_each). + +An partitioner defines the scheduling method for running parallel algorithms, +such tf::Taskflow::for_each, tf::Taskflow::reduce, and so on. +By default, we provide the following partitioners: + ++ tf::GuidedPartitioner to enable guided scheduling algorithm of adaptive chunk size ++ tf::DynamicPartitioner to enable dynamic scheduling algorithm of equal chunk size ++ tf::StaticPartitioner to enable static scheduling algorithm of static chunk size ++ tf::RandomPartitioner to enable random scheduling algorithm of random chunk size + +Depending on applications, partitioning algorithms can impact the performance +a lot. +For example, if a parallel-iteration workload contains a regular work unit per +iteration, tf::StaticPartitioner can deliver the best performance. +On the other hand, if the work unit per iteration is irregular and unbalanced, +tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. +In most situations, tf::GuidedPartitioner can deliver decent performance and +is thus used as our default partitioner. 
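+
+A minimal sketch of choosing a partitioner for the same loop (the workload
+lambda is a placeholder; the two tasks are sequenced to avoid writing the
+same data concurrently):
+
+@code{.cpp}
+std::vector<int> data(100000, 1);
+
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+// regular work per iteration: a static partitioner usually performs best
+tf::Task a = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i += 1; }, tf::StaticPartitioner()
+);
+
+// irregular work per iteration: a guided partitioner adapts its chunk size
+tf::Task b = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i += 1; }, tf::GuidedPartitioner()
+);
+
+// run the two parallel-for tasks one after another on the same data
+a.precede(b);
+
+executor.run(taskflow).wait();
+@endcode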
+*/ +class PartitionerBase { + + public: + + /** + @brief default constructor + */ + PartitionerBase() = default; + + /** + @brief construct a partitioner with the given chunk size + */ + explicit PartitionerBase(size_t chunk_size) : _chunk_size {chunk_size} {} + + /** + @brief query the chunk size of this partitioner + */ + size_t chunk_size() const { return _chunk_size; } + + /** + @brief update the chunk size of this partitioner + */ + void chunk_size(size_t cz) { _chunk_size = cz; } + + protected: + + /** + @brief chunk size + */ + size_t _chunk_size{0}; +}; + +// ---------------------------------------------------------------------------- +// Guided Partitioner +// ---------------------------------------------------------------------------- + +/** +@class GuidedPartitioner + +@brief class to construct a guided partitioner for scheduling parallel algorithms + +The size of a partition is proportional to the number of unassigned iterations +divided by the number of workers, +and the size will gradually decrease to the given chunk size. +The last partition may be smaller than the chunk size. +*/ +class GuidedPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + GuidedPartitioner() : PartitionerBase{1} {} + + /** + @brief construct a guided partitioner with the given chunk size + */ + explicit GuidedPartitioner(size_t sz) : PartitionerBase (sz) {} + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + + size_t p1 = 2 * W * (chunk_size + 1); + float p2 = 0.5f / static_cast(W); + size_t curr_b = next.load(std::memory_order_relaxed); + + while(curr_b < N) { + + size_t r = N - curr_b; + + // fine-grained + if(r < p1) { + while(1) { + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(curr_b >= N) { + return; + } + func(curr_b, std::min(curr_b + chunk_size, N)); + } + break; + } + // coarse-grained + else { + size_t q = static_cast(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + //size_t curr_e = (q <= r) ? curr_b + q : N; + size_t curr_e = std::min(curr_b + q, N); + if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, + std::memory_order_relaxed)) { + func(curr_b, curr_e); + curr_b = next.load(std::memory_order_relaxed); + } + } + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + + size_t p1 = 2 * W * (chunk_size + 1); + float p2 = 0.5f / static_cast(W); + size_t curr_b = next.load(std::memory_order_relaxed); + + while(curr_b < N) { + + size_t r = N - curr_b; + + // fine-grained + if(r < p1) { + while(1) { + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(curr_b >= N) { + return; + } + if(func(curr_b, std::min(curr_b + chunk_size, N))) { + return; + } + } + break; + } + // coarse-grained + else { + size_t q = static_cast(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + //size_t curr_e = (q <= r) ? 
curr_b + q : N; + size_t curr_e = std::min(curr_b + q, N); + if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, + std::memory_order_relaxed)) { + if(func(curr_b, curr_e)) { + return; + } + curr_b = next.load(std::memory_order_relaxed); + } + } + } + } +}; + +// ---------------------------------------------------------------------------- +// Dynamic Partitioner +// ---------------------------------------------------------------------------- + +/** +@class DynamicPartitioner + +@brief class to construct a dynamic partitioner for scheduling parallel algorithms + +The partitioner splits iterations into many partitions each of size equal to +the given chunk size. +Different partitions are distributed dynamically to workers +without any specific order. +*/ +class DynamicPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + DynamicPartitioner() : PartitionerBase{1} {}; + + /** + @brief construct a dynamic partitioner with the given chunk size + */ + explicit DynamicPartitioner(size_t sz) : PartitionerBase (sz) {} + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + func(curr_b, std::min(curr_b + chunk_size, N)); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + if(func(curr_b, std::min(curr_b + chunk_size, N))) { + return; + } + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } +}; + +// ---------------------------------------------------------------------------- +// Static Partitioner +// ---------------------------------------------------------------------------- + +/** +@class StaticPartitioner + +@brief class to construct a dynamic partitioner for scheduling parallel algorithms + +The partitioner divides iterations into chunks and distributes chunks +to workers in order. +If the chunk size is not specified (default @c 0), the partitioner resorts to a chunk size +that equally distributes iterations into workers. + +@code{.cpp} +std::vector data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} +taskflow.for_each( + data.begin(), data.end(), [](int i){}, StaticPartitioner(0) +); +executor.run(taskflow).run(); +@endcode +*/ +class StaticPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + StaticPartitioner() : PartitionerBase{0} {}; + + /** + @brief construct a dynamic partitioner with the given chunk size + */ + explicit StaticPartitioner(size_t sz) : PartitionerBase(sz) {} + + /** + @brief queries the adjusted chunk size + + Returns the given chunk size if it is not zero, or returns + N/W + (w < N%W), where @c N is the number of iterations, + @c W is the number of workers, and @c w is the worker ID. + */ + size_t adjusted_chunk_size(size_t N, size_t W, size_t w) const { + return _chunk_size ? 
_chunk_size : N/W + (w < N%W); + } + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + size_t curr_b, + size_t chunk_size, + F&& func + ) { + size_t stride = W * chunk_size; + while(curr_b < N) { + size_t curr_e = std::min(curr_b + chunk_size, N); + func(curr_b, curr_e); + curr_b += stride; + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + size_t curr_b, + size_t chunk_size, + F&& func + ) { + size_t stride = W * chunk_size; + while(curr_b < N) { + size_t curr_e = std::min(curr_b + chunk_size, N); + if(func(curr_b, curr_e)) { + return; + } + curr_b += stride; + } + } +}; + +// ---------------------------------------------------------------------------- +// RandomPartitioner +// ---------------------------------------------------------------------------- + +/** +@class RandomPartitioner + +@brief class to construct a random partitioner for scheduling parallel algorithms + +Similar to tf::DynamicPartitioner, +the partitioner splits iterations into many partitions but each with a random +chunk size in the range, c = [alpha * N * W, beta * N * W]. +By default, @c alpha is 0.01 and @c beta is 0.5, respectively. + +*/ +class RandomPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + RandomPartitioner() = default; + + /** + @brief constructs a random partitioner + */ + RandomPartitioner(size_t cz) : PartitionerBase(cz) {} + + /** + @brief constructs a random partitioner with the given parameters + */ + RandomPartitioner(float alpha, float beta) : _alpha {alpha}, _beta {beta} {} + + /** + @brief queries the @c alpha value + */ + float alpha() const { return _alpha; } + + /** + @brief queries the @c beta value + */ + float beta() const { return _beta; } + + /** + @brief queries the range of chunk size + + @param N number of iterations + @param W number of workers + */ + std::pair chunk_size_range(size_t N, size_t W) const { + + size_t b1 = static_cast(_alpha * N * W); + size_t b2 = static_cast(_beta * N * W); + + if(b1 > b2) { + std::swap(b1, b2); + } + + b1 = std::max(b1, size_t{1}); + b2 = std::max(b2, b1 + 1); + + return {b1, b2}; + } + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + func(curr_b, std::min(curr_b + chunk_size, N)); + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, 
std::memory_order_relaxed); + + while(curr_b < N) { + if(func(curr_b, std::min(curr_b + chunk_size, N))){ + return; + } + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + private: + + float _alpha {0.01f}; + float _beta {0.5f}; + +}; + +/** +@brief default partitioner set to tf::GuidedPartitioner + +Guided partitioner can achieve decent performance for most parallel algorithms, +especially for those with irregular and unbalanced workload per iteration. +*/ +using DefaultPartitioner = GuidedPartitioner; + +/** +@brief determines if a type is a partitioner + +A partitioner is a derived type from tf::PartitionerBase. +*/ +template +inline constexpr bool is_partitioner_v = std::is_base_of::value; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/lib/taskflow/algorithm/pipeline.hpp b/lib/taskflow/algorithm/pipeline.hpp new file mode 100644 index 0000000..5442d56 --- /dev/null +++ b/lib/taskflow/algorithm/pipeline.hpp @@ -0,0 +1,1663 @@ +#pragma once + +#include "../taskflow.hpp" + +/** +@file pipeline.hpp +@brief pipeline include file +*/ + +namespace tf { + + +// ---------------------------------------------------------------------------- +// Structure Definition: DeferredPipeflow +// ---------------------------------------------------------------------------- +// For example: +// 12.defer(7); 12.defer(16); +// _____ +// | | +// v | +// 7 12 16 +// | ^ +// |____ | +// +// DeferredPipeflow dpf of 12 : +// dpf._token = 12; +// dpf._num_deferrals = 1; +// dpf._dependents = std::list{7,16}; +// dpf._dependent_satellites has following two entries +// {key: 7, value: dpf._dependents.begin()} +// {key: 16, value: dpf._dependents.begin()+1} +// +/** @private */ +class DeferredPipeflow { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + public: + + DeferredPipeflow() = default; + DeferredPipeflow(const DeferredPipeflow&) = delete; + DeferredPipeflow(DeferredPipeflow&&) = delete; + + DeferredPipeflow(size_t t, size_t n, std::unordered_set&& dep) : + _token{t}, _num_deferrals{n}, _dependents{std::move(dep)} { + } + + DeferredPipeflow& operator = (const DeferredPipeflow&) = delete; + DeferredPipeflow& operator = (DeferredPipeflow&&) = delete; + + private: + + // token id + size_t _token; + + // number of deferrals + size_t _num_deferrals; + + // dependents + // For example, + // 12.defer(7); 12.defer(16) + // _dependents = {7, 16} + std::unordered_set _dependents; +}; + + + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeflow +// ---------------------------------------------------------------------------- + +/** +@class Pipeflow + +@brief class to create a pipeflow object used by the pipe callable + +Pipeflow represents a scheduling token in the pipeline scheduling +framework. A pipeflow is created by the pipeline scheduler at runtime to +pass to the pipe callable. Users can query the present statistics +of that scheduling token, including the line identifier, pipe identifier, +and token identifier, and build their application algorithms based on +these statistics. +At the first stage, users can explicitly call the stop method +to stop the pipeline scheduler. 
+ +@code{.cpp} +tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){ + std::cout << "token id=" << pf.token() + << " at line=" << pf.line() + << " at pipe=" << pf.pipe() + << '\n'; +}}; +@endcode + +Pipeflow can only be created privately by the tf::Pipeline and +be used through the pipe callable. +*/ +class Pipeflow { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + template + friend class DataPipeline; + + public: + + /** + @brief default constructor + */ + Pipeflow() = default; + + /** + @brief queries the line identifier of the present token + */ + size_t line() const { + return _line; + } + + /** + @brief queries the pipe identifier of the present token + */ + size_t pipe() const { + return _pipe; + } + + /** + @brief queries the token identifier + */ + size_t token() const { + return _token; + } + + /** + @brief stops the pipeline scheduling + + Only the first pipe can call this method to stop the pipeline. + Calling stop from other pipes will throw exception. + */ + void stop() { + if(_pipe != 0) { + TF_THROW("only the first pipe can stop the token"); + } + _stop = true; + } + + /** + @brief queries the number of deferrals + */ + size_t num_deferrals() const { + return _num_deferrals; + } + + /** + @brief pushes token in _dependents + + Only the first pipe can call this method to defer the current + scheduling token to the given token. + */ + void defer(size_t token) { + if(_pipe != 0) { + TF_THROW("only the first pipe can defer the current scheduling token"); + } + _dependents.insert(token); + } + + private: + + // Regular data + size_t _line; + size_t _pipe; + size_t _token; + bool _stop; + + // Data field for token dependencies + size_t _num_deferrals; + std::unordered_set _dependents; + +}; + +// ---------------------------------------------------------------------------- +// Class Definition: PipeType +// ---------------------------------------------------------------------------- + +/** +@enum PipeType + +@brief enumeration of all pipe types +*/ +enum class PipeType : int { + /** @brief parallel type */ + PARALLEL = 1, + /** @brief serial type */ + SERIAL = 2 +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipe +// ---------------------------------------------------------------------------- + +/** +@class Pipe + +@brief class to create a pipe object for a pipeline stage + +@tparam C callable type + +A pipe represents a stage of a pipeline. A pipe can be either +@em parallel direction or @em serial direction (specified by tf::PipeType) +and is coupled with a callable to invoke by the pipeline scheduler. +The callable must take a referenced tf::Pipeflow object in the first argument: + +@code{.cpp} +Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} +@endcode + +The pipeflow object is used to query the statistics of a scheduling token +in the pipeline, such as pipe, line, and token numbers. +*/ +template > +class Pipe { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + public: + + /** + @brief alias of the callable type + */ + using callable_t = C; + + /** + @brief default constructor + */ + Pipe() = default; + + /** + @brief constructs the pipe object + + @param d pipe type (tf::PipeType) + @param callable callable type + + The constructor constructs a pipe with the given direction + (tf::PipeType::SERIAL or tf::PipeType::PARALLEL) and the given callable. + The callable must take a referenced tf::Pipeflow object in the first argument. 
+ + @code{.cpp} + Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} + @endcode + + When creating a pipeline, the direction of the first pipe must be serial + (tf::PipeType::SERIAL). + */ + Pipe(PipeType d, C&& callable) : + _type{d}, _callable{std::forward(callable)} { + } + + /** + @brief queries the type of the pipe + + Returns the type of the callable. + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the pipe + + @param type a tf::PipeType variable + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the pipe + + @tparam U callable type + @param callable a callable object constructible from std::function + + Assigns a new callable to the pipe with universal forwarding. + */ + template + void callable(U&& callable) { + _callable = std::forward(callable); + } + + private: + + PipeType _type; + + C _callable; +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeline +// ---------------------------------------------------------------------------- + +/** +@class Pipeline + +@brief class to create a pipeline scheduling framework + +@tparam Ps pipe types + +A pipeline is a composable graph object for users to create a +pipeline scheduling framework using a module task in a taskflow. +Unlike the conventional pipeline programming frameworks (e.g., Intel TBB), +%Taskflow's pipeline algorithm does not provide any data abstraction, +which often restricts users from optimizing data layouts in their applications, +but a flexible framework for users to customize their application data +atop our pipeline scheduling. +The following code creates a pipeline of four parallel lines to schedule +tokens through three serial pipes: + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +const size_t num_lines = 4; +const size_t num_pipes = 3; + +// create a custom data buffer +std::array, num_lines> buffer; + +// create a pipeline graph of four concurrent lines and three serial pipes +tf::Pipeline pipeline(num_lines, + // first pipe must define a serial direction + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf) { + // generate only 5 scheduling tokens + if(pf.token() == 5) { + pf.stop(); + } + // save the token id into the buffer + else { + buffer[pf.line()][pf.pipe()] = pf.token(); + } + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer] (tf::Pipeflow& pf) { + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf){ + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }} +); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pipeline) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// run the pipeline +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode + +At each pipe stage, the program propagates the result to 
the next pipe +by adding one to the result stored in a custom data storage, @c buffer. +The pipeline scheduler will generate five scheduling tokens and then stop. + +Internally, tf::Pipeline uses std::tuple to store the given sequence of pipes. +The definition of each pipe can be different, completely decided by the compiler +to optimize the object layout. +After a pipeline is constructed, it is not possible to change its pipes. +If applications need to change these pipes, please use tf::ScalablePipeline. +*/ +template +class Pipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + public: + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, std::tuple&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. + */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of the this pipeline. + */ + Graph& graph(); + + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple _pipes; + std::array _meta; + std::vector> _lines; + std::vector _tasks; + std::vector _pipeflows; + + // queue of ready tokens (paired with their deferral times) + // For example, + // when 12 does not have any dependents, + // we put 12 in _ready_tokens queue + // Assume num_deferrals of 12 is 1, + // we push pair{12, 1} in the queue + std::queue> _ready_tokens; + + // unordered_map of token dependencies + // For example, + // 12.defer(16); 13.defer(16); + // _token_dependencies has the following entry + // {key: 16, value: std::vector{12, 13}}. 
+ std::unordered_map> _token_dependencies; + + // unordered_map of deferred tokens + // For example, + // 12.defer(16); 13.defer(16); + // _deferred_tokens has the following two entries + // {key: 12, DeferredPipeflow of 12} and + // {key: 13, DeferredPipeflow of 13} + std::unordered_map _deferred_tokens; + + // variable to keep track of the longest deferred tokens + // For example, + // 2.defer(16) + // 5.defer(19) + // 5.defer(17), + // _longest_deferral will be 19 - after token 19 the pipeline + // has almost zero cost on handling deferred pipeflow + size_t _longest_deferral = 0; + + template + auto _gen_meta(std::tuple&&, std::index_sequence); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); + void _check_dependents(Pipeflow&); + void _construct_deferred_tokens(Pipeflow&); + void _resolve_token_dependencies(Pipeflow&); +}; + +// constructor +template +Pipeline::Pipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template +Pipeline::Pipeline(size_t num_lines, std::tuple&& ps) : + _pipes {std::forward>(ps)}, + _meta {_gen_meta( + std::forward>(ps), std::make_index_sequence{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template +template +auto Pipeline::_gen_meta(std::tuple&& ps, std::index_sequence) { + return std::array{PipeMeta{std::get(ps).type()}...}; +} + +// Function: num_lines +template +size_t Pipeline::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template +constexpr size_t Pipeline::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template +size_t Pipeline::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template +Graph& Pipeline::graph() { + return _graph; +} + +// Function: reset +template +void Pipeline::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template +void Pipeline::_on_pipe(Pipeflow& pf, Runtime& rt) { + visit_tuple([&](auto&& pipe){ + using callable_t = typename std::decay_t::callable_t; + if constexpr (std::is_invocable_v) { + pipe._callable(pf); + } + else if constexpr(std::is_invocable_v) { + pipe._callable(pf, rt); + } + else { + static_assert(dependent_false_v, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _check_dependents +// Check and remove invalid dependents after on_pipe +// For example, users may defer a pipeflow to multiple tokens, +// and we need to remove invalid tokens. 
+// 12.defer(7); // valid only if 7 is deferred, or invalid otherwise +// 12.defer(16); // 16 is valid +template +void Pipeline::_check_dependents(Pipeflow& pf) { + //if (pf._dependents.size()) { + ++pf._num_deferrals; + + for (auto it = pf._dependents.begin(); it != pf._dependents.end();) { + + // valid (e.g., 12.defer(16)) + if (*it >= _num_tokens) { + _token_dependencies[*it].push_back(pf._token); + _longest_deferral = std::max(_longest_deferral, *it); + ++it; + } + // valid or invalid (e.g., 12.defer(7)) + else { + auto pit = _deferred_tokens.find(*it); + + // valid (e.g., 7 is deferred) + if (pit != _deferred_tokens.end()) { + _token_dependencies[*it].push_back(pf._token); + ++it; + } + + // invalid (e.g., 7 is finished - this this 12.defer(7) is dummy) + else { + it = pf._dependents.erase(it); + } + } + } +} + +// Procedure: _construct_deferred_tokens +// Construct a data structure for a deferred token +// +// For example, +// 12.defer(7); 12.defer(16); +// After _check_dependents, 12 needs to be deferred, +// so we will construct a data structure for 12 using hashmap: +// {key: 12, value: DeferredPipeflow of 12} +template +void Pipeline::_construct_deferred_tokens(Pipeflow& pf) { + + //auto res = _deferred_tokens.emplace( + // pf._token, DeferredPipeflow{pf._token, pf._num_deferrals, std::move(pf._dependents)} + //); + + // construct the deferred pipeflow with zero copy + //auto res = _deferred_tokens.emplace( + _deferred_tokens.emplace( + std::piecewise_construct, + std::forward_as_tuple(pf._token), + std::forward_as_tuple( + pf._token, pf._num_deferrals, std::move(pf._dependents) + ) + ); + + //assert(res.second == true); +} + +// Procedure: _resolve_token_dependencies +// Resolve dependencies for tokens that defer to current token +// +// For example, +// 12.defer(16); +// 13.defer(16); +// _token_dependencies will have the entry +// {key: 16, value: std::vector{12, 13}} +// +// When 16 finishes, we need to remove 16 from 12's and 13's +// individual_dependents +template +void Pipeline::_resolve_token_dependencies(Pipeflow& pf) { + + if (auto it = _token_dependencies.find(pf._token); + it != _token_dependencies.end()) { + + // iterate tokens that defer to pf._token + // (e.g., 12 and 13) + for(size_t target : it->second) { + + auto dpf = _deferred_tokens.find(target); + + assert(dpf != _deferred_tokens.end()); + + // erase pf._token from target's _dependents + // (e.g., remove 16 from 12's dependents) + dpf->second._dependents.erase(pf._token); + // dpf->second._dependent_satellites[pf._token] + //); + + // target has no dependents + if (dpf->second._dependents.empty()) { + + // push target into _ready_tokens queue + _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals); + //_ready_tokens.push( + // std::make_pair(dpf->second._token, dpf->second._num_deferrals) + //); + + // erase target from _deferred_tokens + _deferred_tokens.erase(dpf); + } + } + + // remove pf._token from _token_dependencies + // (e.g., remove the entry + // {key: 16, value: std::vector{12, 13}} from _token_dependencies) + _token_dependencies.erase(it); + } +} + +// Procedure: _build +template +void Pipeline::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + 
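+ // Each pass through this label executes one (line, pipe) stage.
+ // Before running the stage, re-arm this slot's join counter so the
+ // next token travelling through it sees the right dependency count.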
_lines[pf->_line][pf->_pipe].join_counter.store( + static_cast(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. + if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + // downward dependency + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + // forward dependency + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +// ---------------------------------------------------------------------------- +// Class Definition: ScalablePipeline +// ---------------------------------------------------------------------------- + +/** +@class ScalablePipeline + +@brief class to create a scalable pipeline object + +@tparam P type of the iterator to a range of pipes + +A scalable pipeline is a composable graph object for users to create a +pipeline scheduling framework using a module task in a taskflow. +Unlike tf::Pipeline that instantiates all pipes upon the construction time, +tf::ScalablePipeline allows variable assignments of pipes using range iterators. +Users can also reset a scalable pipeline to a different range of pipes +between runs. 
The following code creates a scalable pipeline of four +parallel lines to schedule tokens through three serial pipes in a custom storage, +then resetting the pipeline to a new range of five serial pipes: + +@code{.cpp} +tf::Taskflow taskflow("pipeline"); +tf::Executor executor; + +const size_t num_lines = 4; + +// create data storage +std::array buffer; + +// define the pipe callable +auto pipe_callable = [&buffer] (tf::Pipeflow& pf) mutable { + switch(pf.pipe()) { + // first stage generates only 5 scheduling tokens and saves the + // token number into the buffer. + case 0: { + if(pf.token() == 5) { + pf.stop(); + } + else { + printf("stage 1: input token = %zu\n", pf.token()); + buffer[pf.line()] = pf.token(); + } + return; + } + break; + + // other stages propagate the previous result to this pipe and + // increment it by one + default: { + printf( + "stage %zu: input buffer[%zu] = %d\n", pf.pipe(), pf.line(), buffer[pf.line()] + ); + buffer[pf.line()] = buffer[pf.line()] + 1; + } + break; + } +}; + +// create a vector of three pipes +std::vector< tf::Pipe> > pipes; + +for(size_t i=0; i<3; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} + +// create a pipeline of four parallel lines based on the given vector of pipes +tf::ScalablePipeline pl(num_lines, pipes.begin(), pipes.end()); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pl) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// dump the pipeline graph structure (with composition) +taskflow.dump(std::cout); + +// run the pipeline +executor.run(taskflow).wait(); + +// reset the pipeline to a new range of five pipes and starts from +// the initial state (i.e., token counts from zero) +for(size_t i=0; i<2; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} +pl.reset(pipes.begin(), pipes.end()); + +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, first going through three serial pipes +and then five serial pipes: + +@code{.shell-session} +# initial construction of three serial pipes +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o + +# resetting to a new range of five serial pipes +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +@endcode + +Each pipe has the same type of `%tf::Pipe<%std::function>` +and is kept in a vector that is amenable to change. +We construct the scalable pipeline using two range iterators pointing to the +beginning and the end of the vector. +At each pipe stage, the program propagates the result to the next pipe +by adding one to the result stored in a custom data storage, @c buffer. +The pipeline scheduler will generate five scheduling tokens and then stop. + +A scalable pipeline is move-only. +*/ +template +class ScalablePipeline { + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + public: + + /** + @brief pipe type + */ + using pipe_t = typename std::iterator_traits
<P>
::value_type; + + /** + @brief default constructor + */ + ScalablePipeline() = default; + + /** + @brief constructs an empty scalable pipeline object + + @param num_lines the number of parallel lines + + An empty scalable pipeline does not have any pipes. + The pipeline needs to be reset to a valid range of pipes + before running. + */ + ScalablePipeline(size_t num_lines); + + /** + @brief constructs a scalable pipeline object + + @param num_lines the number of parallel lines + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + Constructs a pipeline from the given range of pipes specified in + [first, last) using @c num_lines parallel lines. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. + */ + ScalablePipeline(size_t num_lines, P first, P last); + + /** + @brief disabled copy constructor + */ + ScalablePipeline(const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Constructs a pipeline from the given @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline(ScalablePipeline&& rhs); + + /** + @brief disabled copy assignment operator + */ + ScalablePipeline& operator = (const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Replaces the contents with those of @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline& operator = (ScalablePipeline&& rhs); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resets the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero. + */ + void reset(); + + /** + @brief resets the pipeline with a new range of pipes + + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + The member function assigns the pipeline to a new range of pipes + specified in [first, last) and resets the pipeline to the + initial state. After resetting a pipeline, its token identifier will + start from zero. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. 
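+
+ For example, reusing the names from the class-level example above
+ (@c pipes, @c pl, @c executor, and @c taskflow), one can grow the pipe
+ vector between runs and rebind the pipeline to it:
+
+ @code{.cpp}
+ pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable);
+ pl.reset(pipes.begin(), pipes.end());
+ executor.run(taskflow).wait();
+ @endcode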
+ */
+ void reset(P first, P last);
+
+ /**
+ @brief resets the pipeline to a new number of lines and a
+ new range of pipes
+
+ @param num_lines number of parallel lines
+ @param first iterator to the beginning of the range
+ @param last iterator to the end of the range
+
+ The member function resets the pipeline to a new number of
+ parallel lines and a new range of pipes specified in
+ [first, last), as if the pipeline were just constructed.
+ After resetting a pipeline, its token identifier will start from zero.
+
+ Internally, the scalable pipeline copies the iterators
+ from the specified range. Those pipe callables pointed to by
+ these iterators must remain valid during the execution of the pipeline.
+ */
+ void reset(size_t num_lines, P first, P last);
+
+ /**
+ @brief queries the number of generated tokens in the pipeline
+
+ The number represents the total scheduling tokens that have been
+ generated by the pipeline so far.
+ */
+ size_t num_tokens() const noexcept;
+
+ /**
+ @brief obtains the graph object associated with the pipeline construct
+
+ This method is primarily used as an opaque data structure for creating
+ a module task of this pipeline.
+ */
+ Graph& graph();
+
+ private:
+
+ Graph _graph;
+
+ size_t _num_tokens{0};
+
+ std::vector<P> _pipes;
+ std::vector<Task> _tasks;
+ std::vector<Pipeflow> _pipeflows;
+ std::unique_ptr<Line[]> _lines;
+
+ // token deferral bookkeeping (mirrors the corresponding tf::Pipeline members)
+ std::queue<std::pair<size_t, size_t>> _ready_tokens;
+ std::unordered_map<size_t, std::vector<size_t>> _token_dependencies;
+ std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens;
+ size_t _longest_deferral = 0;
+
+ void _check_dependents(Pipeflow&);
+ void _construct_deferred_tokens(Pipeflow&);
+ void _resolve_token_dependencies(Pipeflow&);
+
+ void _on_pipe(Pipeflow&, Runtime&);
+ void _build();
+
+ Line& _line(size_t, size_t);
+};
+
+// constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(size_t num_lines) :
+ _tasks (num_lines + 1),
+ _pipeflows (num_lines) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ _build();
+}
+
+// constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(size_t num_lines, P first, P last) :
+ _tasks (num_lines + 1),
+ _pipeflows (num_lines) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ reset(first, last);
+ _build();
+}
+
+// move constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(ScalablePipeline&& rhs) :
+ _graph {std::move(rhs._graph)},
+ _num_tokens {rhs._num_tokens},
+ _pipes {std::move(rhs._pipes)},
+ _tasks {std::move(rhs._tasks)},
+ _pipeflows {std::move(rhs._pipeflows)},
+ _lines {std::move(rhs._lines)},
+ _ready_tokens {std::move(rhs._ready_tokens)},
+ _token_dependencies {std::move(rhs._token_dependencies)},
+ _deferred_tokens {std::move(rhs._deferred_tokens)},
+ _longest_deferral {rhs._longest_deferral} {
+
+ rhs._longest_deferral = 0;
+ rhs._num_tokens = 0;
+}
+
+// move assignment operator
+template <typename P>
+ScalablePipeline<P>
& ScalablePipeline<P>
::operator = (ScalablePipeline&& rhs) {
+ _graph = std::move(rhs._graph);
+ _num_tokens = rhs._num_tokens;
+ _pipes = std::move(rhs._pipes);
+ _tasks = std::move(rhs._tasks);
+ _pipeflows = std::move(rhs._pipeflows);
+ _lines = std::move(rhs._lines);
+ rhs._num_tokens = 0;
+ _ready_tokens = std::move(rhs._ready_tokens);
+ _token_dependencies = std::move(rhs._token_dependencies);
+ _deferred_tokens = std::move(rhs._deferred_tokens);
+ _longest_deferral = rhs._longest_deferral;
+ rhs._longest_deferral = 0;
+ return *this;
+}
+
+// Function: num_lines
+template <typename P>
+size_t ScalablePipeline<P>
::num_lines() const noexcept {
+ return _pipeflows.size();
+}
+
+// Function: num_pipes
+template <typename P>
+size_t ScalablePipeline<P>
::num_pipes() const noexcept {
+ return _pipes.size();
+}
+
+// Function: num_tokens
+template <typename P>
+size_t ScalablePipeline<P>
::num_tokens() const noexcept {
+ return _num_tokens;
+}
+
+// Function: graph
+template <typename P>
+Graph& ScalablePipeline<P>
::graph() {
+ return _graph;
+}
+
+// Function: _line
+template <typename P>
+typename ScalablePipeline<P>
::Line& ScalablePipeline<P>
::_line(size_t l, size_t p) {
+ return _lines[l*num_pipes() + p];
+}
+
+template <typename P>
+void ScalablePipeline<P>
::reset(size_t num_lines, P first, P last) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ _graph.clear();
+ _tasks.resize(num_lines + 1);
+ _pipeflows.resize(num_lines);
+
+ reset(first, last);
+
+ _build();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>
::reset(P first, P last) {
+
+ size_t num_pipes = static_cast<size_t>(std::distance(first, last));
+
+ if(num_pipes == 0) {
+ TF_THROW("pipeline cannot be empty");
+ }
+
+ if(first->type() != PipeType::SERIAL) {
+ TF_THROW("first pipe must be serial");
+ }
+
+ _pipes.resize(num_pipes);
+
+ size_t i=0;
+ for(auto itr = first; itr != last; itr++) {
+ _pipes[i++] = itr;
+ }
+
+ _lines = std::make_unique<Line[]>(num_lines() * _pipes.size());
+
+ reset();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>
::reset() {
+
+ _num_tokens = 0;
+
+ for(size_t l = 0; l<num_lines(); l++) {
+ _pipeflows[l]._pipe = 0;
+ _pipeflows[l]._line = l;
+ }
+
+ _line(0, 0).join_counter.store(0, std::memory_order_relaxed);
+
+ for(size_t l=1; l<num_lines(); l++) {
+ for(size_t f=1; f<num_pipes(); f++) {
+ _line(l, f).join_counter.store(
+ static_cast<size_t>(_pipes[f]->type()), std::memory_order_relaxed
+ );
+ }
+ }
+
+ for(size_t f=1; f<num_pipes(); f++) {
+ _line(0, f).join_counter.store(1, std::memory_order_relaxed);
+ }
+
+ for(size_t l=1; l<num_lines(); l++) {
+ _line(l, 0).join_counter.store(
+ static_cast<size_t>(_pipes[0]->type()) - 1, std::memory_order_relaxed
+ );
+ }
+
+ assert(_ready_tokens.empty() == true);
+ _token_dependencies.clear();
+ _deferred_tokens.clear();
+}
+
+// Procedure: _on_pipe
+template <typename P>
+void ScalablePipeline<P>
::_on_pipe(Pipeflow& pf, Runtime& rt) {
+
+ using callable_t = typename pipe_t::callable_t;
+
+ if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) {
+ _pipes[pf._pipe]->_callable(pf);
+ }
+ else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) {
+ _pipes[pf._pipe]->_callable(pf, rt);
+ }
+ else {
+ static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type");
+ }
+}
+
+template <typename P>
+void ScalablePipeline<P>
::_check_dependents(Pipeflow& pf) {
+ ++pf._num_deferrals;
+
+ for (auto it = pf._dependents.begin(); it != pf._dependents.end();) {
+
+ // valid (e.g., 12.defer(16))
+ if (*it >= _num_tokens) {
+ _token_dependencies[*it].push_back(pf._token);
+ _longest_deferral = std::max(_longest_deferral, *it);
+ ++it;
+ }
+ // valid or invalid (e.g., 12.defer(7))
+ else {
+ auto pit = _deferred_tokens.find(*it);
+
+ // valid (e.g., 7 is deferred)
+ if (pit != _deferred_tokens.end()) {
+ _token_dependencies[*it].push_back(pf._token);
+ ++it;
+ }
+
+ // invalid (e.g., 7 has already finished, so this dependency is dropped)
+ else {
+ it = pf._dependents.erase(it);
+ }
+ }
+ }
+}
+
+// Procedure: _construct_deferred_tokens
+// Construct a data structure for a deferred token
+template <typename P>
+void ScalablePipeline<P>
::_construct_deferred_tokens(Pipeflow& pf) {
+
+ // construct the deferred pipeflow with zero copy
+ _deferred_tokens.emplace(
+ std::piecewise_construct,
+ std::forward_as_tuple(pf._token),
+ std::forward_as_tuple(
+ pf._token, pf._num_deferrals, std::move(pf._dependents)
+ )
+ );
+}
+
+// Procedure: _resolve_token_dependencies
+// Resolve dependencies for tokens that defer to current token
+template <typename P>
+void ScalablePipeline<P>
::_resolve_token_dependencies(Pipeflow& pf) {
+
+ if (auto it = _token_dependencies.find(pf._token);
+ it != _token_dependencies.end()) {
+
+ // iterate tokens that defer to pf._token
+ for(size_t target : it->second) {
+
+ auto dpf = _deferred_tokens.find(target);
+
+ assert(dpf != _deferred_tokens.end());
+
+ // erase pf._token from target's _dependents
+ dpf->second._dependents.erase(pf._token);
+
+ // target has no dependents
+ if (dpf->second._dependents.empty()) {
+ _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals);
+ _deferred_tokens.erase(dpf);
+ }
+ }
+
+ _token_dependencies.erase(it);
+ }
+}
+
+// Procedure: _build
+template <typename P>
+void ScalablePipeline<P>
::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _line(pf->_line, pf->_pipe).join_counter.store( + static_cast(_pipes[pf->_pipe]->type()), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. 
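+ // (e.g., if 2.defer(16) was recorded earlier, token 16 reaching this
+ // point erases itself from token 2's dependent set; once that set is
+ // empty, token 2 is moved into _ready_tokens and rescheduled)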
+ if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_pipes[c_f]->type() == PipeType::SERIAL && + _line(n_l, c_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_line(pf->_line, n_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/lib/taskflow/algorithm/reduce.hpp b/lib/taskflow/algorithm/reduce.hpp new file mode 100644 index 0000000..64869dc --- /dev/null +++ b/lib/taskflow/algorithm/reduce.hpp @@ -0,0 +1,295 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: make_reduce_task +template +TF_FORCE_INLINE auto make_reduce_task(B beg, E end, T& init, O bop, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=beg, e=end, &r=init, bop, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(r, *beg++)); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mtx); + r = bop(r, sum); + + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, [=, &bop, &mtx, &next, &r, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mtx); + r = bop(r, sum); + }); + } + }; +} + +// Function: make_transform_reduce_task +template +TF_FORCE_INLINE auto make_transform_reduce_task( + B beg, E end, T& init, BOP bop, UOP uop, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=beg, e=end, &r=init, bop, uop, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(std::move(r), uop(*beg++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + //auto beg1 = beg++; + //auto beg2 = beg++; + //T sum = bop(uop(*beg1), uop(*beg2)); + + T sum = (chunk_size == 1) ? uop(*beg++) : bop(uop(*beg++), uop(*beg++)); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] + (size_t curr_b, size_t curr_e) mutable { + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + for(size_t x=curr_b; x lock(mtx); + r = bop(std::move(r), std::move(sum)); + + }); + } + + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + + launch_loop(N, W, rt, next, part, [=, &bop, &uop, &mtx, &next, &r, &part] () mutable { + + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mtx); + r = bop(std::move(r), std::move(sum)); + }); + } + }; +} + +} // end of namespace detail ------------------------------------------------- + +// ---------------------------------------------------------------------------- +// default reduction +// ---------------------------------------------------------------------------- + +// Function: reduce +template +Task FlowBuilder::reduce(B beg, E end, T& init, O bop, P&& part) { + return emplace(detail::make_reduce_task( + beg, end, init, bop, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// default transform and reduction +// ---------------------------------------------------------------------------- + +// Function: transform_reduce +template +Task FlowBuilder::transform_reduce( + B beg, E end, T& init, BOP bop, UOP uop, P&& part +) { + return emplace(detail::make_transform_reduce_task( + beg, end, init, bop, uop, std::forward
<P>
(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/lib/taskflow/algorithm/scan.hpp b/lib/taskflow/algorithm/scan.hpp new file mode 100644 index 0000000..cccb205 --- /dev/null +++ b/lib/taskflow/algorithm/scan.hpp @@ -0,0 +1,614 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: scan_loop +template +TF_FORCE_INLINE void scan_loop( + tf::Runtime& rt, + std::atomic& counter, + BufferT& buf, + B&& bop, + Iterator d_beg, + size_t W, + size_t w, + size_t chunk_size +){ + // whoever finishes the last performs global scan + if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) { + for(size_t i=1; i +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + //auto orig_d_beg = d_beg; + //ExecutionPolicy policy; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop, T init) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop, T init 
+) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::exclusive_scan(s_beg, s_end, d_beg, init, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop) { + return emplace(detail::make_inclusive_scan_task( + first, last, d_first, bop + )); +} + +// Function: inclusive_scan +template +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init) { + return emplace(detail::make_inclusive_scan_task( + first, last, d_first, bop, init + )); +} + +// ---------------------------------------------------------------------------- +// Transform Inclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_inclusive_scan +template +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop +) { + return emplace(detail::make_transform_inclusive_scan_task( + first, last, d_first, bop, uop + )); +} + 
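+// A usage sketch for the scan algorithms above (illustrative only; the
+// taskflow object and the containers are assumed to exist at the call site):
+//
+//   std::vector<int> in{1, 2, 3}, out(3);
+//   taskflow.transform_inclusive_scan(
+//     in.begin(), in.end(), out.begin(), std::plus<int>{},
+//     [](int x){ return 2 * x; }
+//   );
+//   // after executor.run(taskflow).wait(): out == {2, 6, 12}
+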
+// Function: transform_inclusive_scan +template +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop, T init +) { + return emplace(detail::make_transform_inclusive_scan_task( + first, last, d_first, bop, uop, init + )); +} + +// ---------------------------------------------------------------------------- +// Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: exclusive_scan +template +Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop) { + return emplace(detail::make_exclusive_scan_task( + first, last, d_first, init, bop + )); +} + +// ---------------------------------------------------------------------------- +// Transform Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_exclusive_scan +template +Task FlowBuilder::transform_exclusive_scan( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + return emplace(detail::make_transform_exclusive_scan_task( + first, last, d_first, init, bop, uop + )); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/sort.hpp b/lib/taskflow/algorithm/sort.hpp new file mode 100644 index 0000000..a4fdf3c --- /dev/null +++ b/lib/taskflow/algorithm/sort.hpp @@ -0,0 +1,648 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf { + +// threshold whether or not to perform parallel sort +template +constexpr size_t parallel_sort_cutoff() { + + //using value_type = std::decay_t())>; + using value_type = typename std::iterator_traits::value_type; + + constexpr size_t object_size = sizeof(value_type); + + if constexpr(std::is_same_v) { + return 65536 / sizeof(std::string); + } + else { + if constexpr(object_size < 16) return 4096; + else if constexpr(object_size < 32) return 2048; + else if constexpr(object_size < 64) return 1024; + else if constexpr(object_size < 128) return 768; + else if constexpr(object_size < 256) return 512; + else if constexpr(object_size < 512) return 256; + else return 128; + } +} + +// ---------------------------------------------------------------------------- +// pattern-defeating quick sort (pdqsort) +// https://github.com/orlp/pdqsort/ +// ---------------------------------------------------------------------------- + +template +inline T* align_cacheline(T* p) { +#if defined(UINTPTR_MAX) && __cplusplus >= 201103L + std::uintptr_t ip = reinterpret_cast(p); +#else + std::size_t ip = reinterpret_cast(p); +#endif + ip = (ip + cacheline_size - 1) & -cacheline_size; + return reinterpret_cast(ip); +} + +template +inline void swap_offsets( + Iter first, Iter last, + unsigned char* offsets_l, unsigned char* offsets_r, + size_t num, bool use_swaps +) { + typedef typename std::iterator_traits::value_type T; + if (use_swaps) { + // This case is needed for the descending distribution, where we need + // to have proper swapping for pdqsort to remain O(n). + for (size_t i = 0; i < num; ++i) { + std::iter_swap(first + offsets_l[i], last - offsets_r[i]); + } + } else if (num > 0) { + Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0]; + T tmp(std::move(*l)); *l = std::move(*r); + for (size_t i = 1; i < num; ++i) { + l = first + offsets_l[i]; *r = std::move(*l); + r = last - offsets_r[i]; *l = std::move(*r); + } + *r = std::move(tmp); + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. 
+template +void insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first to avoid 2 moves for an element + // already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + *shift = std::move(tmp); + } + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. +// Assumes *(begin - 1) is an element smaller than or equal to any element +// in [begin, end). +template +void unguarded_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + } + } +} + +// Attempts to use insertion sort on [begin, end). +// Will return false if more than +// partial_insertion_sort_limit elements were moved, +// and abort sorting. Otherwise it will successfully sort and return true. +template +bool partial_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + using D = typename std::iterator_traits::difference_type; + + // When we detect an already sorted partition, attempt an insertion sort + // that allows this amount of element moves before giving up. + constexpr auto partial_insertion_sort_limit = D{8}; + + if (begin == end) return true; + + auto limit = D{0}; + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + if (limit > partial_insertion_sort_limit) { + return false; + } + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + limit += cur - shift; + } + } + + return true; +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal +// to the pivot are put in the right-hand partition. Returns the position of the pivot after +// partitioning and whether the passed sequence already was correctly partitioned. Assumes the +// pivot is a median of at least 3 elements and that [begin, end) is at least +// insertion_sort_threshold long. Uses branchless partitioning. +template +std::pair partition_right_branchless(Iter begin, Iter end, Compare comp) { + + typedef typename std::iterator_traits::value_type T; + + constexpr size_t block_size = 64; + constexpr size_t cacheline_size = 64; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. 
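+ // (if the pivot happens to be the smallest element, no strictly smaller
+ // element exists, and an unguarded --last would run off the front)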
+ if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. + bool already_partitioned = first >= last; + if (!already_partitioned) { + std::iter_swap(first, last); + ++first; + + // The following branchless partitioning is derived from "BlockQuicksort: How Branch + // Mispredictions don't affect Quicksort" by Stefan Edelkamp and Armin Weiss, but + // heavily micro-optimized. + unsigned char offsets_l_storage[block_size + cacheline_size]; + unsigned char offsets_r_storage[block_size + cacheline_size]; + unsigned char* offsets_l = align_cacheline(offsets_l_storage); + unsigned char* offsets_r = align_cacheline(offsets_r_storage); + + Iter offsets_l_base = first; + Iter offsets_r_base = last; + size_t num_l, num_r, start_l, start_r; + num_l = num_r = start_l = start_r = 0; + + while (first < last) { + // Fill up offset blocks with elements that are on the wrong side. + // First we determine how much elements are considered for each offset block. + size_t num_unknown = last - first; + size_t left_split = num_l == 0 ? (num_r == 0 ? num_unknown / 2 : num_unknown) : 0; + size_t right_split = num_r == 0 ? (num_unknown - left_split) : 0; + + // Fill the offset blocks. + if (left_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } else { + for (size_t i = 0; i < left_split;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } + + if (right_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } else { + for (size_t i = 0; i < right_split;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } + + // Swap elements and update block sizes and first/last boundaries. + size_t num = std::min(num_l, num_r); + swap_offsets( + offsets_l_base, offsets_r_base, + offsets_l + start_l, offsets_r + start_r, + num, num_l == num_r + ); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + + if (num_l == 0) { + start_l = 0; + offsets_l_base = first; + } + + if (num_r == 0) { + start_r = 0; + offsets_r_base = last; + } + } + + // We have now fully identified [first, last)'s proper position. Swap the last elements. 
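+ // At most one of the two offset blocks can still hold unswapped entries
+ // here, since each loop iteration drains the smaller block to zero.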
+ if (num_l) { + offsets_l += start_l; + while (num_l--) std::iter_swap(offsets_l_base + offsets_l[num_l], --last); + first = last; + } + if (num_r) { + offsets_r += start_r; + while (num_r--) std::iter_swap(offsets_r_base - offsets_r[num_r], first), ++first; + last = first; + } + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. +// Elements equal to the pivot are put in the right-hand partition. +// Returns the position of the pivot after partitioning and whether the passed +// sequence already was correctly partitioned. +// Assumes the pivot is a median of at least 3 elements and that [begin, end) +// is at least insertion_sort_threshold long. +template +std::pair partition_right(Iter begin, Iter end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot + // (the median of 3 guarantees/ this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. + // We have to guard this search if there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while (!comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition + // are the same element, the passed in sequence already was correctly + // partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. + // Previously swapped pairs guard the searches, + // which is why the first iteration is special-cased above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Similar function to the one above, except elements equal to the pivot +// are put to the left of the pivot and it doesn't check or return +// if the passed sequence already was partitioned. +// Since this is rarely used (the many equal case), +// and in that case pdqsort already has O(n) performance, +// no block quicksort is applied here for simplicity. 
+template +RandItr partition_left(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + T pivot(std::move(*begin)); + + RandItr first = begin; + RandItr last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) { + while (first < last && !comp(pivot, *++first)); + } + else { + while (!comp(pivot, *++first)); + } + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + RandItr pivot_pos = last; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return pivot_pos; +} + +template +void parallel_pdqsort( + tf::Runtime& rt, + Iter begin, Iter end, Compare comp, + int bad_allowed, bool leftmost = true +) { + + // Partitions below this size are sorted sequentially + constexpr auto cutoff = parallel_sort_cutoff(); + + // Partitions below this size are sorted using insertion sort + constexpr auto insertion_sort_threshold = 24; + + // Partitions above this size use Tukey's ninther to select the pivot. + constexpr auto ninther_threshold = 128; + + //using diff_t = typename std::iterator_traits::difference_type; + + // Use a while loop for tail recursion elimination. + while (true) { + + //diff_t size = end - begin; + size_t size = end - begin; + + // Insertion sort is faster for small arrays. + if (size < insertion_sort_threshold) { + if (leftmost) { + insertion_sort(begin, end, comp); + } + else { + unguarded_insertion_sort(begin, end, comp); + } + return; + } + + if(size <= cutoff) { + std::sort(begin, end, comp); + return; + } + + // Choose pivot as median of 3 or pseudomedian of 9. + //diff_t s2 = size / 2; + size_t s2 = size >> 1; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } + else { + sort3(begin + s2, begin, end - 1, comp); + } + + // If *(begin - 1) is the end of the right partition + // of a previous partition operation, there is no element in [begin, end) + // that is smaller than *(begin - 1). + // Then if our pivot compares equal to *(begin - 1) we change strategy, + // putting equal elements in the left partition, + // greater elements in the right partition. + // We do not have to recurse on the left partition, + // since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + const auto pair = Branchless ? partition_right_branchless(begin, end, comp) : + partition_right(begin, end, comp); + + const auto pivot_pos = pair.first; + const auto already_partitioned = pair.second; + + // Check for a highly unbalanced partition. + //diff_t l_size = pivot_pos - begin; + //diff_t r_size = end - (pivot_pos + 1); + const size_t l_size = pivot_pos - begin; + const size_t r_size = end - (pivot_pos + 1); + const bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements + // to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort + // to guarantee O(n log n). 
+ if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } + // decently balanced + else { + // sequence try to use insertion sort. + if (already_partitioned && + partial_insertion_sort(begin, pivot_pos, comp) && + partial_insertion_sort(pivot_pos + 1, end, comp) + ) { + return; + } + } + + // Sort the left partition first using recursion and + // do tail recursion elimination for the right-hand partition. + rt.silent_async( + [&rt, begin, pivot_pos, comp, bad_allowed, leftmost] () mutable { + parallel_pdqsort( + rt, begin, pivot_pos, comp, bad_allowed, leftmost + ); + } + ); + begin = pivot_pos + 1; + leftmost = false; + } +} + +// ---------------------------------------------------------------------------- +// 3-way quick sort +// ---------------------------------------------------------------------------- + +// 3-way quick sort +template +void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) { + + using namespace std::string_literals; + + constexpr auto cutoff = parallel_sort_cutoff(); + + sort_partition: + + if(static_cast(last - first) < cutoff) { + std::sort(first, last+1, compare); + return; + } + + auto m = pseudo_median_of_nine(first, last, compare); + + if(m != first) { + std::iter_swap(first, m); + } + + auto l = first; + auto r = last; + auto f = std::next(first, 1); + bool is_swapped_l = false; + bool is_swapped_r = false; + + while(f <= r) { + if(compare(*f, *l)) { + is_swapped_l = true; + std::iter_swap(l, f); + l++; + f++; + } + else if(compare(*l, *f)) { + is_swapped_r = true; + std::iter_swap(r, f); + r--; + } + else { + f++; + } + } + + if(l - first > 1 && is_swapped_l) { + //rt.emplace([&](tf::Runtime& rtl) mutable { + // parallel_3wqsort(rtl, first, l-1, compare); + //}); + rt.silent_async([&rt, first, l, &compare] () mutable { + parallel_3wqsort(rt, first, l-1, compare); + }); + } + + if(last - r > 1 && is_swapped_r) { + //rt.emplace([&](tf::Runtime& rtr) mutable { + // parallel_3wqsort(rtr, r+1, last, compare); + //}); + //rt.silent_async([&rt, r, last, &compare] () mutable { + // parallel_3wqsort(rt, r+1, last, compare); + //}); + first = r+1; + goto sort_partition; + } + + //rt.join(); +} + +// ---------------------------------------------------------------------------- +// tf::Taskflow::sort +// ---------------------------------------------------------------------------- + +// Function: sort +template +Task FlowBuilder::sort(B beg, E end, C cmp) { + + Task task = emplace([b=beg, e=end, cmp] (Runtime& rt) mutable { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + 
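+ // empty ranges return immediately; ranges below parallel_sort_cutoff()
+ // (or a single-worker executor) are sorted in place with std::sort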
if(beg == end) {
+      return;
+    }
+
+    size_t W = rt._executor.num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= parallel_sort_cutoff<B_t>()) {
+      std::sort(beg, end, cmp);
+      return;
+    }
+
+    //parallel_3wqsort(rt, beg, end-1, cmp);
+    parallel_pdqsort<B_t, C, is_std_compare_v<std::decay_t<C>> &&
+      std::is_arithmetic_v<typename std::iterator_traits<B_t>::value_type>
+    >(rt, beg, end, cmp, log2(end - beg));
+
+    rt.join();
+  });
+
+  return task;
+}
+
+// Function: sort
+template <typename B, typename E>
+Task FlowBuilder::sort(B beg, E end) {
+  using value_type = std::decay_t<decltype(*std::declval<B>())>;
+  return sort(beg, end, std::less<value_type>{});
+}
+
+} // namespace tf ------------------------------------------------------------
+
diff --git a/lib/taskflow/algorithm/transform.hpp b/lib/taskflow/algorithm/transform.hpp
new file mode 100644
index 0000000..4c87887
--- /dev/null
+++ b/lib/taskflow/algorithm/transform.hpp
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+namespace detail {
+
+// Function: make_transform_task
+template <typename B, typename E, typename O, typename C, typename P>
+TF_FORCE_INLINE auto make_transform_task(
+  B first1, E last1, O d_first, C c, P&& part
+) {
+
+  using namespace std::string_literals;
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using O_t = std::decay_t<unwrap_ref_decay_t<O>>;
+
+  return
+  [first1, last1, d_first, c, part=std::forward<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = first1; + E_t end = last1; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg, end, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + + launch_loop(N, W, rt, next, part, [=, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + std::advance(d_beg, curr_b - prev_e); + for(size_t x = curr_b; x>, void>* = nullptr +> +TF_FORCE_INLINE auto make_transform_task( + B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part +) { + + using namespace std::string_literals; + + using B1_t = std::decay_t>; + using E1_t = std::decay_t>; + using B2_t = std::decay_t>; + using O_t = std::decay_t>; + + return + [first1, last1, first2, d_first, c, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B1_t beg1 = first1; + E1_t end1 = last1; + B2_t beg2 = first2; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg1, end1); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg1, end1, beg2, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg1, curr_b - prev_e); + std::advance(beg2, curr_b - prev_e); + std::advance(d_beg, curr_b - prev_e); + for(size_t x = curr_b; x +Task FlowBuilder::transform(B first1, E last1, O d_first, C c, P&& part) { + return emplace( + detail::make_transform_task(first1, last1, d_first, c, std::forward
<P>
(part))
+  );
+}
+
+// ----------------------------------------------------------------------------
+// transform2
+// ----------------------------------------------------------------------------
+
+// Function: transform
+template <
+  typename B1, typename E1, typename B2, typename O, typename C, typename P,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>*
+>
+Task FlowBuilder::transform(
+  B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part
+) {
+
+  return emplace(detail::make_transform_task(
+    first1, last1, first2, d_first, c, std::forward<P>
(part) + )); +} + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/lib/taskflow/core/async.hpp b/lib/taskflow/core/async.hpp new file mode 100644 index 0000000..69788c6 --- /dev/null +++ b/lib/taskflow/core/async.hpp @@ -0,0 +1,396 @@ +#pragma once + +#include "executor.hpp" + +// https://hackmd.io/@sysprog/concurrency-atomics + +namespace tf { + +// ---------------------------------------------------------------------------- +// Async +// ---------------------------------------------------------------------------- + +// Function: async +template +auto Executor::async(const std::string& name, F&& f) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(f)) + ); + + _schedule_async_task(node); + + return fu; +} + +// Function: async +template +auto Executor::async(F&& f) { + return async("", std::forward(f)); +} + +// ---------------------------------------------------------------------------- +// Silent Async +// ---------------------------------------------------------------------------- + +// Function: silent_async +template +void Executor::silent_async(const std::string& name, F&& f) { + + _increment_topology(); + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, + std::in_place_type_t{}, std::forward(f) + ); + + _schedule_async_task(node); +} + +// Function: silent_async +template +void Executor::silent_async(F&& f) { + silent_async("", std::forward(f)); +} + +// ---------------------------------------------------------------------------- +// Async Helper Methods +// ---------------------------------------------------------------------------- + +// Function: _make_promised_async +template +auto Executor::_make_promised_async(std::promise&& p, F&& func) { + return [p=make_moc(std::move(p)), func=std::forward(func)]() mutable { + if constexpr(std::is_same_v) { + func(); + p.object.set_value(); + } + else { + p.object.set_value(func()); + } + }; +} + +// Procedure: _schedule_async_task +inline void Executor::_schedule_async_task(Node* node) { + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + // from runtime + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); + } + // from executor + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// ---------------------------------------------------------------------------- +// Silent Dependent Async +// ---------------------------------------------------------------------------- + +// Function: silent_dependent_async +template ...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, Tasks&&... tasks) { + return silent_dependent_async("", std::forward(func), std::forward(tasks)...); +} + +// Function: silent_dependent_async +template ...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +){ + + _increment_topology(); + + size_t num_dependents = sizeof...(Tasks); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, std::forward(func) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(node.get(), tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return AsyncTask(std::move(node)); +} + +// Function: silent_dependent_async +template , AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, I first, I last) { + return silent_dependent_async("", std::forward(func), first, last); +} + +// Function: silent_dependent_async +template , AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + size_t num_dependents = std::distance(first, last); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, std::forward(func) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + for(; first != last; first++){ + _process_async_dependent(node.get(), *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return AsyncTask(std::move(node)); +} + +// ---------------------------------------------------------------------------- +// Dependent Async +// ---------------------------------------------------------------------------- + +// Function: dependent_async +template ...>, void>* +> +auto Executor::dependent_async(F&& func, Tasks&&... tasks) { + return dependent_async("", std::forward(func), std::forward(tasks)...); +} + +// Function: dependent_async +template ...>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + size_t num_dependents = sizeof...(tasks); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(func)) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(node.get(), tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return std::make_pair(AsyncTask(std::move(node)), std::move(fu)); +} + +// Function: dependent_async +template , AsyncTask>, void>* +> +auto Executor::dependent_async(F&& func, I first, I last) { + return dependent_async("", std::forward(func), first, last); +} + +// Function: dependent_async +template , AsyncTask>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + size_t num_dependents = std::distance(first, last); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(func)) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + for(; first != last; first++) { + _process_async_dependent(node.get(), *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return std::make_pair(AsyncTask(std::move(node)), std::move(fu)); +} + +// ---------------------------------------------------------------------------- +// Dependent Async Helper Functions +// ---------------------------------------------------------------------------- + +// Procedure: _process_async_dependent +inline void Executor::_process_async_dependent( + Node* node, tf::AsyncTask& task, size_t& num_dependents +) { + + std::shared_ptr dep; + { + std::scoped_lock lock(_asyncs_mutex); + if(auto itr = _asyncs.find(task._node); itr != _asyncs.end()){ + dep = *itr; + } + } + + // if the dependent task exists + if(dep) { + auto& state = std::get_if(&(dep->_handle))->state; + + add_dependent: + + auto target = Node::AsyncState::UNFINISHED; + + // acquires the lock + if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + dep->_successors.push_back(node); + state.store(Node::AsyncState::UNFINISHED, std::memory_order_release); + } + // dep's state is FINISHED, which means dep finished its callable already + // thus decrement the node's join counter by 1 + else if (target == Node::AsyncState::FINISHED) { + // decrement the counter needs to be the order of acquire and release + // to synchronize with the worker + num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + } + // another worker adding an async task that shares the same dependent + else { + goto add_dependent; + } + } + else { + num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + } +} + +// Procedure: _tear_down_dependent_async +inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) { + + // this async task comes from Executor + auto& state = std::get_if(&(node->_handle))->state; + auto 
target = Node::AsyncState::UNFINISHED;
+
+  while(!state.compare_exchange_weak(target, Node::AsyncState::FINISHED,
+                                     std::memory_order_acq_rel,
+                                     std::memory_order_relaxed)) {
+    target = Node::AsyncState::UNFINISHED;
+  }
+
+  // spawn successors whenever their dependencies are resolved
+  worker._cache = nullptr;
+  for(size_t i=0; i<node->_successors.size(); ++i) {
+    //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+    if(auto s = node->_successors[i];
+      s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1
+    ) {
+      if(worker._cache) {
+        _schedule(worker, worker._cache);
+      }
+      worker._cache = s;
+    }
+  }
+
+  // remove myself from the asyncs using extraction to avoid calling
+  // ~Node inside the lock
+  typename std::unordered_set<std::shared_ptr<Node>>::node_type extracted;
+  {
+    std::shared_ptr<Node> ptr(node, [](Node*){});
+    std::scoped_lock lock(_asyncs_mutex);
+    extracted = _asyncs.extract(ptr);
+    // assert(extracted.empty() == false);
+  }
+
+  _decrement_topology_and_notify();
+}
+
+
+
+
+
+} // end of namespace tf -----------------------------------------------------
+
diff --git a/lib/taskflow/core/async_task.hpp b/lib/taskflow/core/async_task.hpp
new file mode 100644
index 0000000..7c92d8e
--- /dev/null
+++ b/lib/taskflow/core/async_task.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+#include "graph.hpp"
+
+/**
+@file async_task.hpp
+@brief asynchronous task include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// AsyncTask
+// ----------------------------------------------------------------------------
+
+/**
+@brief class to create a dependent asynchronous task
+
+A tf::AsyncTask is a lightweight handle that retains @em shared ownership
+of a dependent async task created by an executor.
+This shared ownership ensures that the async task remains alive when
+adding it to the dependency list of another async task,
+thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem).
+
+@code{.cpp}
+// main thread retains shared ownership of async task A
+tf::AsyncTask A = executor.silent_dependent_async([](){});
+
+// task A remains alive (i.e., at least one ref count by the main thread)
+// when being added to the dependency list of async task B
+tf::AsyncTask B = executor.silent_dependent_async([](){}, A);
+@endcode
+
+Currently, tf::AsyncTask is implemented on top of the C++ smart pointer
+std::shared_ptr and is considered cheap to copy or move as long as only
+a handful of objects own it.
+When a worker completes an async task, it removes the task from the executor,
+decrementing the number of shared owners by one.
+If that counter reaches zero, the task is destroyed.
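+
+The following example, a minimal sketch, checks whether a handle refers to
+a task and explicitly releases its shared ownership:
+
+@code{.cpp}
+tf::AsyncTask task = executor.silent_dependent_async([](){});
+assert(task.empty() == false);  // the handle refers to a task
+task.reset();                   // give up this handle's shared ownership
+assert(task.empty() == true);   // the handle no longer refers to a task
+@endcode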
+*/
+class AsyncTask {

+  friend class FlowBuilder;
+  friend class Runtime;
+  friend class Taskflow;
+  friend class TaskView;
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief constructs an empty task handle
+    */
+    AsyncTask() = default;
+
+    /**
+    @brief destroys the managed asynchronous task if this is the last owner
+    */
+    ~AsyncTask() = default;
+
+    /**
+    @brief constructs a task that shares ownership of @c rhs
+    */
+    AsyncTask(const AsyncTask& rhs) = default;
+
+    /**
+    @brief move-constructs a task from @c rhs
+    */
+    AsyncTask(AsyncTask&& rhs) = default;
+
+    /**
+    @brief shares ownership of the task managed by @c rhs
+    */
+    AsyncTask& operator = (const AsyncTask& rhs) = default;
+
+    /**
+    @brief move-assigns the task from @c rhs
+    */
+    AsyncTask& operator = (AsyncTask&& rhs) = default;
+
+    /**
+    @brief checks if the task stores a non-null shared pointer
+    */
+    bool empty() const;
+
+    /**
+    @brief releases the ownership
+    */
+    void reset();
+
+    /**
+    @brief obtains a hash value of the underlying node
+    */
+    size_t hash_value() const;
+
+  private:
+
+    AsyncTask(std::shared_ptr<Node>);
+
+    std::shared_ptr<Node> _node;
+};
+
+// Constructor
+inline AsyncTask::AsyncTask(std::shared_ptr<Node> ptr) : _node {std::move(ptr)} {
+}
+
+// Function: empty
+inline bool AsyncTask::empty() const {
+  return _node == nullptr;
+}
+
+// Function: reset
+inline void AsyncTask::reset() {
+  _node.reset();
+}
+
+// Function: hash_value
+inline size_t AsyncTask::hash_value() const {
+  return std::hash<std::shared_ptr<Node>>{}(_node);
+}
+
+} // end of namespace tf ----------------------------------------------------
+
+
+
diff --git a/lib/taskflow/core/declarations.hpp b/lib/taskflow/core/declarations.hpp
index b7f1b24..dd89ab3 100644
--- a/lib/taskflow/core/declarations.hpp
+++ b/lib/taskflow/core/declarations.hpp
@@ -2,19 +2,23 @@
 namespace tf {
 
+// ----------------------------------------------------------------------------
 // taskflow
+// ----------------------------------------------------------------------------
 class AsyncTopology;
 class Node;
 class Graph;
 class FlowBuilder;
 class Semaphore;
 class Subflow;
+class Runtime;
 class Task;
 class TaskView;
 class Taskflow;
 class Topology;
 class TopologyBase;
 class Executor;
+class Worker;
 class WorkerView;
 class ObserverInterface;
 class ChromeTracingObserver;
@@ -24,17 +28,29 @@ class TFProfManager;
 
 template <typename T>
 class Future;
 
+template <typename... Ps>
+class Pipeline;
+
+// ----------------------------------------------------------------------------
 // cudaFlow
-class cudaNode;
-class cudaGraph;
+// ----------------------------------------------------------------------------
+class cudaFlowNode;
+class cudaFlowGraph;
 class cudaTask;
 class cudaFlow;
 class cudaFlowCapturer;
-class cudaFlowCapturerBase;
-class cudaCapturingBase;
-class cudaSequentialCapturing;
-class cudaRoundRobinCapturing;
-class cublasFlowCapturer;
+class cudaFlowOptimizerBase;
+class cudaFlowLinearOptimizer;
+class cudaFlowSequentialOptimizer;
+class cudaFlowRoundRobinOptimizer;
+
+// ----------------------------------------------------------------------------
+// syclFlow
+// ----------------------------------------------------------------------------
+class syclNode;
+class syclGraph;
+class syclTask;
+class syclFlow;
 
 } // end of namespace tf -----------------------------------------------------
diff --git a/lib/taskflow/core/executor-module-opt.hpp b/lib/taskflow/core/executor-module-opt.hpp
new file mode 100644
index 0000000..0e2b1ee
--- /dev/null
+++ b/lib/taskflow/core/executor-module-opt.hpp
@@ -0,0 +1,2025 @@
+#pragma once
+
+#include 
"observer.hpp" +#include "taskflow.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + tf::Future run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. 
+ This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief wait for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. + + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. 
+ + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + /** + @brief runs a given function asynchronously + + @tparam F callable type + @tparam ArgsT parameter types + + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates an asynchronous task to launch the given + function on the given arguments. + Unlike std::async, the return here is a @em tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future> future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& f, ArgsT&&... args); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + @tparam ArgsT parameter types + + @param name name of the asynchronous task + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates a named asynchronous task to launch the given + function on the given arguments. + Naming an asynchronous task is primarily used for profiling and visualizing + the task execution timeline. + Unlike std::async, the return here is a tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future> future = executor.named_async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template + auto named_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief similar to tf::Executor::async but does not return a future object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& f, ArgsT&&... 
args); + + /** + @brief similar to tf::Executor::named_async but does not return a future object + + This member function is more efficient than tf::Executor::named_async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.named_silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template + void named_silent_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + private: + + std::condition_variable _topology_cv; + std::mutex _taskflow_mutex; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + + size_t _num_topologies {0}; + + std::unordered_map _wids; + std::vector _workers; + std::vector _threads; + std::list _taskflows; + + Notifier _notifier; + + TaskQueue _wsq; + + std::atomic _num_actives {0}; + std::atomic _num_thieves {0}; + std::atomic _done {0}; + + std::unordered_set> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _worker_loop(Worker&); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _consume_task(Worker&, Node*); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector&); + void _schedule(const SmallVector&); + void _set_up_topology(Worker*, Topology*); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_invoke(Worker&, Node*); + void _cancel_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _invoke_dynamic_task_external(Worker&, Node*, Graph&, bool); + void _invoke_dynamic_task_internal(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector&); + void _invoke_module_task(Worker&, Node*, bool&); + void _invoke_module_task_internal(Worker&, Node*, Graph&, bool&); + void _invoke_async_task(Worker&, Node*); + void _invoke_silent_async_task(Worker&, Node*); + void _invoke_cudaflow_task(Worker&, Node*); + void _invoke_syclflow_task(Worker&, Node*); + void _invoke_runtime_task(Worker&, Node*); + + template , void>* = nullptr + > + void _invoke_cudaflow_task_entry(Node*, C&&); + + template , void>* = nullptr + > + void 
_invoke_syclflow_task_entry(Node*, C&&, Q&); +}; + +// Constructor +inline Executor::Executor(size_t N) : + _workers {N}, + _notifier {N} { + + if(N == 0) { + TF_THROW("no cpu workers to execute taskflows"); + } + + _spawn(N); + + // instantite the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + _done = true; + + _notifier.notify(true); + + for(auto& t : _threads){ + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { + return _num_topologies; +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: named_async +template +auto Executor::named_async(const std::string& name, F&& f, ArgsT&&... args) { + + _increment_topology(); + + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } + + return fu; +} + +// Function: async +template +auto Executor::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward(f), std::forward(args)...); +} + +// Function: named_silent_async +template +void Executor::named_silent_async( + const std::string& name, F&& f, ArgsT&&... args +) { + + _increment_topology(); + + Node* node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else { + _schedule(node); + } +} + +// Function: silent_async +template +void Executor::silent_async(F&& f, ArgsT&&... args) { + named_silent_async("", std::forward(f), std::forward(args)...); +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + + std::mutex mutex; + std::condition_variable cond; + size_t n=0; + + for(size_t id=0; id void { + + // enables the mapping + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } + + //this_worker().worker = &w; + + Node* t = nullptr; + + // must use 1 as condition instead of !done + while(1) { + + // execute the tasks. 
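+        // Each iteration alternates between two phases: _exploit_task
+        // drains this worker's own queue, and _wait_for_task then tries to
+        // steal from peers, suspending on the notifier when nothing is found.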
+ _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + + }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n)); + } + + std::unique_lock lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +} + +// Function: _consume_task +inline void Executor::_consume_task(Worker& w, Node* p) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + while(p->_join_counter != 0) { + exploit: + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + //size_t num_pauses = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + if(t) { + _invoke(w, t); + goto exploit; + } + else if(p->_join_counter != 0){ + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + } + + //std::this_thread::yield(); + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + do { + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } while(!_done); + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + + if(t) { + + if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { + _notifier.notify(false); + } + + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } + + --_num_actives; + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + wait_for_task: + + //assert(!t); + + ++_num_thieves; + + explore_task: + + _explore_task(worker, t); + + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + + _notifier.prepare_wait(worker._waiter); + + //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { + if(!_wsq.empty()) { + + _notifier.cancel_wait(worker._waiter); + //t = (vtm == me) ? _wsq.steal() : _workers[vtm].wsq.steal(); + + t = _wsq.steal(); // must steal here + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + else { + worker._vtm = worker._id; + goto explore_task; + } + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + --_num_thieves; + return false; + } + + if(_num_thieves.fetch_sub(1) == 1) { + if(_num_actives) { + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + // check all queues again + for(auto& w : _workers) { + if(!w._wsq.empty()) { + worker._vtm = w._id; + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + } + } + + // Now I really need to relinguish my self to others + _notifier.commit_wait(worker._waiter); + + return true; +} + +// Function: make_observer +template +std::shared_ptr Executor::make_observer(ArgsT&&... 
args) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared(std::forward(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template +void Executor::remove_observer(std::shared_ptr ptr) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const noexcept { + return _observers.size(); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + // caller is a worker to this pool + if(worker._executor == this) { + worker._wsq.push(node); + return; + } + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule(Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule( + Worker& worker, const SmallVector& nodes +) { + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i_state.fetch_or(Node::READY, std::memory_order_release); + } + + if(worker._executor == this) { + for(size_t i=0; i lock(_wsq_mutex); + for(size_t k=0; k& nodes) { + + // parent topology may be removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i_state.fetch_or(Node::READY, std::memory_order_release); + } + + { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k conds; + + // synchronize all outstanding memory operations caused by reordering + do { + state = node->_state.load(std::memory_order_acquire); + } while(! 
(state & Node::READY)); + + // unwind stack for deferred node + if(state & Node::DEFERRED) { + node->_state.fetch_and(~Node::DEFERRED, std::memory_order_relaxed); + goto invoke_epilogue; + } + + //while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + + invoke_prologue: + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + _cancel_invoke(worker, node); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + //SmallVector conds = { -1 }; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // dynamic task + case Node::DYNAMIC: { + _invoke_dynamic_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + bool deferred = false; + _invoke_module_task(worker, node, deferred); + if(deferred) { + return; + } + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // silent async task + case Node::SILENT_ASYNC: { + _invoke_silent_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // cudaflow task + case Node::CUDAFLOW: { + _invoke_cudaflow_task(worker, node); + } + break; + + // syclflow task + case Node::SYCLFLOW: { + _invoke_syclflow_task(worker, node); + } + break; + + // runtime task + case Node::RUNTIME: { + _invoke_runtime_task(worker, node); + } + break; + + // monostate (placeholder) + default: + break; + } + + invoke_epilogue: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // We MUST recover the dependency since the graph may have cycles. + // This must be done before scheduling the successors, otherwise this might cause + // race condition on the _dependents + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter = node->num_strong_dependents(); + } + else { + node->_join_counter = node->num_dependents(); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + Node* cache {nullptr}; + + // At this point, the node storage might be destructed (to be verified) + // case 1: non-condition task + switch(node->_handle.index()) { + + // condition and multi-condition tasks + case Node::CONDITION: + case Node::MULTI_CONDITION: { + for(auto cond : conds) { + if(cond >= 0 && static_cast(cond) < node->_successors.size()) { + auto s = node->_successors[cond]; + // zeroing the join counter for invariant + s->_join_counter.store(0, std::memory_order_relaxed); + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = s; + } + } + } + break; + + // non-condition task + default: { + for(size_t i=0; i_successors.size(); ++i) { + if(--(node->_successors[i]->_join_counter) == 0) { + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = node->_successors[i]; + } + } + } + break; + } + + // tear_down the invoke + _tear_down_invoke(worker, node); + + // perform tail recursion elimination for the right-most child to reduce + // the number of expensive pop/push operations through the task queue + if(cache) { + node = cache; + //node->_state.fetch_or(Node::READY, std::memory_order_release); + goto invoke_prologue; + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1); + } + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// Proecdure: _tear_down_invoke +inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { + // we must check parent first before substracting the join counter, + // or it can introduce data race + if(auto parent = node->_parent; parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1) == 1) { + _tear_down_topology(worker, node->_topology); + } + } + else { + // prefetch the deferred status, as subtracting the join counter can + // immediately cause the other worker to release the subflow + auto deferred = parent->_state.load(std::memory_order_relaxed) & Node::DEFERRED; + if(parent->_join_counter.fetch_sub(1) == 1 && deferred) { + _schedule(worker, parent); + } + } +} + +// Procedure: _cancel_invoke +inline void Executor::_cancel_invoke(Worker& worker, Node* node) { + + switch(node->_handle.index()) { + // async task needs to carry out the promise + case Node::ASYNC: + std::get_if(&(node->_handle))->work(true); + _tear_down_async(node); + break; + + // silent async doesn't need to carry out the promise + case Node::SILENT_ASYNC: + _tear_down_async(node); + break; + + // tear down topology if the node is the last leaf + default: { + _tear_down_invoke(worker, node); + } + break; + } +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dynamic_task +inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { + + _observer_prologue(w, node); + + auto handle = 
std::get_if(&node->_handle); + + handle->subgraph._clear(); + + Subflow sf(*this, w, node, handle->subgraph); + + handle->work(sf); + + if(sf._joinable) { + _invoke_dynamic_task_internal(w, node, handle->subgraph); + } + + _observer_epilogue(w, node); +} + +// Procedure: _invoke_dynamic_task_external +inline void Executor::_invoke_dynamic_task_external( + Worker& w, Node* p, Graph& g, bool detach +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { + + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + + if(detach) { + n->_parent = nullptr; + n->_state.fetch_or(Node::DETACHED, std::memory_order_relaxed); + } + else { + n->_parent = p; + } + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + // detach here + if(detach) { + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size()); + _schedule(w, src); + } + // join here + else { + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); + } +} + +// Procedure: _invoke_dynamic_task_internal +inline void Executor::_invoke_dynamic_task_internal( + Worker& w, Node* p, Graph& g +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); +} + +// Procedure: _invoke_module_task_internal +inline void Executor::_invoke_module_task_internal( + Worker& w, Node* p, Graph& g, bool& deferred +) { + + // graph is empty and has no async tasks + if(g.empty()) { + return; + } + + // set deferred + deferred = true; + p->_state.fetch_or(Node::DEFERRED, std::memory_order_relaxed); + + SmallVector src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + conds = { std::get_if(&node->_handle)->work() }; + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + conds = std::get_if(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_cudaflow_task +inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(*this, node); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_syclflow_task +inline void Executor::_invoke_syclflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(*this, node); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& 
deferred) {
+  _observer_prologue(w, node);
+  _invoke_module_task_internal(
+    w, node, std::get_if<Node::Module>(&node->_handle)->graph, deferred
+  );
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_async_task
+inline void Executor::_invoke_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::Async>(&node->_handle)->work(false);
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_silent_async_task
+inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::SilentAsync>(&node->_handle)->work();
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_runtime_task
+inline void Executor::_invoke_runtime_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  Runtime rt(*this, w, node);
+  std::get_if<Node::Runtime>(&node->_handle)->work(rt);
+  _observer_epilogue(w, node);
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow& f) {
+  return run_n(f, 1, [](){});
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow&& f) {
+  return run_n(std::move(f), 1, [](){});
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow& f, C&& c) {
+  return run_n(f, 1, std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock, since a dynamic task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology_and_notify();
+    return tf::Future<void>(promise.get_future(), std::monostate{});
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));
+
+  // need to create the future before the topology can be torn down
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying the topology needs to be protected under the lock
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock lock(_taskflows_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>(pred), std::forward<C>(c));
+}
+
+// Procedure: _increment_topology
+inline void Executor::_increment_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  ++_num_topologies;
+}
+
+// Procedure: _decrement_topology_and_notify
+inline void Executor::_decrement_topology_and_notify() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  if(--_num_topologies == 0) {
+    _topology_cv.notify_all();
+  }
+}
+
+// Procedure: _decrement_topology
+inline void Executor::_decrement_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  --_num_topologies;
+}
+
+// Procedure: wait_for_all
+inline void Executor::wait_for_all() {
+  std::unique_lock<std::mutex> lock(_topology_mutex);
+  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
+}
+
+// Function: _set_up_topology
+inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {
+
+  // ---- under taskflow lock ----
+
+  tpg->_sources.clear();
+  tpg->_taskflow._graph._clear_detached();
+
+  // scan each node in the graph and build up the links
+  for(auto node : tpg->_taskflow._graph._nodes) {
+
+    node->_topology = tpg;
+    node->_state.store(0, std::memory_order_relaxed);
+
+    if(node->num_dependents() == 0) {
+      tpg->_sources.push_back(node);
+    }
+
+    node->_set_up_join_counter();
+  }
+
+  tpg->_join_counter = tpg->_sources.size();
+
+  if(worker) {
+    _schedule(*worker, tpg->_sources);
+  }
+  else {
+    _schedule(tpg->_sources);
+  }
+}
+
+// Function: _tear_down_topology
+inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) {
+
+  auto &f = tpg->_taskflow;
+
+  //assert(&tpg == &(f._topologies.front()));
+
+  // case 1: we still need to run the topology again
+  if(!tpg->_is_cancelled && !tpg->_pred()) {
+    //assert(tpg->_join_counter == 0);
+    std::lock_guard<std::mutex> lock(f._mutex);
+    tpg->_join_counter = tpg->_sources.size();
+    _schedule(worker, tpg->_sources);
+  }
+  // case 2: the final run of this topology
+  else {
+
+    // TODO: if the topology is cancelled, need to release all semaphores
+
+    if(tpg->_call != nullptr) {
+      tpg->_call();
+    }
+
+    // If there is another run (interleaved between locks)
+    if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) {
+      //assert(tpg->_join_counter == 0);
+
+      // Set the promise
+      tpg->_promise.set_value();
+      f._topologies.pop();
+      tpg = f._topologies.front().get();
+
+      // decrement the topology count; since this is not the last run, we do not notify
+      _decrement_topology();
+
+      // setting up the topology needs to be done under the lock, or it can
+      // introduce a memory-order error with the pop above
+      _set_up_topology(&worker, tpg);
+    }
+    else {
+      //assert(f._topologies.size() == 1);
+
+      // Need to back up the promise first here because the taskflow might be
+      // destroyed soon after calling get
+      auto p {std::move(tpg->_promise)};
+
+      // Back up the lambda capture in case it holds the topology pointer,
+      // to avoid releasing it on pop_front ahead of _mutex.unlock &
+      // _promise.set_value. Released safely when leaving the scope.
+      auto c {std::move(tpg->_call)};
+
+      // Get the satellite if any
+      auto s {f._satellite};
+
+      // Now we remove the topology from this taskflow
+      f._topologies.pop();
+
+      //f._mutex.unlock();
+      lock.unlock();
+
+      // We set the promise in the end in case the taskflow leaves the scope.
+ // After set_value, the caller will return from wait + p.set_value(); + + _decrement_topology_and_notify(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(s) { + std::scoped_lock lock(_taskflow_mutex); + _taskflows.erase(*s); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, false); + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, true); + _joinable = false; +} + +// Function: named_async +template +auto Subflow::named_async(const std::string& name, F&& f, ArgsT&&... args) { + return _named_async( + *_executor._this_worker(), name, std::forward(f), std::forward(args)... + ); +} + +// Function: _named_async +template +auto Subflow::_named_async( + Worker& w, + const std::string& name, + F&& f, + ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template +auto Subflow::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward(f), std::forward(args)...); +} + +// Function: _named_silent_async +template +void Subflow::_named_silent_async( + Worker& w, const std::string& name, F&& f, ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); +} + +// Function: silent_async +template +void Subflow::named_silent_async(const std::string& name, F&& f, ArgsT&&... args) { + _named_silent_async( + *_executor._this_worker(), name, std::forward(f), std::forward(args)... + ); +} + +// Function: named_silent_async +template +void Subflow::silent_async(F&& f, ArgsT&&... 
args) { + named_silent_async("", std::forward(f), std::forward(args)...); +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + auto node = task._node; + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1); + _executor._schedule(_worker, node); +} + +// Procedure: run +template +void Runtime::run(C&& callable) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + callable(sf); + if(sf._joinable) { + _executor._invoke_dynamic_task_internal(_worker, _parent, graph); + } + } + else { + static_assert(dependent_false_v, "unsupported task callable to run"); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + + + diff --git a/lib/taskflow/core/executor.hpp b/lib/taskflow/core/executor.hpp index 28cae53..a5607e0 100644 --- a/lib/taskflow/core/executor.hpp +++ b/lib/taskflow/core/executor.hpp @@ -2,283 +2,1119 @@ #include "observer.hpp" #include "taskflow.hpp" +#include "async_task.hpp" -/** +/** @file executor.hpp @brief executor include file */ namespace tf { -struct PerThread { - - Worker* worker; - PerThread() : worker{ nullptr } {} -}; - -thread_local PerThread per_thread; - // ---------------------------------------------------------------------------- // Executor Definition // ---------------------------------------------------------------------------- - /** @class Executor -@brief execution interface for running a taskflow graph +@brief class to create an executor for running a taskflow graph -An executor object manages a set of worker threads to run taskflow(s) +An executor manages a set of worker threads to run one or multiple taskflows using an efficient work-stealing scheduling algorithm. +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. 
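+
+For example, the following sketch (illustrative only; the taskflow contents
+and the number of submitting threads are assumptions, not part of this patch)
+submits two independent taskflows from two user threads:
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow1, taskflow2;
+taskflow1.emplace([](){ std::cout << "task in taskflow1\n"; });
+taskflow2.emplace([](){ std::cout << "task in taskflow2\n"; });
+
+// run() is thread-safe, so each thread may submit its own taskflow
+std::thread t1([&](){ executor.run(taskflow1).wait(); });
+std::thread t2([&](){ executor.run(taskflow2).wait(); });
+t1.join();
+t2.join();
+@endcode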
*/ class Executor { friend class FlowBuilder; friend class Subflow; - friend class cudaFlow; + friend class Runtime; + public: + /** + @brief constructs the executor with @c N worker threads - public: - /** - @brief constructs the executor with N worker threads - */ - explicit Executor(size_t N = std::thread::hardware_concurrency()); - - /** - @brief destructs the executor - */ - ~Executor(); + @param N number of workers (default std::thread::hardware_concurrency) + @param wix worker interface class to alter worker (thread) behaviors + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + + Users can alter the worker behavior, such as changing thread affinity, + via deriving an instance from tf::WorkerInterface. + */ + explicit Executor( + size_t N = std::thread::hardware_concurrency(), + std::shared_ptr wix = nullptr + ); - /** - @brief runs the taskflow once - - @param taskflow a tf::Taskflow object + /** + @brief destructs the executor - @return a tf::Future that will holds the result of the execution - */ - tf::Future run(Taskflow& taskflow); + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); - /** - @brief runs the taskflow once and invoke a callback upon completion + /** + @brief runs a taskflow once - @param taskflow a tf::Taskflow object - @param callable a callable object to be invoked after this run + @param taskflow a tf::Taskflow object - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run(Taskflow& taskflow, C&& callable); + @return a tf::Future that holds the result of the execution - /** - @brief runs the taskflow for N times - - @param taskflow a tf::Taskflow object - @param N number of runs + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. - @return a tf::Future that will holds the result of the execution - */ - tf::Future run_n(Taskflow& taskflow, size_t N); + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode - /** - @brief runs the taskflow for N times and then invokes a callback + This member function is thread-safe. - @param taskflow a tf::Taskflow - @param N number of runs - @param callable a callable object to be invoked after this run + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
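+
+  A minimal sketch of the lifetime rule above (the scope is hypothetical,
+  not part of this patch): wait on the returned future before the taskflow
+  goes out of scope.
+
+  @code{.cpp}
+  tf::Executor executor;
+  {
+    tf::Taskflow taskflow;
+    taskflow.emplace([](){ std::cout << "task\n"; });
+    tf::Future<void> future = executor.run(taskflow);
+    future.wait(); // OK: the taskflow is still alive here
+  }                // without the wait, the taskflow could be destroyed
+                   // while the executor is still running it
+  @endcode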
+ */ + tf::Future run(Taskflow& taskflow); - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + /** + @brief runs a moved taskflow once - /** - @brief runs the taskflow multiple times until the predicate becomes true and - then invokes a callback + @param taskflow a moved tf::Taskflow object - @param taskflow a tf::Taskflow - @param pred a boolean predicate to return true for stop + @return a tf::Future that holds the result of the execution - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_until(Taskflow& taskflow, P&& pred); + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. - /** - @brief runs the taskflow multiple times until the predicate becomes true and - then invokes the callback + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode - @param taskflow a tf::Taskflow - @param pred a boolean predicate to return true for stop - @param callable a callable object to be invoked after this run + This member function is thread-safe. + */ + tf::Future run(Taskflow&& taskflow); - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); - - /** - @brief wait for all pending graphs to complete - */ - void wait_for_all(); - - /** - @brief queries the number of worker threads (can be zero) - */ - size_t num_workers() const; - - /** - @brief queries the number of running topologies at the time of this call + /** + @brief runs a taskflow once and invoke a callback upon completion - When a taskflow is submitted to an executor, a topology is created to store - runtime metadata of the running taskflow. - */ - size_t num_topologies() const; + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run - /** - @brief queries the id of the caller thread in this executor + @return a tf::Future that holds the result of the execution - Each worker has an unique id from 0 to N-1 exclusive to the associated executor. - If the caller thread does not belong to the executor, -1 is returned. - */ - int this_worker_id() const; + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. - /** - @brief runs a given function asynchronously + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode - @tparam F callable type - @tparam ArgsT parameter types + This member function is thread-safe. - @param f callable object to call - @param args parameters to pass to the callable - - @return a tf::Future that will holds the result of the execution + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); - This method is thread-safe. Multiple threads can launch asynchronous tasks - at the same time. - */ - template - auto async(F&& f, ArgsT&&... 
args); - - /** - @brief similar to tf::Executor::async but does not return a future object - */ - template - void silent_async(F&& f, ArgsT&&... args); - - /** - @brief constructs an observer to inspect the activities of worker threads + /** + @brief runs a moved taskflow once and invoke a callback upon completion - Each executor manage a list of observers in shared ownership with callers. - - @tparam Observer observer type derived from tf::ObserverInterface - @tparam ArgsT argument parameter pack + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run - @param args arguments to forward to the constructor of the observer - - @return a shared pointer to the created observer - */ - template - std::shared_ptr make_observer(ArgsT&&... args); - - /** - @brief removes the associated observer - */ - template - void remove_observer(std::shared_ptr observer); + @return a tf::Future that holds the result of the execution - /** - @brief queries the number of observers - */ - size_t num_observers() const; + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. - private: + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode - + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); - const size_t _VICTIM_BEG; - const size_t _VICTIM_END; - const size_t _MAX_STEALS; - const size_t _MAX_YIELDS; - - std::condition_variable _topology_cv; - std::mutex _topology_mutex; - std::mutex _wsq_mutex; + /** + @brief runs a taskflow for @c N times - size_t _num_topologies {0}; - - std::vector _workers; - std::vector _threads; + @param taskflow a tf::Taskflow object + @param N number of runs - Notifier _notifier; + @return a tf::Future that holds the result of the execution - TaskQueue _wsq; + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. 
- std::atomic _num_actives {0}; - std::atomic _num_thieves {0}; - std::atomic _done {0}; - - std::unordered_set> _observers; + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode - bool _wait_for_task(Worker&, Node*&); - - void _observer_prologue(Worker&, Node*); - void _observer_epilogue(Worker&, Node*); - void _spawn(size_t); - void _worker_loop(Worker&); - void _exploit_task(Worker&, Node*&); - void _explore_task(Worker&, Node*&); - void _schedule(Node*); - void _schedule(const std::vector&); - void _invoke(Worker&, Node*); - void _invoke_static_task(Worker&, Node*); - void _invoke_dynamic_task(Worker&, Node*); - void _invoke_dynamic_task_internal(Worker&, Node*, Graph&, bool); - void _invoke_dynamic_task_external(Node*, Graph&, bool); - void _invoke_condition_task(Worker&, Node*, int&); - void _invoke_module_task(Worker&, Node*); - void _invoke_async_task(Worker&, Node*); - void _invoke_silent_async_task(Worker&, Node*); - void _set_up_topology(Topology*); - void _tear_down_topology(Topology*); - void _tear_down_async(Node*); - void _tear_down_invoke(Node*, bool); - void _increment_topology(); - void _decrement_topology(); - void _decrement_topology_and_notify(); - void _invoke_cudaflow_task(Worker&, Node*); - - template , void>* = nullptr - > - void _invoke_cudaflow_task_entry(C&&, Node*); - - template , void>* = nullptr - > - void _invoke_cudaflow_task_entry(C&&, Node*); - - //template - //void _invoke_cudaflow_task_internal(cudaFlow&, P&&, bool); + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
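+
+  As a concrete sketch (task contents assumed for illustration), the
+  callback fires exactly once, after the last of the @c N runs completes:
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([](){ std::cout << "one iteration\n"; });
+  executor.run_n(taskflow, 3, [](){ std::cout << "all 3 runs done\n"; }).wait();
+  @endcode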
+ */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
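+
+  Since run_n is built on run_until with a decrementing counter (see the
+  definitions earlier in this patch), a stateful predicate achieves the same
+  effect. A sketch with an assumed counter value:
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([](){ std::cout << "one run\n"; });
+  executor.run_until(
+    taskflow,
+    [n=3]() mutable { return n-- == 0; }, // checked before every run
+    [](){ std::cout << "done\n"; }        // invoked once after the last run
+  ).wait();
+  @endcode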
+ + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief runs a target graph and waits until it completes using + an internal worker of this executor + + @tparam T target type which has `tf::Graph& T::graph()` defined + @param target the target task graph object + + The method runs a target graph which has `tf::Graph& T::graph()` defined + and waits until the execution completes. + Unlike the typical flow of calling `tf::Executor::run` series + plus waiting on the result, this method must be called by an internal + worker of this executor. The caller worker will participate in + the work-stealing loop of the scheduler, therby avoiding potential + deadlock caused by blocked waiting. + + @code{.cpp} + tf::Executor executor(2); + tf::Taskflow taskflow; + std::array others; + + std::atomic counter{0}; + + for(size_t n=0; n<1000; n++) { + for(size_t i=0; i<1000; i++) { + others[n].emplace([&](){ counter++; }); + } + taskflow.emplace([&executor, &tf=others[n]](){ + executor.corun(tf); + //executor.run(tf).wait(); <- blocking the worker without doing anything + // will introduce deadlock + }); + } + executor.run(taskflow).wait(); + @endcode + + The method is thread-safe as long as the target is not concurrently + ran by two or more threads. + + @attention + You must call tf::Executor::corun from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun(T& target); + + /** + @brief keeps running the work-stealing loop until the predicate becomes true + + @tparam P predicate type + @param predicate a boolean predicate to indicate when to stop the loop + + The method keeps the caller worker running in the work-stealing loop + until the stop predicate becomes true. + + @code{.cpp} + taskflow.emplace([&](){ + std::future fu = std::async([](){ std::sleep(100s); }); + executor.corun_until([](){ + return fu.wait_for(std::chrono::seconds(0)) == future_status::ready; + }); + }); + @endcode + + @attention + You must call tf::Executor::corun_until from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun_until(P&& predicate); + + /** + @brief waits for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. 
+ + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. + + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + // -------------------------------------------------------------------------- + // Observer methods + // -------------------------------------------------------------------------- + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + // -------------------------------------------------------------------------- + // Async Task Methods + // -------------------------------------------------------------------------- + + /** + @brief runs a given function asynchronously + + @tparam F callable type + + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates an asynchronous task to run the given function + and return a @std_future object that eventually will hold the result + of the return value. 
+ + @code{.cpp} + std::future future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& func); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + + @param name name of the asynchronous task + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates and assigns a name to an asynchronous task + to run the given function, + returning @std_future object that eventually will hold the result + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::future future = executor.async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(const std::string& name, F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param name assigned name to the task + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + executor.silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(const std::string& name, F&& func); + + // -------------------------------------------------------------------------- + // Silent Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. 
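+
+  A slightly larger sketch (task names assumed for illustration): a diamond
+  dependency in which @c D waits on @c B and @c C, which both wait on @c A.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B after A\n"); }, A);
+  tf::AsyncTask C = executor.silent_dependent_async([](){ printf("C after A\n"); }, A);
+  executor.silent_dependent_async([](){ printf("D after B and C\n"); }, B, C);
+  executor.wait_for_all();
+  @endcode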
+ */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, A, B + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. 
+ + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, I first, I last); + + // -------------------------------------------------------------------------- + // Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + fuC.get(); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + auto dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, Tasks&&... 
tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + auto dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. 
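+
+  A sketch of collecting the dependents in a container first (the container
+  type is an illustrative choice); handles returned by silent_dependent_async
+  can be mixed in freely:
+
+  @code{.cpp}
+  std::vector<tf::AsyncTask> deps {
+    executor.silent_dependent_async([](){ printf("A\n"); }),
+    executor.silent_dependent_async([](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    [](){ printf("C after A and B\n"); return 1; },
+    deps.begin(), deps.end()
+  );
+  assert(fuC.get() == 1); // C finished, which implies A and B finished
+  @endcode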
+ */ + template , AsyncTask>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, I first, I last); + + private: - //template - //void _invoke_cudaflow_task_external(cudaFlow&, P&&, bool); + const size_t _MAX_STEALS; + + std::condition_variable _topology_cv; + std::mutex _taskflows_mutex; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + std::mutex _asyncs_mutex; + + size_t _num_topologies {0}; + + std::unordered_map _wids; + std::vector _threads; + std::vector _workers; + std::list _taskflows; + + std::unordered_set> _asyncs; + + Notifier _notifier; + + TaskQueue _wsq; + + std::atomic _done {0}; + + std::shared_ptr _worker_interface; + std::unordered_set> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector&); + void _schedule(const SmallVector&); + void _set_up_topology(Worker*, Topology*); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_dependent_async(Worker&, Node*); + void _tear_down_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _consume_graph(Worker&, Node*, Graph&); + void _detach_dynamic_task(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector&); + void _invoke_module_task(Worker&, Node*); + void _invoke_async_task(Worker&, Node*); + void _invoke_dependent_async_task(Worker&, Node*); + void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); + void _schedule_async_task(Node*); + + template + void _corun_until(Worker&, P&&); + + template + auto _make_promised_async(std::promise&&, F&&); }; // Constructor -inline Executor::Executor(size_t N) : - _VICTIM_BEG {0}, - _VICTIM_END {N - 1}, - _MAX_STEALS {(N + 1) << 1}, - _MAX_YIELDS {100}, +inline Executor::Executor(size_t N, std::shared_ptr wix) : + _MAX_STEALS {((N+1) << 1)}, + _threads {N}, _workers {N}, - _notifier {N} { - + _notifier {N}, + _worker_interface {std::move(wix)} { + if(N == 0) { TF_THROW("no cpu workers to execute taskflows"); } - + _spawn(N); // instantite the default observer if requested @@ -289,25 +1125,22 @@ inline Executor::Executor(size_t N) : // Destructor inline Executor::~Executor() { - + // wait for all topologies to complete wait_for_all(); - + // shut down the scheduler _done = true; _notifier.notify(true); - + for(auto& t : _threads){ t.join(); - } - - // flush the default observer - //_flush_tfprof(); + } } // Function: num_workers -inline size_t Executor::num_workers() const { +inline size_t Executor::num_workers() const noexcept { return _workers.size(); } @@ -315,140 +1148,169 @@ inline size_t Executor::num_workers() const { inline size_t Executor::num_topologies() const { return _num_topologies; } - -// Function: async -template -auto Executor::async(F&& f, ArgsT&&... 
args) { - - _increment_topology(); - - using T = std::invoke_result_t; - using R = std::conditional_t, void, std::optional>; - std::promise p; - - auto tpg = std::make_shared(); - - Future fu(p.get_future(), tpg); - - auto node = node_pool.animate( - std::in_place_type_t{}, - [p=make_moc(std::move(p)), f=std::forward(f), args...] - (bool cancel) mutable { - if constexpr(std::is_same_v) { - if(!cancel) { - f(args...); - } - p.object.set_value(); - } - else { - p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); - } - }, - std::move(tpg) - ); - - _schedule(node); - - return fu; +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); } -// Function: silent_async -template -void Executor::silent_async(F&& f, ArgsT&&... args) { - - _increment_topology(); - - Node* node = node_pool.animate( - std::in_place_type_t{}, - [f=std::forward(f), args...] () mutable { - f(args...); - } - ); - - _schedule(node); +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; } // Function: this_worker_id inline int Executor::this_worker_id() const { - auto worker = per_thread.worker; - return worker ? static_cast(worker->_id) : -1; + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); } // Procedure: _spawn inline void Executor::_spawn(size_t N) { + + std::mutex mutex; + std::condition_variable cond; + size_t n=0; + for(size_t id=0; id void { - per_thread.worker = &w; + _threads[id] = std::thread([this] ( + Worker& w, std::mutex& mutex, std::condition_variable& cond, size_t& n + ) -> void { + + // assign the thread + w._thread = &_threads[w._id]; + + // enables the mapping + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } Node* t = nullptr; + + // before entering the scheduler (work-stealing loop), + // call the user-specified prologue function + if(_worker_interface) { + _worker_interface->scheduler_prologue(w); + } + + // must use 1 as condition instead of !done because + // the previous worker may stop while the following workers + // are still preparing for entering the scheduling loop + std::exception_ptr ptr{nullptr}; + try { + while(1) { + + // execute the tasks. + _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + } + catch(...) { + ptr = std::current_exception(); + } + + // call the user-specified epilogue function + if(_worker_interface) { + _worker_interface->scheduler_epilogue(w, ptr); + } - // must use 1 as condition instead of !done - while(1) { - - // execute the tasks. 
- _exploit_task(w, t); + }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n)); + + // POSIX-like system can use the following to affine threads to cores + //cpu_set_t cpuset; + //CPU_ZERO(&cpuset); + //CPU_SET(id, &cpuset); + //pthread_setaffinity_np( + // _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset + //); + } + + std::unique_lock lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +} - // wait for tasks - if(_wait_for_task(w, t) == false) { - break; +// Function: _corun_until +template +void Executor::_corun_until(Worker& w, P&& stop_predicate) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + exploit: + + while(!stop_predicate()) { + + //exploit: + + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + _invoke(w, t); + goto exploit; + } + else if(!stop_predicate()) { + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); } + w._vtm = rdvtm(w._rdgen); + goto explore; } - - }, std::ref(_workers[id])); + else { + break; + } + } } } // Function: _explore_task inline void Executor::_explore_task(Worker& w, Node*& t) { - + //assert(_workers[w].wsq.empty()); - assert(!t); + //assert(!t); size_t num_steals = 0; size_t num_yields = 0; - std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); - - //while(!_done) { - // - // size_t vtm = rdvtm(w._rdgen); - // - // t = (vtm == w._id) ? _wsq[d].steal() : _workers[vtm].wsq[d].steal(); - - // if(t) { - // break; - // } - - // if(num_steal++ > _MAX_STEALS) { - // std::this_thread::yield(); - // if(num_yields++ > _MAX_YIELDS) { - // break; - // } - // } - //} - + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + // Here, we write do-while to make the worker steal at once + // from the assigned victim. do { t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); if(t) { break; } - + if(num_steals++ > _MAX_STEALS) { std::this_thread::yield(); - if(num_yields++ > _MAX_YIELDS) { + if(num_yields++ > 100) { break; } } - + w._vtm = rdvtm(w._rdgen); } while(!_done); @@ -456,92 +1318,58 @@ inline void Executor::_explore_task(Worker& w, Node*& t) { // Procedure: _exploit_task inline void Executor::_exploit_task(Worker& w, Node*& t) { - - if(t) { - - if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { - _notifier.notify(false); - } - - while(t) { - _invoke(w, t); - t = w._wsq.pop(); - } - - --_num_actives; + while(t) { + _invoke(w, t); + t = w._wsq.pop(); } } // Function: _wait_for_task inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { - wait_for_task: - - assert(!t); - - ++_num_thieves; - explore_task: _explore_task(worker, t); - + + // The last thief who successfully stole a task will wake up + // another thief worker to avoid starvation. if(t) { - if(_num_thieves.fetch_sub(1) == 1) { - _notifier.notify(false); - } + _notifier.notify(false); return true; } + // ---- 2PC guard ---- _notifier.prepare_wait(worker._waiter); - - //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { - if(!_wsq.empty()) { + if(!_wsq.empty()) { _notifier.cancel_wait(worker._waiter); - //t = (vtm == me) ? 
 _wsq.steal() : _workers[vtm].wsq.steal();
-
- t = _wsq.steal(); // must steal here
- if(t) {
- if(_num_thieves.fetch_sub(1) == 1) {
- _notifier.notify(false);
- }
- return true;
- }
- else {
- worker._vtm = worker._id;
- goto explore_task;
- }
+ worker._vtm = worker._id;
+ goto explore_task;
 }
-
+
 if(_done) {
 _notifier.cancel_wait(worker._waiter);
 _notifier.notify(true);
- --_num_thieves;
 return false;
 }
-
- if(_num_thieves.fetch_sub(1) == 1) {
- if(_num_actives) {
+
+ // We need to use index-based scanning to avoid a data race
+ // with _spawn which may initialize a worker at the same time.
+ for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+ if(!_workers[vtm]._wsq.empty()) {
 _notifier.cancel_wait(worker._waiter);
- goto wait_for_task;
- }
- // check all queues again
- for(auto& w : _workers) {
- if(!w._wsq.empty()) {
- worker._vtm = w._id;
- _notifier.cancel_wait(worker._waiter);
- goto wait_for_task;
- }
+ worker._vtm = vtm;
+ goto explore_task;
 }
 }
-
+ // Now I really need to relinquish myself to others
 _notifier.commit_wait(worker._waiter);
- return true;
+ goto explore_task;
}

-// Function: make_observer
+// Function: make_observer
template
std::shared_ptr Executor::make_observer(ArgsT&&... args) {
@@ -549,10 +1377,10 @@ std::shared_ptr Executor::make_observer(ArgsT&&... args) {
 std::is_base_of_v,
 "Observer must be derived from ObserverInterface"
 );
-
- // use a local variable to mimic the constructor
+
+ // use a local variable to mimic the constructor
 auto ptr = std::make_shared(std::forward(args)...);
-
+
 ptr->set_up(_workers.size());

 _observers.emplace(std::static_pointer_cast(ptr));
@@ -563,7 +1391,7 @@ std::shared_ptr Executor::make_observer(ArgsT&&... args) {

// Procedure: remove_observer
template
void Executor::remove_observer(std::shared_ptr ptr) {
-
+
 static_assert(
 std::is_base_of_v,
 "Observer must be derived from ObserverInterface"
@@ -573,114 +1401,170 @@ void Executor::remove_observer(std::shared_ptr ptr) {
}

// Function: num_observers
-inline size_t Executor::num_observers() const {
+inline size_t Executor::num_observers() const noexcept {
 return _observers.size();
}

// Procedure: _schedule
-// The main procedure to schedule a give task node.
-// Each task node has two types of tasks - regular and subflow.
-inline void Executor::_schedule(Node* node) {
+inline void Executor::_schedule(Worker& worker, Node* node) {

- //assert(_workers.size() != 0);
-
- // caller is a worker to this pool
- auto worker = per_thread.worker;
-
- if(worker != nullptr && worker->_executor == this) {
- worker->_wsq.push(node);
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = node->_priority;
+
+ node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+ // caller is a worker to this pool - starting at v3.5 we do not use
+ // any complicated notification mechanism as the experimental result
+ // has shown no significant advantage.
+ if(worker._executor == this) {
+ worker._wsq.push(node, p);
+ _notifier.notify(false);
 return;
 }

- // other threads
 {
 std::lock_guard lock(_wsq_mutex);
- _wsq.push(node);
+ _wsq.push(node, p);
 }

 _notifier.notify(false);
}

// Procedure: _schedule
-// The main procedure to schedule a set of task nodes.
-// Each task node has two types of tasks - regular and subflow.
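The 2PC guard in `_wait_for_task` above follows the classic two-phase wait protocol: announce the intent to sleep, re-check all queues, then either cancel or commit the wait, so a notification that arrives in between is never lost. The following is a minimal sketch of that prepare/cancel/commit idea for a single waiter, using a mutex and condition variable rather than the lock-free tf::Notifier (the class and member names here are illustrative, not Taskflow's API):

@code{.cpp}
#include <condition_variable>
#include <mutex>

// Illustrative two-phase waiter, not Taskflow code.
class TwoPhaseWaiter {
 public:
  void prepare_wait() {                    // phase 1: announce intent
    std::lock_guard<std::mutex> lock(_mutex);
    _observed = _epoch;                    // snapshot before re-checking queues
  }
  void cancel_wait() {}                    // found work; nothing to undo here
  void commit_wait() {                     // phase 2: sleep unless notified
    std::unique_lock<std::mutex> lock(_mutex);
    _cv.wait(lock, [this] { return _epoch != _observed; });
  }
  void notify() {
    { std::lock_guard<std::mutex> lock(_mutex); ++_epoch; }
    _cv.notify_one();
  }
 private:
  std::mutex _mutex;
  std::condition_variable _cv;
  unsigned long _epoch = 0;                // bumped by every notification
  unsigned long _observed = 0;
};
@endcode

A notification arriving between prepare_wait() and commit_wait() bumps _epoch past the snapshot, so the wait predicate is already true and the waiter does not go to sleep.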
-inline void Executor::_schedule(const std::vector& nodes) {
-
- //assert(_workers.size() != 0);
+inline void Executor::_schedule(Node* node) {

+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = node->_priority;
+
+ node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+ {
+ std::lock_guard lock(_wsq_mutex);
+ _wsq.push(node, p);
+ }
+
+ _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, const SmallVector& nodes) {

+ // We need to capture the node count to avoid accessing the nodes
 // vector while the parent topology is removed!
 const auto num_nodes = nodes.size();
+
 if(num_nodes == 0) {
 return;
 }

- // worker thread
- auto worker = per_thread.worker;
-
- if(worker != nullptr && worker->_executor == this) {
+ // caller is a worker to this pool - starting at v3.5 we do not use
+ // any complicated notification mechanism as the experimental result
+ // has shown no significant advantage.
+ if(worker._executor == this) {
 for(size_t i=0; i<num_nodes; i++) {
- worker->_wsq.push(nodes[i]);
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = nodes[i]->_priority;
+ nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+ worker._wsq.push(nodes[i], p);
+ _notifier.notify(false);
 }
 return;
 }
-
- // other threads
+
 {
 std::lock_guard lock(_wsq_mutex);
 for(size_t k=0; k<num_nodes; k++) {
- _wsq.push(nodes[k]);
+ auto p = nodes[k]->_priority;
+ nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+ _wsq.push(nodes[k], p);
 }
 }
-
+
+ _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector& nodes) {
+
+ // parent topology may be removed!
+ const auto num_nodes = nodes.size();
+
+ if(num_nodes == 0) {
+ return;
+ }
+
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ {
+ std::lock_guard lock(_wsq_mutex);
+ for(size_t k=0; k<num_nodes; k++) {
+ auto p = nodes[k]->_priority;
+ nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+ _wsq.push(nodes[k], p);
+ }
+ }
+
 _notifier.notify_n(num_nodes);
}

// Procedure: _invoke
inline void Executor::_invoke(Worker& worker, Node* node) {
-
+
+ // synchronize all outstanding memory operations caused by reordering
+ while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+
+ begin_invoke:
+
 // no need to do other things if the topology is cancelled
- //if(node->_topology && node->_topology->_is_cancelled) {
 if(node->_is_cancelled()) {
- _tear_down_invoke(node, true);
+ _tear_down_invoke(worker, node);
 return;
 }

 // if acquiring semaphore(s) exists, acquire them first
 if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
- std::vector nodes;
+ SmallVector nodes;
 if(!node->_acquire_all(nodes)) {
- _schedule(nodes);
+ _schedule(worker, nodes);
 return;
 }
- node->_set_state(Node::ACQUIRED);
+ node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
 }

- // Here we need to fetch the num_successors first to avoid the invalid memory
- // access caused by topology clear.
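The comment repeated in all three `_schedule` overloads describes a publish pattern: the scheduler reads `_priority` and finishes its plain writes first, then sets `Node::READY` with a release store; the acquire spin at the top of `_invoke` pairs with it, making those writes visible before the node runs. A reduced, self-contained sketch of that release/acquire pairing (the types and names below are illustrative, not Taskflow's):

@code{.cpp}
#include <atomic>
#include <cassert>
#include <thread>

struct Slot {
  int priority = 0;                  // plain field published below
  std::atomic<unsigned> state{0};
  static constexpr unsigned READY = 1;
};

int main() {
  Slot s;
  std::thread producer([&s] {
    s.priority = 42;                 // plain write happens-before the release
    s.state.fetch_or(Slot::READY, std::memory_order_release);
  });
  // the acquire load pairs with the release store: once READY is
  // observed, priority == 42 is guaranteed to be visible
  while(!(s.state.load(std::memory_order_acquire) & Slot::READY));
  assert(s.priority == 42);
  producer.join();
}
@endcode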
- const auto num_successors = node->num_successors(); - // condition task - int cond = -1; - + //int cond = -1; + SmallVector conds; + // switch is faster than nested if-else due to jump table switch(node->_handle.index()) { // static task case Node::STATIC:{ _invoke_static_task(worker, node); - } + } break; - + // dynamic task case Node::DYNAMIC: { _invoke_dynamic_task(worker, node); } break; - + // condition task case Node::CONDITION: { - _invoke_condition_task(worker, node, cond); + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); } break; @@ -693,114 +1577,129 @@ inline void Executor::_invoke(Worker& worker, Node* node) { // async task case Node::ASYNC: { _invoke_async_task(worker, node); - _tear_down_invoke(node, false); - return ; - } - break; - - // silent async task - case Node::SILENT_ASYNC: { - _invoke_silent_async_task(worker, node); - _tear_down_invoke(node, false); + _tear_down_async(node); return ; } break; - // cudaflow task - case Node::CUDAFLOW: { - _invoke_cudaflow_task(worker, node); + // dependent async task + case Node::DEPENDENT_ASYNC: { + _invoke_dependent_async_task(worker, node); + _tear_down_dependent_async(worker, node); + if(worker._cache) { + node = worker._cache; + goto begin_invoke; + } + return; } - break; + break; - // monostate + // monostate (placeholder) default: break; } // if releasing semaphores exist, release them if(node->_semaphores && !node->_semaphores->to_release.empty()) { - _schedule(node->_release_all()); + _schedule(worker, node->_release_all()); } - - // We MUST recover the dependency since the graph may have cycles. - // This must be done before scheduling the successors, otherwise this might cause - // race condition on the _dependents - if(node->_has_state(Node::BRANCHED)) { - node->_join_counter = node->num_strong_dependents(); - } - else { - node->_join_counter = node->num_dependents(); - } - - // acquire the parent flow counter - auto& j = (node->_parent) ? node->_parent->_join_counter : - node->_topology->_join_counter; - // At this point, the node storage might be destructed (to be verified) - // case 1: non-condition task - if(node->_handle.index() != Node::CONDITION) { - for(size_t i=0; i_successors[i]->_join_counter) == 0) { - j.fetch_add(1); - _schedule(node->_successors[i]); - } - } + // Reset the join counter to support the cyclic control flow. + // + We must do this before scheduling the successors to avoid race + // condition on _dependents. + // + We must use fetch_add instead of direct assigning + // because the user-space call on "invoke" may explicitly schedule + // this task again (e.g., pipeline) which can access the join_counter. 
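The point about `fetch_add` versus direct assignment is subtle enough to deserve a toy illustration before the reset code that follows: if another thread (for example, a pipeline stage re-scheduling this task) concurrently decrements the counter, a plain store can overwrite that decrement, while a read-modify-write preserves it. A hypothetical sketch, not Taskflow code:

@code{.cpp}
#include <atomic>

std::atomic<int> join_counter{0};

// reset for the next round of a cyclic graph
void reset_with_fetch_add() {
  join_counter.fetch_add(2, std::memory_order_relaxed);  // folds in concurrent updates
}

void reset_with_store() {
  // lost-update hazard: may erase a concurrent fetch_sub from another thread
  join_counter.store(2, std::memory_order_relaxed);
}

// another thread marking one dependency as satisfied
void signal_one_dependency() {
  join_counter.fetch_sub(1, std::memory_order_acq_rel);
}
@endcode

If `signal_one_dependency()` interleaves with a plain store, the counter can end at 2 instead of 1 and the decrement is lost; `fetch_add` always preserves it.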
+ if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
+ node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed);
 }
- // case 2: condition task
 else {
- if(cond >= 0 && static_cast(cond) < num_successors) {
- auto s = node->_successors[cond];
- s->_join_counter.store(0); // seems redundant but just for invariant
- j.fetch_add(1);
- _schedule(s);
- }
+ node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed);
 }
-
- // tear_down the invoke
- _tear_down_invoke(node, false);
-}

-// Procedure: _tear_down_async
-inline void Executor::_tear_down_async(Node* node) {
- if(node->_parent) {
- node->_parent->_join_counter.fetch_sub(1);
- }
- else {
- _decrement_topology_and_notify();
- }
- node_pool.recycle(node);
-}
+ // acquire the parent flow counter
+ auto& j = (node->_parent) ? node->_parent->_join_counter :
+ node->_topology->_join_counter;

-// Procedure: _tear_down_invoke
-inline void Executor::_tear_down_invoke(Node* node, bool cancel) {
+ // Here, we want to cache the latest successor with the highest priority
+ worker._cache = nullptr;
+ auto max_p = static_cast<unsigned>(TaskPriority::MAX);

+ // Invoke the task based on the corresponding type
 switch(node->_handle.index()) {

- // async task needs to carry out the promise
- case Node::ASYNC:
- if(cancel) {
- std::get(node->_handle).work(true);
- }
- _tear_down_async(node);
- break;

- // silent async doesn't need to carry out the promise
- case Node::SILENT_ASYNC:
- _tear_down_async(node);
+ // condition and multi-condition tasks
+ case Node::CONDITION:
+ case Node::MULTI_CONDITION: {
+ for(auto cond : conds) {
+ if(cond >= 0 && static_cast(cond) < node->_successors.size()) {
+ auto s = node->_successors[cond];
+ // zeroing the join counter for invariant
+ s->_join_counter.store(0, std::memory_order_relaxed);
+ j.fetch_add(1, std::memory_order_relaxed);
+ if(s->_priority <= max_p) {
+ if(worker._cache) {
+ _schedule(worker, worker._cache);
+ }
+ worker._cache = s;
+ max_p = s->_priority;
+ }
+ else {
+ _schedule(worker, s);
+ }
+ }
+ }
+ }
 break;

- // tear down topology if the node is the last leaf
+ // non-condition task
 default: {
- if(node->_parent == nullptr) {
- if(node->_topology->_join_counter.fetch_sub(1) == 1) {
- _tear_down_topology(node->_topology);
+ for(size_t i=0; i<node->_successors.size(); ++i) {
+ //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+ if(auto s = node->_successors[i];
+ s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+ j.fetch_add(1, std::memory_order_relaxed);
+ if(s->_priority <= max_p) {
+ if(worker._cache) {
+ _schedule(worker, worker._cache);
+ }
+ worker._cache = s;
+ max_p = s->_priority;
+ }
+ else {
+ _schedule(worker, s);
+ }
 }
 }
- else { // joined subflow
- node->_parent->_join_counter.fetch_sub(1);
- }
 }
 break;
 }

+ // tear down the invoke
+ _tear_down_invoke(worker, node);
+
+ // perform tail recursion elimination for the right-most child to reduce
+ // the number of expensive pop/push operations through the task queue
+ if(worker._cache) {
+ node = worker._cache;
+ //node->_state.fetch_or(Node::READY, std::memory_order_release);
+ goto begin_invoke;
+ }
+}
+
+// Procedure: _tear_down_invoke
+inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
+ // we must check parent first before subtracting the join counter,
+ // or it can introduce a data race
+ if(node->_parent == nullptr) {
+ if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+ _tear_down_topology(worker,
node->_topology); + } + } + // joined subflow + else { + node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); + } } // Procedure: _observer_prologue @@ -820,7 +1719,17 @@ inline void Executor::_observer_epilogue(Worker& worker, Node* node) { // Procedure: _invoke_static_task inline void Executor::_invoke_static_task(Worker& worker, Node* node) { _observer_prologue(worker, node); - std::get(node->_handle).work(); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } _observer_epilogue(worker, node); } @@ -829,145 +1738,138 @@ inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { _observer_prologue(w, node); - auto& handle = std::get(node->_handle); + auto handle = std::get_if(&node->_handle); - handle.subgraph.clear(); + handle->subgraph._clear(); - Subflow sf(*this, node, handle.subgraph); + Subflow sf(*this, w, node, handle->subgraph); - handle.work(sf); + handle->work(sf); if(sf._joinable) { - _invoke_dynamic_task_internal(w, node, handle.subgraph, false); + _consume_graph(w, node, handle->subgraph); } - + _observer_epilogue(w, node); } -// Procedure: _invoke_dynamic_task_external -inline void Executor::_invoke_dynamic_task_external(Node*p, Graph& g, bool detach) { +// Procedure: _detach_dynamic_task +inline void Executor::_detach_dynamic_task( + Worker& w, Node* p, Graph& g +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { - auto worker = per_thread.worker; + n->_state.store(Node::DETACHED, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_topology = p->_topology; + n->_parent = nullptr; + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } - assert(worker && worker->_executor == this); - - _invoke_dynamic_task_internal(*worker, p, g, detach); + p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + _schedule(w, src); } -// Procedure: _invoke_dynamic_task_internal -inline void Executor::_invoke_dynamic_task_internal( - Worker& w, Node* p, Graph& g, bool detach -) { +// Procedure: _consume_graph +inline void Executor::_consume_graph(Worker& w, Node* p, Graph& g) { // graph is empty and has no async tasks - if(g.empty() && p->_join_counter == 0) { + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { return; } - std::vector src; + SmallVector src; for(auto n : g._nodes) { - - n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); n->_set_up_join_counter(); - - if(detach) { - n->_parent = nullptr; - n->_set_state(Node::DETACHED); - } - else { - n->_parent = p; - } - + n->_topology = p->_topology; + n->_parent = p; if(n->num_dependents() == 0) { src.push_back(n); } } + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); - // detach here - if(detach) { - - { - std::lock_guard lock(p->_topology->_taskflow._mtx); - p->_topology->_taskflow._graph.merge(std::move(g)); - } - - p->_topology->_join_counter.fetch_add(src.size()); - _schedule(src); - } - // join here - else { - p->_join_counter.fetch_add(src.size()); - _schedule(src); - Node* t = nullptr; - - std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); - - 
while(p->_join_counter != 0) { - - t = w._wsq.pop(); - - exploit: - - if(t) { - _invoke(w, t); - } - else { - explore: - t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); - if(t) { - goto exploit; - } - else if(p->_join_counter != 0){ - std::this_thread::yield(); - w._vtm = rdvtm(w._rdgen); - goto explore; - } - else { - break; - } - } - } - } + _schedule(w, src); + _corun_until(w, [p] () -> bool { return p->_join_counter.load(std::memory_order_acquire) == 0; }); } // Procedure: _invoke_condition_task inline void Executor::_invoke_condition_task( - Worker& worker, Node* node, int& cond + Worker& worker, Node* node, SmallVector& conds ) { _observer_prologue(worker, node); - cond = std::get(node->_handle).work(); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = { std::get_if<0>(&work)->operator()() }; + break; + + case 1: + Runtime rt(*this, worker, node); + conds = { std::get_if<1>(&work)->operator()(rt) }; + break; + } _observer_epilogue(worker, node); } -// Procedure: _invoke_cudaflow_task -inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { - _observer_prologue(worker, node); - std::get(node->_handle).work(*this, node); +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + conds = std::get_if<1>(&work)->operator()(rt); + break; + } _observer_epilogue(worker, node); } - // Procedure: _invoke_module_task inline void Executor::_invoke_module_task(Worker& w, Node* node) { _observer_prologue(w, node); - auto module = std::get(node->_handle).module; - _invoke_dynamic_task_internal(w, node, module->_graph, false); - _observer_epilogue(w, node); + _consume_graph( + w, node, std::get_if(&node->_handle)->graph + ); + _observer_epilogue(w, node); } // Procedure: _invoke_async_task inline void Executor::_invoke_async_task(Worker& w, Node* node) { _observer_prologue(w, node); - std::get(node->_handle).work(false); - _observer_epilogue(w, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(w, node); } -// Procedure: _invoke_silent_async_task -inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) { +// Procedure: _invoke_dependent_async_task +inline void Executor::_invoke_dependent_async_task(Worker& w, Node* node) { _observer_prologue(w, node); - std::get(node->_handle).work(); - _observer_epilogue(w, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(w, node); } // Function: run @@ -975,17 +1877,33 @@ inline tf::Future Executor::run(Taskflow& f) { return run_n(f, 1, [](){}); } +// Function: run +inline tf::Future Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + // Function: run template tf::Future Executor::run(Taskflow& f, C&& c) { return run_n(f, 1, std::forward(c)); } +// Function: run +template +tf::Future Executor::run(Taskflow&& f, C&& c) { + return run_n(std::move(f), 1, std::forward(c)); +} + // Function: run_n inline tf::Future Executor::run_n(Taskflow& f, size_t repeat) { return run_n(f, repeat, [](){}); } +// Function: run_n +inline tf::Future Executor::run_n(Taskflow&& f, size_t repeat) { + return run_n(std::move(f), repeat, [](){}); +} + // Function: run_n template tf::Future Executor::run_n(Taskflow& f, size_t 
repeat, C&& c) { @@ -994,28 +1912,149 @@ tf::Future Executor::run_n(Taskflow& f, size_t repeat, C&& c) { ); } -// Function: run_until +// Function: run_n +template +tf::Future Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { + return run_until( + std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward(c) + ); +} + +// Function: run_until template tf::Future Executor::run_until(Taskflow& f, P&& pred) { return run_until(f, std::forward
<P>
(pred), [](){}); } -// Function: _set_up_topology -inline void Executor::_set_up_topology(Topology* tpg) { +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred) { + return run_until(std::move(f), std::forward
<P>
(pred), [](){});
+}

- if(tpg->_is_cancelled) {
- _tear_down_topology(tpg);
- return;
+// Function: run_until
+template
+tf::Future Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+ _increment_topology();
+
+ // Need to check emptiness under the lock since a dynamic task may
+ // define detached blocks that modify the taskflow at the same time
+ bool empty;
+ {
+ std::lock_guard lock(f._mutex);
+ empty = f.empty();
 }

- tpg->_sources.clear();
- tpg->_taskflow._graph.clear_detached();
+ // No need to create a real topology but returns a dummy future
+ if(empty || p()) {
+ c();
+ std::promise promise;
+ promise.set_value();
+ _decrement_topology_and_notify();
+ return tf::Future(promise.get_future(), std::monostate{});
+ }
+
+ // create a topology for this run
+ auto t = std::make_shared(f, std::forward
<P>
(p), std::forward(c)); + + // need to create future before the topology got torn down quickly + tf::Future future(t->_promise.get_future(), t); + + // modifying topology needs to be protected under the lock + { + std::lock_guard lock(f._mutex); + f._topologies.push(t); + if(f._topologies.size() == 1) { + _set_up_topology(_this_worker(), t.get()); + } + } + + return future; +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred, C&& c) { + + std::list::iterator itr; + + { + std::scoped_lock lock(_taskflows_mutex); + itr = _taskflows.emplace(_taskflows.end(), std::move(f)); + itr->_satellite = itr; + } + + return run_until(*itr, std::forward
<P>
(pred), std::forward(c)); +} + +// Function: corun +template +void Executor::corun(T& target) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun must be called by a worker of the executor"); + } + + Node parent; // dummy parent + _consume_graph(*w, &parent, target.graph()); +} + +// Function: corun_until +template +void Executor::corun_until(P&& predicate) { + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun_until must be called by a worker of the executor"); + } + + _corun_until(*w, std::forward
<P>
(predicate)); +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { + std::lock_guard lock(_topology_mutex); + ++_num_topologies; +} + +// Procedure: _decrement_topology_and_notify +inline void Executor::_decrement_topology_and_notify() { + std::lock_guard lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { + std::lock_guard lock(_topology_mutex); + --_num_topologies; +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { + std::unique_lock lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + // scan each node in the graph and build up the links for(auto node : tpg->_taskflow._graph._nodes) { - + node->_topology = tpg; - node->_clear_state(); + node->_parent = nullptr; + node->_state.store(0, std::memory_order_relaxed); if(node->num_dependents() == 0) { tpg->_sources.push_back(node); @@ -1024,12 +2063,18 @@ inline void Executor::_set_up_topology(Topology* tpg) { node->_set_up_join_counter(); } - tpg->_join_counter = tpg->_sources.size(); - _schedule(tpg->_sources); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } } // Function: _tear_down_topology -inline void Executor::_tear_down_topology(Topology* tpg) { +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { auto &f = tpg->_taskflow; @@ -1037,130 +2082,71 @@ inline void Executor::_tear_down_topology(Topology* tpg) { // case 1: we still need to run the topology again if(!tpg->_is_cancelled && !tpg->_pred()) { - assert(tpg->_join_counter == 0); - tpg->_join_counter = tpg->_sources.size(); - _schedule(tpg->_sources); + //assert(tpg->_join_counter == 0); + std::lock_guard lock(f._mutex); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + _schedule(worker, tpg->_sources); } // case 2: the final run of this topology else { - // TODO: if the topology is cancelled, need to release all constraints - + // TODO: if the topology is cancelled, need to release all semaphores if(tpg->_call != nullptr) { tpg->_call(); } - f._mtx.lock(); - // If there is another run (interleave between lock) - if(f._topologies.size() > 1) { - - assert(tpg->_join_counter == 0); + if(std::unique_lock lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); // Set the promise tpg->_promise.set_value(); f._topologies.pop(); tpg = f._topologies.front().get(); - f._mtx.unlock(); - // decrement the topology but since this is not the last we don't notify _decrement_topology(); - _set_up_topology(tpg); + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); } else { - assert(f._topologies.size() == 1); + //assert(f._topologies.size() == 1); - // Need to back up the promise first here becuz taskflow might be + // Need to back up the promise first here becuz taskflow might be // destroy soon after calling get auto p {std::move(tpg->_promise)}; - // Back up lambda capture in case it has the topology pointer, - // to avoid it releasing on pop_front ahead of _mtx.unlock & + // Back up lambda 
capture in case it has the topology pointer, + // to avoid it releasing on pop_front ahead of _mutex.unlock & // _promise.set_value. Released safely when leaving scope. - auto c { std::move( tpg->_call ) }; + auto c {std::move(tpg->_call)}; + // Get the satellite if any + auto s {f._satellite}; + + // Now we remove the topology from this taskflow f._topologies.pop(); - f._mtx.unlock(); + //f._mutex.unlock(); + lock.unlock(); - // We set the promise in the end in case taskflow leaves before taskflow + // We set the promise in the end in case taskflow leaves the scope. + // After set_value, the caller will return from wait p.set_value(); _decrement_topology_and_notify(); - } - } -} - -// Function: run_until -template -tf::Future Executor::run_until(Taskflow& f, P&& pred, C&& c) { - - _increment_topology(); - - // Special case of predicate - if(f.empty() || pred()) { - std::promise promise; - promise.set_value(); - _decrement_topology_and_notify(); - return tf::Future(promise.get_future(), std::monostate{}); - } - - // Multi-threaded execution. - bool run_now {false}; - - // create a topology for this run - auto tpg = std::make_shared( - f, std::forward
<P>
(pred), std::forward(c) - ); - - // need to create future before the topology got torn down quickly - tf::Future future(tpg->_promise.get_future(), tpg); - { - std::lock_guard lock(f._mtx); - - f._topologies.push(tpg); - - if(f._topologies.size() == 1) { - run_now = true; + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(s) { + std::scoped_lock lock(_taskflows_mutex); + _taskflows.erase(*s); + } } } - - // Notice here calling schedule may cause the topology to be removed sonner - // before the function leaves. - if(run_now) { - _set_up_topology(tpg.get()); - } - - return future; -} - -// Procedure: _increment_topology -inline void Executor::_increment_topology() { - std::lock_guard lock(_topology_mutex); - ++_num_topologies; -} - -// Procedure: _decrement_topology_and_notify -inline void Executor::_decrement_topology_and_notify() { - std::lock_guard lock(_topology_mutex); - if(--_num_topologies == 0) { - _topology_cv.notify_all(); - } -} - -// Procedure: _decrement_topology -inline void Executor::_decrement_topology() { - std::lock_guard lock(_topology_mutex); - --_num_topologies; -} - -// Procedure: wait_for_all -inline void Executor::wait_for_all() { - std::unique_lock lock(_topology_mutex); - _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); } // ############################################################################ @@ -1169,86 +2155,159 @@ inline void Executor::wait_for_all() { inline void Subflow::join() { + // assert(this_worker().worker == &_worker); + if(!_joinable) { TF_THROW("subflow not joinable"); } - _executor._invoke_dynamic_task_external(_parent, _graph, false); + // only the parent worker can join the subflow + _executor._consume_graph(_worker, _parent, _graph); _joinable = false; } inline void Subflow::detach() { + // assert(this_worker().worker == &_worker); + if(!_joinable) { TF_THROW("subflow already joined or detached"); } - _executor._invoke_dynamic_task_external(_parent, _graph, true); + // only the parent worker can detach the subflow + _executor._detach_dynamic_task(_worker, _parent, _graph); _joinable = false; } -// Function: async -template -auto Subflow::async(F&& f, ArgsT&&... args) { +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} - _parent->_join_counter.fetch_add(1); +// Procedure: corun +template +void Runtime::corun(T&& target) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + target(sf); + if(sf._joinable) { + _executor._consume_graph(_worker, _parent, graph); + } + } + // a composable graph object with `tf::Graph& T::graph()` defined + else { + _executor._consume_graph(_worker, _parent, target.graph()); + } +} - //using T = typename function_traits::return_type; - using T = std::invoke_result_t; - using R = std::conditional_t, void, std::optional>; +// Procedure: corun_until +template +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward
<P>
(predicate)); +} - std::promise p; +// Function: _silent_async +template +void Runtime::_silent_async(Worker& w, const std::string& name, F&& f) { - auto tpg = std::make_shared(); + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); - Future fu(p.get_future(), tpg); + auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, + std::in_place_type_t{}, std::forward(f) + ); + + _executor._schedule(w, node); +} + +// Function: silent_async +template +void Runtime::silent_async(F&& f) { + _silent_async(*_executor._this_worker(), "", std::forward(f)); +} + +// Function: silent_async +template +void Runtime::silent_async(const std::string& name, F&& f) { + _silent_async(*_executor._this_worker(), name, std::forward(f)); +} + +// Function: silent_async_unchecked +template +void Runtime::silent_async_unchecked(const std::string& name, F&& f) { + _silent_async(_worker, name, std::forward(f)); +} + +// Function: _async +template +auto Runtime::_async(Worker& w, const std::string& name, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, std::in_place_type_t{}, - [p=make_moc(std::move(p)), f=std::forward(f), args...] - (bool cancel) mutable { + [p=make_moc(std::move(p)), f=std::forward(f)] () mutable { if constexpr(std::is_same_v) { - if(!cancel) { - f(args...); - } + f(); p.object.set_value(); } else { - p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + p.object.set_value(f()); } - }, - std::move(tpg) + } ); - node->_topology = _parent->_topology; - node->_parent = _parent; - - _executor._schedule(node); + _executor._schedule(w, node); return fu; } -// Function: silent_async -template -void Subflow::silent_async(F&& f, ArgsT&&... args) { +// Function: async +template +auto Runtime::async(F&& f) { + return _async(*_executor._this_worker(), "", std::forward(f)); +} - _parent->_join_counter.fetch_add(1); +// Function: async +template +auto Runtime::async(const std::string& name, F&& f) { + return _async(*_executor._this_worker(), name, std::forward(f)); +} - auto node = node_pool.animate( - std::in_place_type_t{}, - [f=std::forward(f), args...] () mutable { - f(args...); - } - ); +// Function: join +inline void Runtime::join() { + corun_until([this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); +} - node->_topology = _parent->_topology; - node->_parent = _parent; +} // end of namespace tf ----------------------------------------------------- - _executor._schedule(node); -} -} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/core/flow_builder.hpp b/lib/taskflow/core/flow_builder.hpp index a022138..3e90d8e 100644 --- a/lib/taskflow/core/flow_builder.hpp +++ b/lib/taskflow/core/flow_builder.hpp @@ -1,18 +1,22 @@ #pragma once #include "task.hpp" +#include "../algorithm/partitioner.hpp" -/** +/** @file flow_builder.hpp @brief flow builder include file */ namespace tf { -/** +/** @class FlowBuilder -@brief building methods of a task dependency graph +@brief class to build a task dependency graph + +The class provides essential methods to construct a task dependency graph +from which tf::Taskflow and tf::Subflow are derived. 
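
For example, since tf::Taskflow derives from tf::FlowBuilder, these methods can be
called directly on a taskflow object; the short sketch below builds a diamond
dependency graph out of four static tasks:

@code{.cpp}
tf::Taskflow taskflow;
auto [A, B, C, D] = taskflow.emplace(
  [] () { std::cout << "A"; },
  [] () { std::cout << "B"; },
  [] () { std::cout << "C"; },
  [] () { std::cout << "D"; }
);
A.precede(B, C);  // A runs before B and C
D.succeed(B, C);  // D runs after B and C
@endcode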
*/ class FlowBuilder { @@ -20,773 +24,1069 @@ class FlowBuilder { friend class Executor; public: - - /** - @brief creates a static task - - @tparam C callable type constructible from std::function - @param callable callable to construct a static task + /** + @brief constructs a flow builder with a graph + */ + FlowBuilder(Graph& graph); - @return a tf::Task handle + /** + @brief creates a static task - The following example creates a static task. + @tparam C callable type constructible from std::function - @code{.cpp} - tf::Task static_task = taskflow.emplace([](){}); - @endcode - - Please refer to @ref StaticTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a dynamic task - - @tparam C callable type constructible from std::function + @param callable callable to construct a static task - @param callable callable to construct a dynamic task + @return a tf::Task handle - @return a tf::Task handle - - The following example creates a dynamic task (tf::Subflow) - that spawns two static tasks. + The following example creates a static task. - @code{.cpp} - tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ - tf::Task static_task1 = sf.emplace([](){}); - tf::Task static_task2 = sf.emplace([](){}); - }); - @endcode - - Please refer to @ref DynamicTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a condition task - - @tparam C callable type constructible from std::function + @code{.cpp} + tf::Task static_task = taskflow.emplace([](){}); + @endcode - @param callable callable to construct a condition task + Please refer to @ref StaticTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - @return a tf::Task handle - - The following example creates an if-else block using one condition task - and three static tasks. - - @code{.cpp} - tf::Taskflow taskflow; - - auto [init, cond, yes, no] = taskflow.emplace( - [] () { }, - [] () { return 0; }, - [] () { std::cout << "yes\n"; }, - [] () { std::cout << "no\n"; } - ); - - // executes yes if cond returns 0, or no if cond returns 1 - cond.precede(yes, no); - cond.succeed(init); - @endcode + /** + @brief creates a dynamic task - Please refer to @ref ConditionalTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); + @tparam C callable type constructible from std::function - /** - @brief creates multiple tasks from a list of callable objects - - @tparam C callable types + @param callable callable to construct a dynamic task - @param callables one or multiple callable objects constructible from each task category + @return a tf::Task handle - @return a tf::Task handle + The following example creates a dynamic task (tf::Subflow) + that spawns two static tasks. - The method returns a tuple of tasks each corresponding to the given - callable target. You can use structured binding to get the return tasks - one by one. - The following example creates four static tasks and assign them to - @c A, @c B, @c C, and @c D using structured binding. 
+ @code{.cpp} + tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ + tf::Task static_task1 = sf.emplace([](){}); + tf::Task static_task2 = sf.emplace([](){}); + }); + @endcode - @code{.cpp} - auto [A, B, C, D] = taskflow.emplace( - [] () { std::cout << "A"; }, - [] () { std::cout << "B"; }, - [] () { std::cout << "C"; }, - [] () { std::cout << "D"; } - ); - @endcode - */ - template 1), void>* = nullptr> - auto emplace(C&&... callables); + Please refer to @ref DynamicTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - /** - @brief creates a module task from a taskflow + /** + @brief creates a condition task - @param taskflow a taskflow object for the module + @tparam C callable type constructible from std::function - @return a tf::Task handle + @param callable callable to construct a condition task - Please refer to @ref ComposableTasking for details. - */ - Task composed_of(Taskflow& taskflow); + @return a tf::Task handle - /** - @brief creates a placeholder task + The following example creates an if-else block using one condition task + and three static tasks. - @return a tf::Task handle + @code{.cpp} + tf::Taskflow taskflow; - A placeholder task maps to a node in the taskflow graph, but - it does not have any callable work assigned yet. - A placeholder task is different from an empty task handle that - does not point to any node in a graph. + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); - @code{.cpp} - // create a placeholder task with no callable target assigned - tf::Task placeholder = taskflow.placeholder(); - assert(placeholder.empty() == false && placeholder.has_work() == false); - - // create an empty task handle - tf::Task task; - assert(task.empty() == true); - - // assign the task handle to the placeholder task - task = placeholder; - assert(task.empty() == false && task.has_work() == false); - @endcode - */ - Task placeholder(); + // executes yes if cond returns 0, or no if cond returns 1 + cond.precede(yes, no); + cond.succeed(init); + @endcode - /** - @brief creates a %cudaFlow task on the caller's GPU device context + Please refer to @ref ConditionalTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - @tparam C callable type constructible from @c std::function + /** + @brief creates a multi-condition task - @return a tf::Task handle + @tparam C callable type constructible from + std::function()> - This method is equivalent to calling tf::FlowBuilder::emplace_on(callable, d) - where @c d is the caller's device context. - The following example creates a %cudaFlow of two kernel tasks, @c task1 and - @c task2, where @c task1 runs before @c task2. - - @code{.cpp} - taskflow.emplace([&](tf::cudaFlow& cf){ - // create two kernel tasks - tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); - tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); + @param callable callable to construct a multi-condition task - // kernel1 runs before kernel2 - task1.precede(task2); - }); - @endcode + @return a tf::Task handle - Please refer to @ref GPUTaskingcudaFlow and @ref GPUTaskingcudaFlowCapturer - for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a %cudaFlow task on the given device + The following example creates a multi-condition task that selectively + jumps to two successor tasks. 
- @tparam C callable type constructible from std::function - @tparam D device type, either @c int or @c std::ref (stateful) + @code{.cpp} + tf::Taskflow taskflow; - @return a tf::Task handle - - The following example creates a %cudaFlow of two kernel tasks, @c task1 and - @c task2 on GPU @c 2, where @c task1 runs before @c task2 - - @code{.cpp} - taskflow.emplace_on([&](tf::cudaFlow& cf){ - // create two kernel tasks - tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); - tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); - - // kernel1 runs before kernel2 - task1.precede(task2); - }, 2); - @endcode - */ - template , void>* = nullptr - > - Task emplace_on(C&& callable, D&& device); + auto [init, cond, branch1, branch2, branch3] = taskflow.emplace( + [] () { }, + [] () { return tf::SmallVector{0, 2}; }, + [] () { std::cout << "branch1\n"; }, + [] () { std::cout << "branch2\n"; }, + [] () { std::cout << "branch3\n"; } + ); - /** - @brief adds adjacent dependency links to a linear list of tasks + // executes branch1 and branch3 when cond returns 0 and 2 + cond.precede(branch1, branch2, branch3); + cond.succeed(init); + @endcode - @param tasks a vector of tasks - */ - void linearize(std::vector& tasks); + Please refer to @ref ConditionalTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - /** - @brief adds adjacent dependency links to a linear list of tasks + /** + @brief creates multiple tasks from a list of callable objects - @param tasks an initializer list of tasks - */ - void linearize(std::initializer_list tasks); + @tparam C callable types - // ------------------------------------------------------------------------ - // parallel iterations - // ------------------------------------------------------------------------ - - /** - @brief constructs a STL-styled parallel-for task - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type + @param callables one or multiple callable objects constructible from each task category - @param first iterator to the beginning (inclusive) - @param last iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator + @return a tf::Task handle - @return a tf::Task handle + The method returns a tuple of tasks each corresponding to the given + callable target. You can use structured binding to get the return tasks + one by one. + The following example creates four static tasks and assign them to + @c A, @c B, @c C, and @c D using structured binding. - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [first, last). By default, we employ the guided partition algorithm with chunk size equal to one. - This method is equivalent to the parallel execution of the following loop: - - @code{.cpp} - for(auto itr=first; itr!=last; itr++) { - callable(*itr); + @code{.cpp} + auto [A, B, C, D] = taskflow.emplace( + [] () { std::cout << "A"; }, + [] () { std::cout << "B"; }, + [] () { std::cout << "C"; }, + [] () { std::cout << "D"; } + ); + @endcode + */ + template 1), void>* = nullptr> + auto emplace(C&&... callables); + + /** + @brief removes a task from a taskflow + + @param task task to remove + + Removes a task and its input and output dependencies from the graph + associated with the flow builder. + If the task does not belong to the graph, nothing will happen. 
+ + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + A.precede(B, C, D); + + // erase A from the taskflow and its dependencies to B, C, and D + taskflow.erase(A); + @endcode + */ + void erase(Task task); + + /** + @brief creates a module task for the target object + + @tparam T target object type + @param object a custom object that defines the method @c T::graph() + + @return a tf::Task handle + + The example below demonstrates a taskflow composition using + the @c composed_of method. + + @code{.cpp} + tf::Taskflow t1, t2; + t1.emplace([](){ std::cout << "t1"; }); + + // t2 is partially composed of t1 + tf::Task comp = t2.composed_of(t1); + tf::Task init = t2.emplace([](){ std::cout << "t2"; }); + init.precede(comp); + @endcode + + The taskflow object @c t2 is composed of another taskflow object @c t1, + preceded by another static task @c init. + When taskflow @c t2 is submitted to an executor, + @c init will run first and then @c comp which spwans its definition + in taskflow @c t1. + + The target @c object being composed must define the method + T::graph() that returns a reference to a graph object of + type tf::Graph such that it can interact with the executor. + For example: + + @code{.cpp} + // custom struct + struct MyObj { + tf::Graph graph; + MyObj() { + tf::FlowBuilder builder(graph); + tf::Task task = builder.emplace([](){ + std::cout << "a task\n"; // static task + }); } - @endcode - - Arguments templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of - the dereferenced iterator type. + Graph& graph() { return graph; } + }; + + MyObj obj; + tf::Task comp = taskflow.composed_of(obj); + @endcode + + Please refer to @ref ComposableTasking for details. + */ + template + Task composed_of(T& object); + + /** + @brief creates a placeholder task + + @return a tf::Task handle + + A placeholder task maps to a node in the taskflow graph, but + it does not have any callable work assigned yet. + A placeholder task is different from an empty task handle that + does not point to any node in a graph. + + @code{.cpp} + // create a placeholder task with no callable target assigned + tf::Task placeholder = taskflow.placeholder(); + assert(placeholder.empty() == false && placeholder.has_work() == false); + + // create an empty task handle + tf::Task task; + assert(task.empty() == true); + + // assign the task handle to the placeholder task + task = placeholder; + assert(task.empty() == false && task.has_work() == false); + @endcode + */ + Task placeholder(); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks a vector of tasks + + This member function creates linear dependencies over a vector of tasks. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + std::vector tasks {A, B, C, D} + taskflow.linearize(tasks); // A->B->C->D + @endcode + + */ + void linearize(std::vector& tasks); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks an initializer list of tasks + + This member function creates linear dependencies over a list of tasks. 
+ + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + taskflow.linearize({A, B, C, D}); // A->B->C->D + @endcode + */ + void linearize(std::initializer_list tasks); + + // ------------------------------------------------------------------------ + // parallel iterations + // ------------------------------------------------------------------------ + + /** + @brief constructs an STL-styled parallel-for task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable callable object to apply to the dereferenced iterator + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each object + obtained by dereferencing every iterator in the range [first, last). + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + callable(*itr); + } + @endcode - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each(B&& first, E&& last, C&& callable); - - /** - @brief constructs a STL-styled parallel-for task using the guided partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_guided(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); - - /** - @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. 
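In contrast to the removed per-algorithm variants below, the 3.6 `for_each` above selects the partitioning strategy through its last argument. A hedged usage sketch (tf::StaticPartitioner ships with 3.6; the chunk size of 64 is arbitrary):

@code{.cpp}
std::vector<int> data(1024, 1);
tf::Taskflow taskflow;
tf::Executor executor;

// double every element, scheduling chunks of 64 iterations per task
taskflow.for_each(
  data.begin(), data.end(),
  [] (int& v) { v *= 2; },
  tf::StaticPartitioner(64)
);
executor.run(taskflow).wait();
@endcode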
- */ - template - Task for_each_dynamic(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); - - /** - @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_static( - B&& beg, E&& end, C&& callable, H&& chunk_size = 0 - ); - - /** - @brief constructs an index-based parallel-for task - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - - @param first index of the beginning (inclusive) - @param last index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [first, last) with the step size. By default, we employ the guided partition algorithm with chunk size equal to one. - - This method is equivalent to the parallel execution of the following loop: - - @code{.cpp} - // case 1: step size is positive - for(auto i=first; ilast; i+=step) { - callable(i); - } - @endcode + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each(B first, E last, C callable, P&& part = P()); + + /** + @brief constructs an STL-styled index-based parallel-for task + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first index of the beginning (inclusive) + @param last index of the end (exclusive) + @param step step size + @param callable callable object to apply to each valid index + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each index + in the range [first, last) with the step size. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + // case 1: step size is positive + for(auto i=first; i - Task for_each_index(B&& first, E&& last, S&& step, C&& callable); - - /** - @brief constructs an index-based parallel-for task using the guided partition algorithm. 
- - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 1) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_index_guided( - B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 - ); - - /** - @brief constructs an index-based parallel-for task using the dynamic partition algorithm. - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 1) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_index_dynamic( - B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 - ); - - /** - @brief constructs an index-based parallel-for task using the static partition algorithm. - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 0) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. 
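The index-based loop follows the same pattern; a minimal sketch of the 3.6 partitioner-based tf::FlowBuilder::for_each_index (the default tf::GuidedPartitioner is spelled out here for clarity):

@code{.cpp}
std::vector<double> buf(100);
tf::Taskflow taskflow;

// buf[i] = i/2.0 for i = 0, 2, 4, ..., 98
taskflow.for_each_index(0, 100, 2,
  [&] (int i) { buf[i] = i / 2.0; },
  tf::GuidedPartitioner()
);
@endcode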
- */
- template <typename B, typename E, typename S, typename C, typename H = size_t>
- Task for_each_index_static(
-   B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 0
- );
+ // case 2: step size is negative
+ for(auto i=first; i>last; i+=step) {
+   callable(i);
+ }
+ @endcode
- // ------------------------------------------------------------------------
- // reduction
- // ------------------------------------------------------------------------
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take a single argument of the integral index type.
- /**
- @brief constructs an STL-styled parallel-reduce task
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm.
-
- This method is equivalent to the parallel execution of the following loop:
-
- @code{.cpp}
- for(auto itr=first; itr!=last; itr++) {
-   init = bop(init, *itr);
- }
- @endcode
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ Please refer to @ref ParallelIterations for details.
+ */
+ template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner>
+ Task for_each_index(
+   B first, E last, S step, C callable, P&& part = P()
+ );
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename O>
- Task reduce(B&& first, E&& last, T& init, O&& bop);
+ // ------------------------------------------------------------------------
+ // transform
+ // ------------------------------------------------------------------------
- /**
- @brief constructs an STL-styled parallel-reduce task using the guided partition algorithm
+ /**
+ @brief constructs a parallel-transform task
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ @tparam B beginning input iterator type
+ @tparam E ending input iterator type
+ @tparam O output iterator type
+ @tparam C callable type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
-
- @return a tf::Task handle
+ @param first1 iterator to the beginning of the first range
+ @param last1 iterator to the end of the first range
+ @param d_first iterator to the beginning of the output range
+ @param c a unary callable to apply to dereferenced input elements
+ @param part partitioning algorithm to schedule parallel iterations
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
+ @return a tf::Task handle
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ The task spawns asynchronous tasks that apply the callable object to an
+ input range and stores the result in another output range.
+ This method is equivalent to the parallel execution of the following loop:
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_guided(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1
- );
-
- /**
- @brief constructs an STL-styled parallel-reduce task using the dynamic partition algorithm
+ @code{.cpp}
+ while (first1 != last1) {
+   *d_first++ = c(*first1++);
+ }
+ @endcode
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take a single argument of the dereferenced
+ iterator type.
+
+ Please refer to @ref ParallelTransforms for details.
+ */
+ template <
+   typename B, typename E, typename O, typename C, typename P = GuidedPartitioner
+ >
+ Task transform(B first1, E last1, O d_first, C c, P&& part = P());
+
+ /**
+ @brief constructs a parallel-transform task
+
+ @tparam B1 beginning input iterator type for the first input range
+ @tparam E1 ending input iterator type for the first input range
+ @tparam B2 beginning input iterator type for the second input range
+ @tparam O output iterator type
+ @tparam C callable type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first1 iterator to the beginning of the first input range
+ @param last1 iterator to the end of the first input range
+ @param first2 iterator to the beginning of the second input range
+ @param d_first iterator to the beginning of the output range
+ @param c a binary operator to apply to dereferenced input elements
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks that apply the callable object to two
+ input ranges and stores the result in another output range.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ while (first1 != last1) {
+   *d_first++ = c(*first1++, *first2++);
+ }
+ @endcode
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take two arguments of dereferenced elements
+ from the two input ranges.
+
+ Please refer to @ref ParallelTransforms for details.
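+ (For illustration, a minimal sketch exercising the binary transform
+ overload, assuming a tf::Executor and tf::Taskflow as in the surrounding
+ examples.)
+
+ @code{.cpp}
+ tf::Executor executor;
+ tf::Taskflow taskflow;
+ std::vector<int> in1{1, 2, 3}, in2{4, 5, 6}, out(3);
+
+ // binary transform: out[i] = in1[i] + in2[i]
+ taskflow.transform(
+   in1.begin(), in1.end(), in2.begin(), out.begin(),
+   [](int a, int b){ return a + b; }
+ );
+ executor.run(taskflow).wait();  // out == {5, 7, 9}
+ @endcode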
+ */
+ template <
+   typename B1, typename E1, typename B2, typename O, typename C, typename P = GuidedPartitioner,
+   std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr
+ >
+ Task transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // reduction
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs an STL-styled parallel-reduce task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T result type
+ @tparam O binary reducer type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param init initial value of the reduction and the storage for the reduced result
+ @param bop binary operator that will be applied
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks to perform parallel reduction over @c init
+ and the elements in the range [first, last).
+ The reduced result is stored in @c init.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ for(auto itr=first; itr!=last; itr++) {
+   init = bop(init, *itr);
+ }
+ @endcode
- @return a tf::Task handle
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ Please refer to @ref ParallelReduction for details.
+ */
+ template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner>
+ Task reduce(B first, E last, T& init, O bop, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // transform and reduction
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs an STL-styled parallel transform-reduce task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T result type
+ @tparam BOP binary reducer type
+ @tparam UOP unary transformation type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param init initial value of the reduction and the storage for the reduced result
+ @param bop binary operator that will be applied in unspecified order to the results of @c uop
+ @param uop unary operator that will be applied to transform each element in the range to the result type
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks to perform parallel reduction over @c init and
+ the transformed elements in the range [first, last).
+ The reduced result is stored in @c init.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ for(auto itr=first; itr!=last; itr++) {
+   init = bop(init, uop(*itr));
+ }
+ @endcode
- Please refer to @ref ParallelReduction for details.
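+ (For illustration, a minimal transform-reduce sketch computing a sum of
+ squares, assuming a taskflow and executor as in the examples above.)
+
+ @code{.cpp}
+ std::vector<int> v{1, 2, 3, 4, 5};
+ int sum_of_squares = 0;
+ taskflow.transform_reduce(
+   v.begin(), v.end(), sum_of_squares,
+   std::plus<int>{},            // bop: combines partial results
+   [](int x){ return x * x; }   // uop: transforms each element first
+ );
+ executor.run(taskflow).wait(); // sum_of_squares == 55
+ @endcode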
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_dynamic(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1
- );
-
- /**
- @brief constructs an STL-styled parallel-reduce task using the static partition algorithm
+ Iterators are templated to enable stateful range using std::reference_wrapper.
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ Please refer to @ref ParallelReduction for details.
+ */
+ template <
+   typename B, typename E, typename T, typename BOP, typename UOP, typename P = GuidedPartitioner
+ >
+ Task transform_reduce(B first, E last, T& init, BOP bop, UOP uop, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // scan
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief creates an STL-styled parallel inclusive-scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements using the given binary operator
+ for summation.
+
+ This function generates an @em inclusive scan, meaning that the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+
+ // input is {1, 3, 6, 10, 15}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename BOP>
+ Task inclusive_scan(B first, E last, D d_first, BOP bop);
+
+ /**
+ @brief creates an STL-styled parallel inclusive-scan task with an initial value
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param init initial value
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements (and the initial value)
+ using the given binary operator for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{}, -1
+ );
+ executor.run(taskflow).wait();
+
+ // input is {0, 2, 5, 9, 14}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
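+ (For illustration: the scans above run in place, but @c d_first may also
+ point to a separate output range, leaving the input unchanged. A sketch,
+ assuming a taskflow and executor as above.)
+
+ @code{.cpp}
+ std::vector<int> in{1, 2, 3, 4, 5}, out(5);
+ taskflow.inclusive_scan(
+   in.begin(), in.end(), out.begin(), std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+ // in is unchanged; out == {1, 3, 6, 10, 15}
+ @endcode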
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
+ */
+ template <typename B, typename E, typename D, typename BOP, typename T>
+ Task inclusive_scan(B first, E last, D d_first, BOP bop, T init);
+
+ /**
+ @brief creates an STL-styled parallel exclusive-scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam T initial value type
+ @tparam BOP summation operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param init initial value
+ @param bop function to perform summation
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements (and the initial value)
+ using the given binary operator for summation.
+
+ This function generates an @em exclusive scan, meaning the N-th element
+ of the output range is the sum of the first N-1 input elements,
+ so the N-th input element is not included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.exclusive_scan(
+   input.begin(), input.end(), input.begin(), -1, std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, 0, 2, 5, 9}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename T, typename BOP>
+ Task exclusive_scan(B first, E last, D d_first, T init, BOP bop);
+
+ // ------------------------------------------------------------------------
+ // transform scan
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief creates an STL-styled parallel transform-inclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{},
+   [] (int item) { return -item; }
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, -3, -6, -10, -15}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
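+ (For illustration, a side-by-side sketch of the two scan flavors over the
+ same input, assuming a taskflow and executor as above; the two tasks are
+ independent because they write to disjoint output ranges.)
+
+ @code{.cpp}
+ std::vector<int> in{1, 2, 3, 4, 5}, inc(5), exc(5);
+ taskflow.inclusive_scan(in.begin(), in.end(), inc.begin(), std::plus<int>{});
+ taskflow.exclusive_scan(in.begin(), in.end(), exc.begin(), 0, std::plus<int>{});
+ executor.run(taskflow).wait();
+ // inc == {1, 3, 6, 10, 15}  (N-th output includes the N-th input)
+ // exc == {0, 1, 3, 6, 10}   (N-th output excludes the N-th input)
+ @endcode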
+ */
+ template <typename B, typename E, typename D, typename BOP, typename UOP>
+ Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop);
+
+ /**
+ @brief creates an STL-styled parallel transform-inclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+ @param init initial value
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements (including an initial value)
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{},
+   [] (int item) { return -item; },
+   -1
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-2, -4, -7, -11, -16}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename BOP, typename UOP, typename T>
+ Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init);
+
+ /**
+ @brief creates an STL-styled parallel transform-exclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+ @param init initial value
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements (including an initial value)
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em exclusive scan, meaning the N-th element
+ of the output range is the sum of the first N-1 input elements,
+ so the N-th input element is not included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_exclusive_scan(
+   input.begin(), input.end(), input.begin(), -1, std::plus<int>{},
+   [](int item) { return -item; }
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, -2, -4, -7, -11}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
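+ (For illustration, a transform-scan is equivalent to transforming first
+ and scanning second, but in one pass; a sketch assuming a taskflow and
+ executor as above.)
+
+ @code{.cpp}
+ std::vector<int> data{1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   data.begin(), data.end(), data.begin(),
+   std::plus<int>{}, [](int x){ return 2 * x; }
+ );
+ executor.run(taskflow).wait();
+ // transformed input {2, 4, 6, 8, 10} scanned to {2, 6, 12, 20, 30}
+ @endcode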
+ */
+ template <typename B, typename E, typename D, typename T, typename BOP, typename UOP>
+ Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop);
+
+ // ------------------------------------------------------------------------
+ // find
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs a task to perform STL-styled find-if algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam UOP unary predicate type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param predicate unary predicate which returns @c true for the required element
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Returns an iterator to the first element in the range [first, last)
+ that satisfies the given criteria (or last if there is no such iterator).
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ auto find_if(InputIt first, InputIt last, UnaryPredicate predicate) {
+   for (; first != last; ++first) {
+     if (predicate(*first)){
+       return first;
+     }
+   }
+   return last;
+ }
+ @endcode
+
+ For example, the code below finds the element that satisfies the given
+ criteria (value plus one is equal to 23) from an input range of 10 elements:
+
+ @code{.cpp}
+ std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11};
+ std::vector<int>::iterator result;
+ taskflow.find_if(
+   input.begin(), input.end(), result, [](int i){ return i+1 == 23; }
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 22);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+ Task find_if(B first, E last, T& result, UOP predicate, P&& part = P());
+
+ /**
+ @brief constructs a task to perform STL-styled find-if-not algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam UOP unary predicate type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param predicate unary predicate which returns @c false for the required element
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Returns an iterator to the first element in the range [first, last)
+ that satisfies the given criteria (or last if there is no such iterator).
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ auto find_if_not(InputIt first, InputIt last, UnaryPredicate predicate) {
+   for (; first != last; ++first) {
+     if (!predicate(*first)){
+       return first;
+     }
+   }
+   return last;
+ }
+ @endcode
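+ (For illustration, a complete find-if sketch passing an explicit
+ partitioner; the partitioner argument otherwise defaults to
+ tf::GuidedPartitioner, assuming the partitioner types from partitioner.hpp.)
+
+ @code{.cpp}
+ std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8};
+ std::vector<int>::iterator result;
+ taskflow.find_if(
+   input.begin(), input.end(), result,
+   [](int i){ return i > 20; },
+   tf::StaticPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(result != input.end() && *result == 22);
+ @endcode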
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_static(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 0
- );
-
- // ------------------------------------------------------------------------
- // transform and reduction
- // ------------------------------------------------------------------------
-
- /**
- @brief constructs an STL-styled parallel transform-reduce task
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm.
-
- This method is equivalent to the parallel execution of the following loop:
-
- @code{.cpp}
- for(auto itr=first; itr!=last; itr++) {
-   init = bop(init, uop(*itr));
+ For example, the code below finds the element that satisfies the given
+ criteria (value is not equal to 1) from an input range of 10 elements:
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 22, 1, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.find_if_not(
+   input.begin(), input.end(), result, [](int i){ return i == 1; }
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 22);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+ Task find_if_not(B first, E last, T& result, UOP predicate, P&& part = P());
+
+ /**
+ @brief constructs a task to perform STL-styled min-element algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam C comparator type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param comp comparison function object
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Finds the smallest element in the range [first, last)
+ using the given comparison function object.
+ The iterator to that smallest element is stored in @c result.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ if (first == last) {
+   return last;
+ }
+ auto smallest = first;
+ ++first;
+ for (; first != last; ++first) {
+   if (comp(*first, *smallest)) {
+     smallest = first;
+   }
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP>
- Task transform_reduce(B&& first, E&& last, T& init, BOP&& bop, UOP&& uop);
-
- /**
- @brief constructs an STL-styled parallel transform-reduce task using the guided partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_guided(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1
- );
+ }
+ return smallest;
+ @endcode
- /**
- @brief constructs an STL-styled parallel transform-reduce task using the static partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_static(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 0
- );
+ For example, the code below finds the smallest element from an input
+ range of 10 elements.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 1, -1, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.min_element(
+   input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(*result == -1);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
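+ (For illustration, min_element and max_element can be composed in one
+ taskflow, each writing its own result iterator; the partitioner is passed
+ explicitly here since the declarations in this header take it without a
+ default argument.)
+
+ @code{.cpp}
+ std::vector<int> data{3, 1, 4, 1, 5, 9, 2, 6};
+ std::vector<int>::iterator mn, mx;
+ taskflow.min_element(
+   data.begin(), data.end(), mn, std::less<int>{}, tf::GuidedPartitioner()
+ );
+ taskflow.max_element(
+   data.begin(), data.end(), mx, std::less<int>{}, tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ // *mn == 1, *mx == 9
+ @endcode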
+ */
+ template <typename B, typename E, typename T, typename C, typename P>
+ Task min_element(B first, E last, T& result, C comp, P&& part);
+
+ /**
+ @brief constructs a task to perform STL-styled max-element algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam C comparator type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param comp comparison function object
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Finds the largest element in the range [first, last)
+ using the given comparison function object.
+ The iterator to that largest element is stored in @c result.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ if (first == last){
+   return last;
+ }
+ auto largest = first;
+ ++first;
+ for (; first != last; ++first) {
+   if (comp(*largest, *first)) {
+     largest = first;
+   }
+ }
+ return largest;
+ @endcode
+
+ For example, the code below finds the largest element from an input
+ range of 10 elements.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 1, 2, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.max_element(
+   input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 2);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename C, typename P>
+ Task max_element(B first, E last, T& result, C comp, P&& part);
+
+ // ------------------------------------------------------------------------
+ // sort
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs a dynamic task to perform STL-styled parallel sort
+
+ @tparam B beginning iterator type (random-accessible)
+ @tparam E ending iterator type (random-accessible)
+ @tparam C comparator type
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param cmp comparison operator
+
+ The task spawns asynchronous tasks to sort elements in the range
+ [first, last) in parallel.
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelSort for details.
+ */
+ template <typename B, typename E, typename C>
+ Task sort(B first, E last, C cmp);
+
+ /**
+ @brief constructs a dynamic task to perform STL-styled parallel sort using
+ the @c std::less<T> comparator, where @c T is the element type
+
+ @tparam B beginning iterator type (random-accessible)
+ @tparam E ending iterator type (random-accessible)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+
+ The task spawns asynchronous tasks to sort elements in the range
+ [first, last) in parallel using the @c std::less<T> comparator,
+ where @c T is the dereferenced iterator type.
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelSort for details.
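+ (For illustration, a minimal parallel-sort sketch with a custom
+ comparator, assuming a taskflow and executor as above.)
+
+ @code{.cpp}
+ std::vector<int> data{5, 3, 1, 4, 2};
+ taskflow.sort(
+   data.begin(), data.end(), [](int a, int b){ return a > b; }  // descending
+ );
+ executor.run(taskflow).wait();  // data == {5, 4, 3, 2, 1}
+ @endcode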
+ */
+ template <typename B, typename E>
+ Task sort(B first, E last);

- /**
- @brief constructs an STL-styled parallel transform-reduce task using the dynamic partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_dynamic(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1
- );
-
- // ------------------------------------------------------------------------
- // sort
- // ------------------------------------------------------------------------
-
- /**
- @brief constructs a dynamic task to perform STL-styled parallel sort
-
- @tparam B beginning iterator type (random-accessible)
- @tparam E ending iterator type (random-accessible)
- @tparam C comparator type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param cmp comparison function object
-
- The task spawns a subflow to sort elements in the range
- [first, last) in parallel.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelSort for details.
- */
- template <typename B, typename E, typename C>
- Task sort(B&& first, E&& last, C&& cmp);
-
- /**
- @brief constructs a dynamic task to perform STL-styled parallel sort using
- the @c std::less<T> comparator, where @c T is the element type
-
- @tparam B beginning iterator type (random-accessible)
- @tparam E ending iterator type (random-accessible)
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
-
- The task spawns a subflow to sort elements in the range
- [first, last) in parallel using the @c std::less<T> comparator,
- where @c T is the dereferenced iterator type.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelSort for details.
- */
- template <typename B, typename E>
- Task sort(B&& first, E&& last);

 protected:

- /**
- @brief constructs a flow builder with a graph
- */
- FlowBuilder(Graph& graph);
-
- /**
- @brief associated graph object
- */
- Graph& _graph;
-
+
+ /**
+ @brief associated graph object
+ */
+ Graph& _graph;
+
 private:

- template <typename L>
- void _linearize(L&);
+ template <typename L>
+ void _linearize(L&);
 };

 // Constructor
@@ -797,7 +1097,7 @@ inline FlowBuilder::FlowBuilder(Graph& graph) :
 // Function: emplace
 template <typename C, std::enable_if_t<is_static_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Static>{}, std::forward<C>(c)
   ));
 }

@@ -805,7 +1105,7 @@ Task FlowBuilder::emplace(C&& c) {
 // Function: emplace
 template <typename C, std::enable_if_t<is_dynamic_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Dynamic>{}, std::forward<C>(c)
   ));
 }

@@ -813,28 +1113,63 @@ Task FlowBuilder::emplace(C&& c) {
 // Function: emplace
 template <typename C, std::enable_if_t<is_condition_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Condition>{}, std::forward<C>(c)
   ));
 }

+// Function: emplace
+template <typename C, std::enable_if_t<is_multi_condition_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::MultiCondition>{}, std::forward<C>(c)
+  ));
+}
+
 // Function: emplace
 template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>*>
 auto FlowBuilder::emplace(C&&... cs) {
   return std::make_tuple(emplace(std::forward<C>(cs))...);
 }

-// Function: composed_of
-inline Task FlowBuilder::composed_of(Taskflow& taskflow) {
-  auto node = _graph.emplace_back(
-    std::in_place_type_t<Node::Module>{}, &taskflow
+// Function: erase
+inline void FlowBuilder::erase(Task task) {
+
+  if (!task._node) {
+    return;
+  }
+
+  task.for_each_dependent([&] (Task dependent) {
+    auto& S = dependent._node->_successors;
+    if(auto I = std::find(S.begin(), S.end(), task._node); I != S.end()) {
+      S.erase(I);
+    }
+  });
+
+  task.for_each_successor([&] (Task dependent) {
+    auto& D = dependent._node->_dependents;
+    if(auto I = std::find(D.begin(), D.end(), task._node); I != D.end()) {
+      D.erase(I);
+    }
+  });
+
+  _graph._erase(task._node);
+}
+
+// Function: composed_of
+template <typename T>
+Task FlowBuilder::composed_of(T& object) {
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Module>{}, object
   );
   return Task(node);
 }

 // Function: placeholder
 inline Task FlowBuilder::placeholder() {
-  auto node = _graph.emplace_back();
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Placeholder>{}
+  );
   return Task(node);
 }

@@ -858,7 +1193,7 @@ void FlowBuilder::_linearize(L& keys) {
 // Procedure: linearize
 inline void FlowBuilder::linearize(std::vector<Task>& keys) {
-  _linearize(keys);
+  _linearize(keys);
 }

 // Procedure: linearize
@@ -868,20 +1203,22 @@ inline void FlowBuilder::linearize(std::initializer_list<Task> keys) {

// ----------------------------------------------------------------------------

-/**
+/**
@class Subflow

@brief class to construct a subflow graph from the execution of a dynamic task

-By default, a subflow automatically @em joins its parent node.
-You may explicitly join or detach a subflow by calling tf::Subflow::join
+tf::Subflow is a derived class from tf::Runtime with a specialized mechanism
+to manage the execution of a child graph.
+By default, a subflow automatically @em joins its parent node.
+You may explicitly join or detach a subflow by calling tf::Subflow::join
or tf::Subflow::detach, respectively.
The following example creates a taskflow graph that spawns a subflow from the execution of task @c B, and the subflow contains three tasks, @c B1, @c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. @code{.cpp} -// create three regular tasks +// create three static tasks tf::Task A = taskflow.emplace([](){}).name("A"); tf::Task C = taskflow.emplace([](){}).name("C"); tf::Task D = taskflow.emplace([](){}).name("D"); @@ -894,26 +1231,37 @@ tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { B1.precede(B3); B2.precede(B3); }).name("B"); - -A.precede(B); // B runs after A -A.precede(C); // C runs after A -B.precede(D); // D runs after B -C.precede(D); // D runs after C + +A.precede(B); // B runs after A +A.precede(C); // C runs after A +B.precede(D); // D runs after B +C.precede(D); // D runs after C @endcode -*/ -class Subflow : public FlowBuilder { +*/ +class Subflow : public FlowBuilder, + public Runtime { friend class Executor; friend class FlowBuilder; + friend class Runtime; public: - + /** @brief enables the subflow to join its parent task Performs an immediate action to join the subflow. Once the subflow is joined, it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.join(); // join the subflow of one task + }); + @endcode + + Only the worker that spawns this subflow can join it. */ void join(); @@ -922,80 +1270,83 @@ class Subflow : public FlowBuilder { Performs an immediate action to detach the subflow. Once the subflow is detached, it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.detach(); + }); + @endcode + + Only the worker that spawns this subflow can detach it. */ void detach(); - - /** - @brief queries if the subflow is joinable - When a subflow is joined or detached, it becomes not joinable. - */ - bool joinable() const; + /** + @brief resets the subflow to a joinable state - /** - @brief runs a given function asynchronously + @param clear_graph specifies whether to clear the associated graph (default @c true) - @tparam F callable type - @tparam ArgsT parameter types + Clears the underlying task graph depending on the + given variable @c clear_graph (default @c true) and then + updates the subflow to a joinable state. + */ + void reset(bool clear_graph = true); - @param f callable object to call - @param args parameters to pass to the callable - - @return a tf::Future that will holds the result of the execution + /** + @brief queries if the subflow is joinable - This method is thread-safe and can be called by multiple tasks in the - subflow at the same time. - The difference to tf::Executor::async is that the created asynchronous task - pertains to the subflow. - When the subflow joins, all asynchronous tasks created from the subflow - are guaranteed to finish before the join. - For example: + This member function queries if the subflow is joinable. + When a subflow is joined or detached, it becomes not joinable. @code{.cpp} - std::atomic counter(0); - taskflow.empalce([&](tf::Subflow& sf){ - for(int i=0; i<100; i++) { - sf.async([&](){ counter++; }); - } + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + std::cout << sf.joinable() << '\n'; // true sf.join(); - assert(counter == 100); + std::cout << sf.joinable() << '\n'; // false }); @endcode - - You cannot create asynchronous tasks from a detached subflow. - Doing this results in undefined behavior. 
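+ (For illustration, a sketch of the join guarantee: after tf::Subflow::join
+ returns, every task spawned in the subflow has finished. Assumes an
+ executor and taskflow as above.)
+
+ @code{.cpp}
+ std::atomic<int> counter{0};
+ taskflow.emplace([&](tf::Subflow& sf){
+   sf.emplace([&](){ counter++; });
+   sf.emplace([&](){ counter++; });
+   sf.join();             // returns only after both subflow tasks ran
+   assert(counter == 2);  // guaranteed by the join
+ });
+ executor.run(taskflow).wait();
+ @endcode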
*/ - template - auto async(F&& f, ArgsT&&... args); - - /** - @brief similar to tf::Subflow::async but did not return a future object - */ - template - void silent_async(F&& f, ArgsT&&... args); + bool joinable() const noexcept; private: - - Subflow(Executor&, Node*, Graph&); - - Executor& _executor; - Node* _parent; bool _joinable {true}; + + Subflow(Executor&, Worker&, Node*, Graph&); }; // Constructor -inline Subflow::Subflow(Executor& executor, Node* parent, Graph& graph) : +inline Subflow::Subflow( + Executor& executor, Worker& worker, Node* parent, Graph& graph +) : FlowBuilder {graph}, - _executor {executor}, - _parent {parent} { + Runtime {executor, worker, parent} { + // assert(_parent != nullptr); } // Function: joined -inline bool Subflow::joinable() const { +inline bool Subflow::joinable() const noexcept { return _joinable; } +// Procedure: reset +inline void Subflow::reset(bool clear_graph) { + if(clear_graph) { + _graph._clear(); + } + _joinable = true; +} + } // end of namespace tf. --------------------------------------------------- + + + + + + + + diff --git a/lib/taskflow/core/graph.hpp b/lib/taskflow/core/graph.hpp index 06c6dd3..475422d 100644 --- a/lib/taskflow/core/graph.hpp +++ b/lib/taskflow/core/graph.hpp @@ -1,74 +1,495 @@ #pragma once +#include "../utility/traits.hpp" #include "../utility/iterator.hpp" #include "../utility/object_pool.hpp" -#include "../utility/traits.hpp" -#include "../utility/singleton.hpp" #include "../utility/os.hpp" #include "../utility/math.hpp" +#include "../utility/small_vector.hpp" #include "../utility/serializer.hpp" #include "error.hpp" #include "declarations.hpp" #include "semaphore.hpp" #include "environment.hpp" #include "topology.hpp" +#include "tsq.hpp" + +/** +@file graph.hpp +@brief graph include file +*/ namespace tf { // ---------------------------------------------------------------------------- -// Class: CustomGraphBase +// Class: Graph // ---------------------------------------------------------------------------- -class CustomGraphBase { - public: - - virtual void dump(std::ostream&, const void*, const std::string&) const = 0; - virtual ~CustomGraphBase() = default; -}; +/** +@class Graph -// ---------------------------------------------------------------------------- -// Class: Graph -// ---------------------------------------------------------------------------- +@brief class to create a graph object + +A graph is the ultimate storage for a task dependency graph and is the main +gateway to interact with an executor. +A graph manages a set of nodes in a global object pool that animates and +recycles node objects efficiently without going through repetitive and +expensive memory allocations and deallocations. +This class is mainly used for creating an opaque graph object in a custom +class to interact with the executor through taskflow composition. + +A graph object is move-only. 
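+(For illustration, the common way to exercise this is taskflow composition:
+any object exposing a `tf::Graph& graph()` member, such as tf::Taskflow
+itself, can be composed into another taskflow.)
+
+@code{.cpp}
+tf::Taskflow f1, f2;
+f1.emplace([](){ std::cout << "f1\n"; });
+tf::Task module_task = f2.composed_of(f1);  // f1 provides graph()
+f2.emplace([](){ std::cout << "after f1\n"; }).succeed(module_task);
+executor.run(f2).wait();
+@endcode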
+*/
 class Graph {

 friend class Node;
+ friend class FlowBuilder;
+ friend class Subflow;
 friend class Taskflow;
 friend class Executor;

 public:

+ /**
+ @brief constructs a graph object
+ */
 Graph() = default;
+
+ /**
+ @brief disabled copy constructor
+ */
 Graph(const Graph&) = delete;
+
+ /**
+ @brief constructs a graph using move semantics
+ */
 Graph(Graph&&);

+ /**
+ @brief destructs the graph object
+ */
 ~Graph();

+ /**
+ @brief disabled copy assignment operator
+ */
 Graph& operator = (const Graph&) = delete;
+
+ /**
+ @brief assigns a graph using move semantics
+ */
 Graph& operator = (Graph&&);
-
- void clear();
- void clear_detached();
- void merge(Graph&&);

+ /**
+ @brief queries if the graph is empty
+ */
 bool empty() const;

+ /**
+ @brief queries the number of nodes in the graph
+ */
 size_t size() const;
-
- template <typename ...Args>
- Node* emplace_back(Args&& ...);
- Node* emplace_back();

+ /**
+ @brief clears the graph
+ */
+ void clear();

 private:

 std::vector<Node*> _nodes;
+
+ void _clear();
+ void _clear_detached();
+ void _merge(Graph&&);
+ void _erase(Node*);
+
+ /**
+ @private
+ */
+ template <typename ...ArgsT>
+ Node* _emplace_back(ArgsT&&...);
 };

// ----------------------------------------------------------------------------

+/**
+@class Runtime
+
+@brief class to include a runtime object in a task
+
+A runtime object allows users to interact with the
+scheduling runtime inside a task, such as scheduling an active task,
+spawning a subflow, and so on.
+
+@code{.cpp}
+tf::Task A, B, C, D;
+std::tie(A, B, C, D) = taskflow.emplace(
+  [] () { return 0; },
+  [&C] (tf::Runtime& rt) {  // C must be captured by reference
+    std::cout << "B\n";
+    rt.schedule(C);
+  },
+  [] () { std::cout << "C\n"; },
+  [] () { std::cout << "D\n"; }
+);
+A.precede(B, C, D);
+executor.run(taskflow).wait();
+@endcode
+
+A runtime object is associated with the worker and the executor
+that runs the task.
+
+*/
+class Runtime {
+
+  friend class Executor;
+  friend class FlowBuilder;
+
+  public:
+
+  /**
+  @brief obtains the running executor
+
+  The running executor of a runtime task is the executor that runs
+  the parent taskflow of that runtime task.
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([&](tf::Runtime& rt){
+    assert(&(rt.executor()) == &executor);
+  });
+  executor.run(taskflow).wait();
+  @endcode
+  */
+  Executor& executor();
+
+  /**
+  @brief schedules an active task immediately to the worker's queue
+
+  @param task the given active task to schedule immediately
+
+  This member function immediately schedules an active task to the
+  task queue of the associated worker in the runtime task.
+  An active task is a task in a running taskflow.
+  The task may or may not be running, and scheduling that task
+  will immediately put the task into the task queue of the worker
+  that is running the runtime task.
+  Consider the following example:
+
+  @code{.cpp}
+  tf::Task A, B, C, D;
+  std::tie(A, B, C, D) = taskflow.emplace(
+    [] () { return 0; },
+    [&C] (tf::Runtime& rt) {  // C must be captured by reference
+      std::cout << "B\n";
+      rt.schedule(C);
+    },
+    [] () { std::cout << "C\n"; },
+    [] () { std::cout << "D\n"; }
+  );
+  A.precede(B, C, D);
+  executor.run(taskflow).wait();
+  @endcode
+
+  The executor will first run the condition task @c A which returns @c 0
+  to inform the scheduler to go to the runtime task @c B.
+  During the execution of @c B, it directly schedules task @c C without
+  going through the normal taskflow graph scheduling process.
+ At this moment, task @c C is active because its parent taskflow is running.
+ When the taskflow finishes, we will see both @c B and @c C in the output.
+ */
+ void schedule(Task task);
+
+ /**
+ @brief runs the given callable asynchronously
+
+ @tparam F callable type
+ @param f callable object
+
+ The method creates an asynchronous task to launch the given function.
+ The difference to tf::Executor::async is that the created asynchronous task
+ pertains to the runtime.
+ When the runtime joins, all asynchronous tasks created from the runtime
+ are guaranteed to finish before the join returns.
+ For example:
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   auto fu1 = rt.async([&](){ counter++; });
+   auto fu2 = rt.async([&](){ counter++; });
+   fu1.get();
+   fu2.get();
+   assert(counter == 2);
+
+   // spawn 100 asynchronous tasks from the worker of the runtime
+   for(int i=0; i<100; i++) {
+     rt.async([&](){ counter++; });
+   }
+
+   // explicitly join the 100 asynchronous tasks
+   rt.join();
+   assert(counter == 102);
+ });
+ @endcode
+
+ This method is thread-safe and can be called by multiple workers
+ that hold the reference to the runtime.
+ For example, the code below spawns 100 tasks from the worker of
+ a runtime, and each of the 100 tasks spawns another task
+ that will be run by another worker.
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   // worker of the runtime spawns 100 tasks each spawning another task
+   // that will be run by another worker
+   for(int i=0; i<100; i++) {
+     rt.async([&](){
+       counter++;
+       rt.async([&](){ counter++; });
+     });
+   }
+
+   // explicitly join the 200 asynchronous tasks
+   rt.join();
+   assert(counter == 200);
+ });
+ @endcode
+ */
+ template <typename F>
+ auto async(F&& f);
+
+ /**
+ @brief similar to tf::Runtime::async but assigns the task a name
+
+ @tparam F callable type
+
+ @param name assigned name to the task
+ @param f callable
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   auto future = rt.async("my task", [](){});
+   future.get();
+ });
+ @endcode
+ */
+ template <typename F>
+ auto async(const std::string& name, F&& f);
+
+ /**
+ @brief runs the given function asynchronously without returning any future object
+
+ @tparam F callable type
+ @param f callable
+
+ This member function is more efficient than tf::Runtime::async
+ and is the preferred choice when there is no data to return.
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 100);
+ });
+ @endcode
+
+ This member function is thread-safe.
+ */
+ template <typename F>
+ void silent_async(F&& f);
+
+ /**
+ @brief similar to tf::Runtime::silent_async but assigns the task a name
+
+ @tparam F callable type
+ @param name assigned name to the task
+ @param f callable
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   rt.silent_async("my task", [](){});
+   rt.join();
+ });
+ @endcode
+ */
+ template <typename F>
+ void silent_async(const std::string& name, F&& f);
+
+ /**
+ @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime
+
+ @tparam F callable type
+
+ @param name assigned name to the task
+ @param f callable
+
+ The method bypasses the caller-worker check of the executor
+ and thus can only be called by the worker of this runtime.
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   // running by the worker of this runtime
+   rt.silent_async_unchecked("my task", [](){});
+   rt.join();
+ });
+ @endcode
+ */
+ template <typename F>
+ void silent_async_unchecked(const std::string& name, F&& f);
+
+ /**
+ @brief co-runs the given target and waits until it completes
+
+ A target can be one of the following forms:
+ + a dynamic task to spawn a subflow or
+ + a composable graph object with `tf::Graph& T::graph()` defined
+
+ @code{.cpp}
+ // co-run a subflow and wait until all tasks complete
+ taskflow.emplace([](tf::Runtime& rt){
+   rt.corun([](tf::Subflow& sf){
+     tf::Task A = sf.emplace([](){});
+     tf::Task B = sf.emplace([](){});
+   });
+ });
+
+ // co-run a taskflow and wait until all tasks complete
+ tf::Taskflow taskflow1, taskflow2;
+ taskflow1.emplace([](){ std::cout << "running taskflow1\n"; });
+ taskflow2.emplace([&](tf::Runtime& rt){
+   std::cout << "running taskflow2\n";
+   rt.corun(taskflow1);
+ });
+ executor.run(taskflow2).wait();
+ @endcode
+
+ Although tf::Runtime::corun blocks until the operation completes,
+ the caller thread (worker) is not blocked (e.g., sleeping or holding any lock).
+ Instead, the caller thread joins the work-stealing loop of the executor
+ and returns when all tasks in the target complete.
+ */
+ template <typename T>
+ void corun(T&& target);
+
+ /**
+ @brief keeps running the work-stealing loop until the predicate becomes true
+
+ @tparam P predicate type
+ @param predicate a boolean predicate to indicate when to stop the loop
+
+ The method keeps the caller worker running in the work-stealing loop
+ until the stop predicate becomes true.
+ */
+ template <typename P>
+ void corun_until(P&& predicate);
+
+ /**
+ @brief joins all asynchronous tasks spawned by this runtime
+
+ Immediately joins all asynchronous tasks (tf::Runtime::async,
+ tf::Runtime::silent_async).
+ Unlike tf::Subflow::join, you can join multiple times from
+ a tf::Runtime object.
+
+ @code{.cpp}
+ std::atomic<int> counter{0};
+ taskflow.emplace([&](tf::Runtime& rt){
+   // spawn 100 async tasks and join
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 100);
+
+   // spawn another 100 async tasks and join
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 200);
+ });
+ @endcode
+
+ @attention
+ Only the worker of this tf::Runtime can issue join.
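+ (For illustration, a corun_until sketch: the worker keeps stealing and
+ running tasks until the flag flips; assumes another task eventually sets
+ it, or the loop would not terminate.)
+
+ @code{.cpp}
+ std::atomic<bool> done{false};
+ taskflow.emplace([&](tf::Runtime& rt){
+   rt.corun_until([&](){ return done.load(std::memory_order_acquire); });
+ });
+ taskflow.emplace([&](){ done.store(true, std::memory_order_release); });
+ executor.run(taskflow).wait();
+ @endcode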
+ */
+ inline void join();
+
+ /**
+ @brief acquires a reference to the underlying worker
+ */
+ inline Worker& worker();
+
+ protected:
+
+ /**
+ @private
+ */
+ explicit Runtime(Executor&, Worker&, Node*);
+
+ /**
+ @private
+ */
+ Executor& _executor;
+
+ /**
+ @private
+ */
+ Worker& _worker;
+
+ /**
+ @private
+ */
+ Node* _parent;
+
+ /**
+ @private
+ */
+ template <typename F>
+ auto _async(Worker& w, const std::string& name, F&& f);
+
+ /**
+ @private
+ */
+ template <typename F>
+ void _silent_async(Worker& w, const std::string& name, F&& f);
+};
+
+// constructor
+inline Runtime::Runtime(Executor& e, Worker& w, Node* p) :
+  _executor{e},
+  _worker {w},
+  _parent {p}{
+}
+
+// Function: executor
+inline Executor& Runtime::executor() {
+  return _executor;
+}
+
+// Function: worker
+inline Worker& Runtime::worker() {
+  return _worker;
+}
+
+// ----------------------------------------------------------------------------
+// Node
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+class Node {

 friend class Graph;
 friend class Task;
 friend class TaskView;
 friend class Taskflow;
 friend class Executor;
 friend class FlowBuilder;
 friend class Subflow;
+ friend class Runtime;
+
+ enum class AsyncState : int {
+   UNFINISHED = 0,
+   LOCKED = 1,
+   FINISHED = 2
+ };

 TF_ENABLE_POOLABLE_ON_THIS;

 // state bit flag
- constexpr static int BRANCHED = 0x1;
- constexpr static int DETACHED = 0x2;
- constexpr static int ACQUIRED = 0x4;
+ constexpr static int CONDITIONED = 1;
+ constexpr static int DETACHED = 2;
+ constexpr static int ACQUIRED = 4;
+ constexpr static int READY = 8;
+
+ using Placeholder = std::monostate;

 // static work handle
 struct Static {

-   template <typename C>
+   template <typename C>
    Static(C&&);

-   std::function<void()> work;
+   std::variant<
+     std::function<void()>, std::function<void(Runtime&)>
+   > work;
 };

 // dynamic work handle
 struct Dynamic {

-   template <typename C>
+   template <typename C>
    Dynamic(C&&);

    std::function<void(Subflow&)> work;
    Graph subgraph;
 };

 // condition work handle
 struct Condition {

-   template <typename C>
+   template <typename C>
    Condition(C&&);
+
+   std::variant<
+     std::function<int()>, std::function<int(Runtime&)>
+   > work;
+ };

-   std::function<int()> work;
+ // multi-condition work handle
+ struct MultiCondition {
+
+   template <typename C>
+   MultiCondition(C&&);
+
+   std::variant<
+     std::function<SmallVector<int>()>, std::function<SmallVector<int>(Runtime&)>
+   > work;
 };

 // module work handle
 struct Module {

    template <typename T>
-   Module(T&&);
+   Module(T&);

-   Taskflow* module {nullptr};
+   Graph& graph;
 };

 // Async work
 struct Async {

    template <typename T>
-   Async(T&&, std::shared_ptr<AsyncTopology>);
-
-   std::function<void(bool)> work;
+   Async(T&&);

-   std::shared_ptr<AsyncTopology> topology;
+   std::function<void()> work;
 };

- // Silent async work
- struct SilentAsync {
+ // silent dependent async
+ struct DependentAsync {

    template <typename C>
-   SilentAsync(C&&);
-
+   DependentAsync(C&&);
+
    std::function<void()> work;
- };
-
- // cudaFlow work handle
- struct cudaFlow {
-
-   template <typename C, typename G>
-   cudaFlow(C&& c, G&& g);
-
-   std::function<void(Executor&, Node*)> work;
-
-   std::unique_ptr<CustomGraphBase> graph;
+   std::atomic<AsyncState> state {AsyncState::UNFINISHED};
 };

 using handle_t = std::variant<
-   std::monostate, // placeholder
-   Static,         // static tasking
-   Dynamic,        // dynamic tasking
-   Condition,      // conditional tasking
-   Module,         // composable tasking
-   Async,          // async tasking
-   SilentAsync,    // async tasking (no future)
-   cudaFlow        // cudaFlow
+   Placeholder,    // placeholder
+   Static,         // static tasking
+   Dynamic,        // dynamic tasking
+   Condition,      // conditional tasking
+   MultiCondition, // multi-conditional tasking
+   Module,         // composable tasking
+   Async,          // async tasking
+   DependentAsync  // dependent async tasking (no future)
 >;

- struct Semaphores {
to_acquire; - std::vector to_release; + + struct Semaphores { + SmallVector to_acquire; + SmallVector to_release; }; public: - + // variant index - constexpr static auto PLACEHOLDER = get_index_v; - constexpr static auto STATIC = get_index_v; - constexpr static auto DYNAMIC = get_index_v; - constexpr static auto CONDITION = get_index_v; - constexpr static auto MODULE = get_index_v; - constexpr static auto ASYNC = get_index_v; - constexpr static auto SILENT_ASYNC = get_index_v; - constexpr static auto CUDAFLOW = get_index_v; + constexpr static auto PLACEHOLDER = get_index_v; + constexpr static auto STATIC = get_index_v; + constexpr static auto DYNAMIC = get_index_v; + constexpr static auto CONDITION = get_index_v; + constexpr static auto MULTI_CONDITION = get_index_v; + constexpr static auto MODULE = get_index_v; + constexpr static auto ASYNC = get_index_v; + constexpr static auto DEPENDENT_ASYNC = get_index_v; - template - Node(Args&&... args); + Node() = default; - ~Node(); + template + Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&... args); - size_t num_successors() const; - size_t num_dependents() const; - size_t num_strong_dependents() const; - size_t num_weak_dependents() const; + ~Node(); - const std::string& name() const; + size_t num_successors() const; + size_t num_dependents() const; + size_t num_strong_dependents() const; + size_t num_weak_dependents() const; - private: + const std::string& name() const; - std::string _name; + private: - handle_t _handle; + std::string _name; + + unsigned _priority {0}; + + Topology* _topology {nullptr}; + Node* _parent {nullptr}; - std::vector _successors; - std::vector _dependents; + void* _data {nullptr}; - //std::optional _semaphores; - std::unique_ptr _semaphores; + SmallVector _successors; + SmallVector _dependents; - Topology* _topology {nullptr}; - - Node* _parent {nullptr}; + std::atomic _state {0}; + std::atomic _join_counter {0}; - int _state {0}; + std::unique_ptr _semaphores; + + handle_t _handle; - std::atomic _join_counter {0}; - - void _precede(Node*); - void _set_state(int); - void _unset_state(int); - void _clear_state(); - void _set_up_join_counter(); + void _precede(Node*); + void _set_up_join_counter(); - bool _has_state(int) const; - bool _is_cancelled() const; - bool _acquire_all(std::vector&); + bool _is_cancelled() const; + bool _is_conditioner() const; + bool _acquire_all(SmallVector&); - std::vector _release_all(); + SmallVector _release_all(); }; // ---------------------------------------------------------------------------- // Node Object Pool // ---------------------------------------------------------------------------- + +/** +@private +*/ inline ObjectPool node_pool; // ---------------------------------------------------------------------------- // Definition for Node::Static // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Static::Static(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- // Definition for Node::Dynamic // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Dynamic::Dynamic(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- // Definition for Node::Condition // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Condition::Condition(C&& 
c) : work {std::forward(c)} { -} +} // ---------------------------------------------------------------------------- -// Definition for Node::cudaFlow +// Definition for Node::MultiCondition // ---------------------------------------------------------------------------- -template -Node::cudaFlow::cudaFlow(C&& c, G&& g) : - work {std::forward(c)}, - graph {std::forward(g)} { +// Constructor +template +Node::MultiCondition::MultiCondition(C&& c) : work {std::forward(c)} { } - + // ---------------------------------------------------------------------------- // Definition for Node::Module // ---------------------------------------------------------------------------- - + // Constructor template -Node::Module::Module(T&& tf) : module {tf} { +inline Node::Module::Module(T& obj) : graph{ obj.graph() } { } // ---------------------------------------------------------------------------- // Definition for Node::Async // ---------------------------------------------------------------------------- - + // Constructor template -Node::Async::Async(C&& c, std::shared_ptrtpg) : - work {std::forward(c)}, - topology {std::move(tpg)} { +Node::Async::Async(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- -// Definition for Node::SilentAsync +// Definition for Node::DependentAsync // ---------------------------------------------------------------------------- // Constructor template -Node::SilentAsync::SilentAsync(C&& c) : - work {std::forward(c)} { +Node::DependentAsync::DependentAsync(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- @@ -303,18 +738,37 @@ Node::SilentAsync::SilentAsync(C&& c) : // Constructor template -Node::Node(Args&&... args): _handle{std::forward(args)...} { -} +Node::Node( + const std::string& name, + unsigned priority, + Topology* topology, + Node* parent, + size_t join_counter, + Args&&... args +) : + _name {name}, + _priority {priority}, + _topology {topology}, + _parent {parent}, + _join_counter {join_counter}, + _handle {std::forward(args)...} { +} + +//Node::Node(Args&&... 
args): _handle{std::forward(args)...} { +//} // Destructor inline Node::~Node() { // this is to avoid stack overflow if(_handle.index() == DYNAMIC) { - - auto& subgraph = std::get(_handle).subgraph; - + // using std::get_if instead of std::get makes this compatible + // with older macOS versions + // the result of std::get_if is guaranteed to be non-null + // due to the index check above + auto& subgraph = std::get_if(&_handle)->subgraph; std::vector nodes; + nodes.reserve(subgraph.size()); std::move( subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes) @@ -326,8 +780,7 @@ inline Node::~Node() { while(i < nodes.size()) { if(nodes[i]->_handle.index() == DYNAMIC) { - - auto& sbg = std::get(nodes[i]->_handle).subgraph; + auto& sbg = std::get_if(&(nodes[i]->_handle))->subgraph; std::move( sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes) ); @@ -336,7 +789,7 @@ inline Node::~Node() { ++i; } - + //auto& np = Graph::_node_pool(); for(i=0; i_handle.index() == Node::CONDITION) { + //if(_dependents[i]->_handle.index() == Node::CONDITION) { + if(_dependents[i]->_is_conditioner()) { n++; } } @@ -375,7 +829,8 @@ inline size_t Node::num_weak_dependents() const { inline size_t Node::num_strong_dependents() const { size_t n = 0; for(size_t i=0; i<_dependents.size(); i++) { - if(_dependents[i]->_handle.index() != Node::CONDITION) { + //if(_dependents[i]->_handle.index() != Node::CONDITION) { + if(!_dependents[i]->_is_conditioner()) { n++; } } @@ -387,58 +842,35 @@ inline const std::string& Node::name() const { return _name; } -// Procedure: _set_state -inline void Node::_set_state(int flag) { - _state |= flag; -} - -// Procedure: _unset_state -inline void Node::_unset_state(int flag) { - _state &= ~flag; -} - -// Procedure: _clear_state -inline void Node::_clear_state() { - _state = 0; -} - -// Function: _has_state -inline bool Node::_has_state(int flag) const { - return _state & flag; +// Function: _is_conditioner +inline bool Node::_is_conditioner() const { + return _handle.index() == Node::CONDITION || + _handle.index() == Node::MULTI_CONDITION; } // Function: _is_cancelled inline bool Node::_is_cancelled() const { - if(_handle.index() == Node::ASYNC) { - auto& h = std::get(_handle); - if(h.topology && h.topology->_is_cancelled) { - return true; - } - } - // async tasks spawned from subflow does not have topology - return _topology && _topology->_is_cancelled; + return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed); } // Procedure: _set_up_join_counter inline void Node::_set_up_join_counter() { - size_t c = 0; - for(auto p : _dependents) { - if(p->_handle.index() == Node::CONDITION) { - _set_state(Node::BRANCHED); + //if(p->_handle.index() == Node::CONDITION) { + if(p->_is_conditioner()) { + _state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed); } else { c++; } } - - _join_counter.store(c, std::memory_order_relaxed); + _join_counter.store(c, std::memory_order_release); } // Function: _acquire_all -inline bool Node::_acquire_all(std::vector& nodes) { +inline bool Node::_acquire_all(SmallVector& nodes) { auto& to_acquire = _semaphores->to_acquire; @@ -446,7 +878,7 @@ inline bool Node::_acquire_all(std::vector& nodes) { if(!to_acquire[i]->_try_acquire_or_wait(this)) { for(size_t j = 1; j <= i; ++j) { auto r = to_acquire[i-j]->_release(); - nodes.insert(end(nodes), begin(r), end(r)); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); } return false; } @@ -455,67 +887,73 @@ inline bool Node::_acquire_all(std::vector& nodes) { } // 
Function: _release_all -inline std::vector Node::_release_all() { +inline SmallVector Node::_release_all() { auto& to_release = _semaphores->to_release; - std::vector nodes; + SmallVector nodes; for(const auto& sem : to_release) { auto r = sem->_release(); - nodes.insert(end(nodes), begin(r), end(r)); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); } + return nodes; } +// ---------------------------------------------------------------------------- +// Node Deleter +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct NodeDeleter { + void operator ()(Node* ptr) { + node_pool.recycle(ptr); + } +}; + // ---------------------------------------------------------------------------- // Graph definition // ---------------------------------------------------------------------------- - -//// Function: _node_pool -//inline ObjectPool& Graph::_node_pool() { -// static ObjectPool pool; -// return pool; -//} // Destructor inline Graph::~Graph() { - //auto& np = _node_pool(); - for(auto node : _nodes) { - //np.recycle(node); - node_pool.recycle(node); - } + _clear(); } // Move constructor -inline Graph::Graph(Graph&& other) : +inline Graph::Graph(Graph&& other) : _nodes {std::move(other._nodes)} { } // Move assignment inline Graph& Graph::operator = (Graph&& other) { + _clear(); _nodes = std::move(other._nodes); return *this; } // Procedure: clear inline void Graph::clear() { - //auto& np = _node_pool(); + _clear(); +} + +// Procedure: clear +inline void Graph::_clear() { for(auto node : _nodes) { - //node->~Node(); - //np.deallocate(node); node_pool.recycle(node); } _nodes.clear(); } // Procedure: clear_detached -inline void Graph::clear_detached() { +inline void Graph::_clear_detached() { auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) { - return !(node->_has_state(Node::DETACHED)); + return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED); }); - - //auto& np = _node_pool(); + for(auto itr = mid; itr != _nodes.end(); ++itr) { node_pool.recycle(*itr); } @@ -523,50 +961,38 @@ inline void Graph::clear_detached() { } // Procedure: merge -inline void Graph::merge(Graph&& g) { +inline void Graph::_merge(Graph&& g) { for(auto n : g._nodes) { _nodes.push_back(n); } g._nodes.clear(); } +// Function: erase +inline void Graph::_erase(Node* node) { + if(auto I = std::find(_nodes.begin(), _nodes.end(), node); I != _nodes.end()) { + _nodes.erase(I); + node_pool.recycle(node); + } +} + // Function: size -// query the size inline size_t Graph::size() const { return _nodes.size(); } // Function: empty -// query the emptiness inline bool Graph::empty() const { return _nodes.empty(); } - -// Function: emplace_back -// create a node from a give argument; constructor is called if necessary + +/** +@private +*/ template -Node* Graph::emplace_back(ArgsT&&... args) { - //auto node = _node_pool().allocate(); - //new (node) Node(std::forward(args)...); - //_nodes.push_back(node); +Node* Graph::_emplace_back(ArgsT&&... args) { _nodes.push_back(node_pool.animate(std::forward(args)...)); return _nodes.back(); } -// Function: emplace_back -// create a node from a give argument; constructor is called if necessary -inline Node* Graph::emplace_back() { - //auto node = _node_pool().allocate(); - //new (node) Node(); - //_nodes.push_back(node); - _nodes.push_back(node_pool.animate()); - return _nodes.back(); -} - - } // end of namespace tf. 
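
As an illustrative sketch (assuming the private node_pool and the
default-constructible Node shown above), the NodeDeleter functor lets
internal code tie pooled nodes to RAII handles:

@code{.cpp}
// hypothetical internal usage: the node is recycled back to node_pool
// automatically when the guard goes out of scope
std::unique_ptr<tf::Node, tf::NodeDeleter> guard(tf::node_pool.animate());
@endcode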
--------------------------------------------------- - - - - - diff --git a/lib/taskflow/core/notifier.hpp b/lib/taskflow/core/notifier.hpp index a82f8a5..39bcf64 100644 --- a/lib/taskflow/core/notifier.hpp +++ b/lib/taskflow/core/notifier.hpp @@ -67,7 +67,7 @@ class Notifier { friend class Executor; public: - + struct Waiter { std::atomic next; std::mutex mu; @@ -199,7 +199,7 @@ class Notifier { } } } - + // notify n workers void notify_n(size_t n) { if(n >= _waiters.size()) { diff --git a/lib/taskflow/core/observer.hpp b/lib/taskflow/core/observer.hpp index 4ca0166..3c1873e 100644 --- a/lib/taskflow/core/observer.hpp +++ b/lib/taskflow/core/observer.hpp @@ -114,9 +114,9 @@ struct ProfileData { /** @class: ObserverInterface -@brief The interface class for creating an executor observer. +@brief class to derive an executor observer -The tf::ObserverInterface class let users define custom methods to monitor +The tf::ObserverInterface class allows users to define custom methods to monitor the behaviors of an executor. This is particularly useful when you want to inspect the performance of an executor and visualize when each thread participates in the execution of a task. @@ -168,8 +168,6 @@ executor.run(taskflow).wait(); */ class ObserverInterface { - friend class Executor; - public: /** @@ -185,17 +183,17 @@ class ObserverInterface { /** @brief method to call before a worker thread executes a closure - @param w an immutable view of this worker thread + @param wv an immutable view of this worker thread @param task_view a constant wrapper object to the task */ - virtual void on_entry(WorkerView w, TaskView task_view) = 0; + virtual void on_entry(WorkerView wv, TaskView task_view) = 0; /** @brief method to call after a worker thread executed a closure - @param w an immutable view of this worker thread + @param wv an immutable view of this worker thread @param task_view a constant wrapper object to the task */ - virtual void on_exit(WorkerView w, TaskView task_view) = 0; + virtual void on_exit(WorkerView wv, TaskView task_view) = 0; }; // ---------------------------------------------------------------------------- @@ -205,7 +203,7 @@ class ObserverInterface { /** @class: ChromeObserver -@brief observer interface based on Chrome tracing format +@brief class to create an observer based on Chrome tracing format A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump the observed thread activities into a format that can be visualized through @@ -338,6 +336,8 @@ inline void ChromeObserver::clear() { // Procedure: dump inline void ChromeObserver::dump(std::ostream& os) const { + using namespace std::chrono; + size_t first; for(first = 0; first<_timeline.segments.size(); ++first) { @@ -356,8 +356,7 @@ inline void ChromeObserver::dump(std::ostream& os) const { for(size_t i=0; i<_timeline.segments[w].size(); i++) { - os << '{' - << "\"cat\":\"ChromeObserver\","; + os << '{'<< "\"cat\":\"ChromeObserver\","; // name field os << "\"name\":\""; @@ -373,10 +372,10 @@ inline void ChromeObserver::dump(std::ostream& os) const { os << "\"ph\":\"X\"," << "\"pid\":1," << "\"tid\":" << w << ',' - << "\"ts\":" << std::chrono::duration_cast( + << "\"ts\":" << duration_cast( _timeline.segments[w][i].beg - _timeline.origin ).count() << ',' - << "\"dur\":" << std::chrono::duration_cast( + << "\"dur\":" << duration_cast( _timeline.segments[w][i].end - _timeline.segments[w][i].beg ).count(); @@ -415,7 +414,7 @@ inline size_t ChromeObserver::num_tasks() const { /** @class TFProfObserver -@brief 
observer interface based on the built-in taskflow profiler format +@brief class to create an observer based on the built-in taskflow profiler format A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump the observed thread activities into a format that can be visualized through @@ -438,17 +437,48 @@ executor.run(taskflow).wait(); observer->dump(std::cout); @endcode -We recommend using our @TFProf python script to observe thread activities -instead of the raw function call. -The script will turn on environment variables needed for observing all executors -in a taskflow program and dump the result to a valid, clean JSON file -compatible with the format of @TFProf. */ class TFProfObserver : public ObserverInterface { friend class Executor; friend class TFProfManager; + + /** @private overall task summary */ + struct TaskSummary { + size_t count {0}; + size_t total_span {0}; + size_t min_span; + size_t max_span; + + float avg_span() const { return total_span * 1.0f / count; } + }; + + /** @private worker summary at a level */ + struct WorkerSummary { + + size_t id; + size_t level; + size_t count {0}; + size_t total_span {0}; + size_t min_span{0}; + size_t max_span{0}; + + std::array tsum; + + float avg_span() const { return total_span * 1.0f / count; } + //return count < 2 ? 0.0f : total_delay * 1.0f / (count-1); + }; + /** @private */ + struct Summary { + std::array tsum; + std::vector wsum; + + void dump_tsum(std::ostream&) const; + void dump_wsum(std::ostream&) const; + void dump(std::ostream&) const; + }; + public: /** @@ -462,6 +492,16 @@ class TFProfObserver : public ObserverInterface { */ std::string dump() const; + /** + @brief shows the summary report through an output stream + */ + void summary(std::ostream& ostream) const; + + /** + @brief returns the summary report in a string + */ + std::string summary() const; + /** @brief clears the timeline data */ @@ -471,6 +511,11 @@ class TFProfObserver : public ObserverInterface { @brief queries the number of tasks observed */ size_t num_tasks() const; + + /** + @brief queries the number of observed workers + */ + size_t num_workers() const; private: @@ -483,6 +528,155 @@ class TFProfObserver : public ObserverInterface { inline void on_exit(WorkerView, TaskView) override final; }; + +// dump the task summary +inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const { + + // task summary + size_t type_w{10}, count_w{5}, time_w{9}, avg_w{8}, min_w{8}, max_w{8}; + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + count_w = std::max(count_w, std::to_string(i.count).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + time_w = std::max(time_w, std::to_string(i.total_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + avg_w = std::max(time_w, std::to_string(i.avg_span()).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + min_w = std::max(min_w, std::to_string(i.min_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + max_w = std::max(max_w, std::to_string(i.max_span).size()); + }); + + os << std::setw(type_w) << "-Task-" + << std::setw(count_w+2) << "Count" + << std::setw(time_w+2) << "Time (us)" + << std::setw(avg_w+2) << "Avg (us)" + << std::setw(min_w+2) << "Min (us)" + << std::setw(max_w+2) << "Max (us)" + << '\n'; + + for(size_t 
i=0; i(); @@ -530,6 +724,8 @@ inline void TFProfObserver::clear() { // Procedure: dump inline void TFProfObserver::dump(std::ostream& os) const { + using namespace std::chrono; + size_t first; for(first = 0; first<_timeline.segments.size(); ++first) { @@ -571,12 +767,10 @@ inline void TFProfObserver::dump(std::ostream& os) const { // span os << "{\"span\":[" - << std::chrono::duration_cast( - s.beg - _timeline.origin - ).count() << "," - << std::chrono::duration_cast( - s.end - _timeline.origin - ).count() << "],"; + << duration_cast(s.beg - _timeline.origin).count() + << "," + << duration_cast(s.end - _timeline.origin).count() + << "],"; // name os << "\"name\":\""; @@ -588,7 +782,7 @@ inline void TFProfObserver::dump(std::ostream& os) const { } os << "\","; - // category "type": "Condition Task", + // e.g., category "type": "Condition Task" os << "\"type\":\"" << to_string(s.type) << "\""; os << "}"; @@ -607,15 +801,124 @@ inline std::string TFProfObserver::dump() const { return oss.str(); } +// Procedure: summary +inline void TFProfObserver::summary(std::ostream& os) const { + + using namespace std::chrono; + + Summary summary; + std::optional view_beg, view_end; + + // find the first non-empty worker + size_t first; + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + // not timeline data to dump + if(first == _timeline.segments.size()) { + goto end_of_summary; + } + + for(size_t w=first; w<_timeline.segments.size(); w++) { + for(size_t l=0; l<_timeline.segments[w].size(); l++) { + + if(_timeline.segments[w][l].empty()) { + continue; + } + + // worker w at level l + WorkerSummary ws; + ws.id = w; + ws.level = l; + ws.count = _timeline.segments[w][l].size(); + + // scan all tasks at level l + for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { + + // update the entire span + auto& s = _timeline.segments[w][l][i]; + view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg; + view_end = view_end ? std::max(*view_end, s.end) : s.end; + + // update the task summary + size_t t = duration_cast(s.end - s.beg).count(); + + auto& x = summary.tsum[static_cast(s.type)]; + x.count += 1; + x.total_span += t; + x.min_span = (x.count == 1) ? t : std::min(t, x.min_span); + x.max_span = (x.count == 1) ? t : std::max(t, x.max_span); + + // update the worker summary + ws.total_span += t; + ws.min_span = (i == 0) ? t : std::min(t, ws.min_span); + ws.max_span = (i == 0) ? t : std::max(t, ws.max_span); + + auto&y = ws.tsum[static_cast(s.type)]; + y.count += 1; + y.total_span += t; + y.min_span = (y.count == 1) ? t : std::min(t, y.min_span); + y.max_span = (y.count == 1) ? t : std::max(t, y.max_span); + + // update the delay + //if(i) { + // size_t d = duration_cast( + // s.beg - _timeline.segments[w][l][i-1].end + // ).count(); + // ws.total_delay += d; + // ws.min_delay = (i == 1) ? d : std::min(ws.min_delay, d); + // ws.max_delay = (i == 1) ? 
d : std::max(ws.max_delay, d); + //} + } + summary.wsum.push_back(ws); + } + } + + end_of_summary: + + size_t view = 0; + if(view_beg && view_end) { + view = duration_cast(*view_end - *view_beg).count(); + } + + os << "==Observer " << _timeline.uid << ": " + << num_workers() << " workers completed " + << num_tasks() << " tasks in " + << view << " us\n"; + + summary.dump(os); +} + +// Procedure: summary +inline std::string TFProfObserver::summary() const { + std::ostringstream oss; + summary(oss); + return oss.str(); +} + // Function: num_tasks inline size_t TFProfObserver::num_tasks() const { - return std::accumulate( - _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, - [](size_t sum, const auto& exe){ - return sum + exe.size(); + size_t s = 0; + for(size_t w=0; w<_timeline.segments.size(); ++w) { + for(size_t l=0; l<_timeline.segments[w].size(); ++l) { + s += _timeline.segments[w][l].size(); } - ); + } + return s; } + +// Function: num_workers +inline size_t TFProfObserver::num_workers() const { + size_t w = 0; + for(size_t i=0; i<_timeline.segments.size(); ++i) { + w += (!_timeline.segments[i].empty()); + } + return w; +} + // ---------------------------------------------------------------------------- // TFProfManager @@ -682,11 +985,11 @@ inline TFProfManager::~TFProfManager() { for(size_t i=0; i<_observers.size(); ++i) { data.timelines.push_back(std::move(_observers[i]->_timeline)); } - Serializer serializer(ofs); + Serializer serializer(ofs); serializer(data); } // .json - else { + else { // if(_fpath.rfind(".json") != std::string::npos) { ofs << "[\n"; for(size_t i=0; i<_observers.size(); ++i) { if(i) ofs << ','; @@ -695,6 +998,14 @@ inline TFProfManager::~TFProfManager() { ofs << "]\n"; } } + // do a summary report in stderr for each observer + else { + std::ostringstream oss; + for(size_t i=0; i<_observers.size(); ++i) { + _observers[i]->summary(oss); + } + fprintf(stderr, "%s", oss.str().c_str()); + } } // Function: get diff --git a/lib/taskflow/core/semaphore.hpp b/lib/taskflow/core/semaphore.hpp index 75d49be..12d6069 100644 --- a/lib/taskflow/core/semaphore.hpp +++ b/lib/taskflow/core/semaphore.hpp @@ -5,7 +5,7 @@ #include "declarations.hpp" -/** +/** @file semaphore.hpp @brief semaphore include file */ @@ -23,16 +23,16 @@ namespace tf { A semaphore creates a constraint that limits the maximum concurrency, i.e., the number of workers, in a set of tasks. -You can let a task acquire/release one or multiple semaphores before/after +You can let a task acquire/release one or multiple semaphores before/after executing its work. -A task can acquire and release a semaphore, -or just acquire or just release it. +A task can acquire and release a semaphore, +or just acquire or just release it. A tf::Semaphore object starts with an initial count. As long as that count is above 0, tasks can acquire the semaphore and do their work. If the count is 0 or less, a task trying to acquire the semaphore will not run but goes to a waiting list of that semaphore. -When the semaphore is released by another task, +When the semaphore is released by another task, it reschedules all tasks on that waiting list. @code{.cpp} @@ -62,7 +62,7 @@ Under normal circumstances, the five tasks would be executed concurrently. However, this example has a semaphore with initial count 1, and all tasks need to acquire that semaphore before running and release that semaphore after they are done. -This organization limits the number of concurrently running tasks to only one. 
+This arrangement limits the number of concurrently running tasks to only one.
*/
class Semaphore {

  friend class Node;

  public:
-
+
    /**
    @brief constructs a semaphore with the given counter
+
+    A semaphore creates a constraint that limits the maximum concurrency,
+    i.e., the number of workers, in a set of tasks.
+
+    @code{.cpp}
+    tf::Semaphore semaphore(4);  // concurrency constraint of 4 workers
+    @endcode
    */
-    explicit Semaphore(int max_workers);
-
+    explicit Semaphore(size_t max_workers);
+
    /**
    @brief queries the counter value (not thread-safe during the run)
    */
-    int count() const;
-
+    size_t count() const;
+
  private:

    std::mutex _mtx;
-    int _counter;
+    size_t _counter;

    std::vector<Node*> _waiters;
-
+
    bool _try_acquire_or_wait(Node*);

    std::vector<Node*> _release();
};

-inline Semaphore::Semaphore(int max_workers) :
+inline Semaphore::Semaphore(size_t max_workers) :
  _counter(max_workers) {
}
-
+
inline bool Semaphore::_try_acquire_or_wait(Node* me) {
  std::lock_guard<std::mutex> lock(_mtx);
  if(_counter > 0) {
@@ -117,7 +124,7 @@ inline std::vector<Node*> Semaphore::_release() {
  return r;
}

-inline int Semaphore::count() const {
+inline size_t Semaphore::count() const {
  return _counter;
}

diff --git a/lib/taskflow/core/task.hpp b/lib/taskflow/core/task.hpp
index 2cc4621..cd10b73 100644
--- a/lib/taskflow/core/task.hpp
+++ b/lib/taskflow/core/task.hpp
@@ -2,7 +2,7 @@

#include "graph.hpp"

-/**
+/**
@file task.hpp
@brief task include file
*/
@@ -19,45 +19,61 @@ namespace tf {
@brief enumeration of all task types
*/
enum class TaskType : int {
+  /** @brief placeholder task type */
  PLACEHOLDER = 0,
-  CUDAFLOW,
+  /** @brief static task type */
  STATIC,
+  /** @brief dynamic (subflow) task type */
  DYNAMIC,
+  /** @brief condition task type */
  CONDITION,
+  /** @brief module task type */
  MODULE,
+  /** @brief asynchronous task type */
  ASYNC,
-  UNDEFINED
+  /** @brief undefined task type (for internal use only) */
+  UNDEFINED
};

/**
+@private
@brief array of all task types (used for iterating task types)
*/
-inline constexpr std::array<TaskType, 7> TASK_TYPES = {
+inline constexpr std::array<TaskType, 6> TASK_TYPES = {
  TaskType::PLACEHOLDER,
-  TaskType::CUDAFLOW,
  TaskType::STATIC,
  TaskType::DYNAMIC,
  TaskType::CONDITION,
  TaskType::MODULE,
-  TaskType::ASYNC
+  TaskType::ASYNC,
};

/**
@brief convert a task type to a human-readable string
+
+The name of each task type is the lower-case string of its enumerator.
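+
+For a quick illustration (an added sketch; it assumes a taskflow holding
+one static task), tf::to_string pairs naturally with tf::Task::type:
+
+@code{.cpp}
+tf::Task t = taskflow.emplace([](){});
+std::cout << tf::to_string(t.type());  // prints "static"
+@endcode
+
+The complete mapping is: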
+ +@code{.cpp} +TaskType::PLACEHOLDER -> "placeholder" +TaskType::STATIC -> "static" +TaskType::DYNAMIC -> "subflow" +TaskType::CONDITION -> "condition" +TaskType::MODULE -> "module" +TaskType::ASYNC -> "async" +@endcode */ inline const char* to_string(TaskType type) { const char* val; switch(type) { - case TaskType::PLACEHOLDER: val = "placeholder"; break; - case TaskType::CUDAFLOW: val = "cudaflow"; break; - case TaskType::STATIC: val = "static"; break; - case TaskType::DYNAMIC: val = "subflow"; break; - case TaskType::CONDITION: val = "condition"; break; - case TaskType::MODULE: val = "module"; break; - case TaskType::ASYNC: val = "async"; break; - default: val = "undefined"; break; + case TaskType::PLACEHOLDER: val = "placeholder"; break; + case TaskType::STATIC: val = "static"; break; + case TaskType::DYNAMIC: val = "subflow"; break; + case TaskType::CONDITION: val = "condition"; break; + case TaskType::MODULE: val = "module"; break; + case TaskType::ASYNC: val = "async"; break; + default: val = "undefined"; break; } return val; @@ -68,39 +84,51 @@ inline const char* to_string(TaskType type) { // ---------------------------------------------------------------------------- /** -@brief determines if a callable is a static task +@brief determines if a callable is a dynamic task -A static task is a callable object constructible from std::function. +A dynamic task is a callable object constructible from std::function. */ template -constexpr bool is_static_task_v = std::is_invocable_r_v && - !std::is_invocable_r_v; +constexpr bool is_dynamic_task_v = + std::is_invocable_r_v && + !std::is_invocable_r_v; /** -@brief determines if a callable is a dynamic task +@brief determines if a callable is a condition task -A dynamic task is a callable object constructible from std::function. +A condition task is a callable object constructible from std::function +or std::function. */ template -constexpr bool is_dynamic_task_v = std::is_invocable_r_v; +constexpr bool is_condition_task_v = + (std::is_invocable_r_v || std::is_invocable_r_v) && + !is_dynamic_task_v; /** -@brief determines if a callable is a condition task +@brief determines if a callable is a multi-condition task -A condition task is a callable object constructible from std::function. +A multi-condition task is a callable object constructible from +std::function()> or +std::function(tf::Runtime&)>. */ template -constexpr bool is_condition_task_v = std::is_invocable_r_v; +constexpr bool is_multi_condition_task_v = + (std::is_invocable_r_v, C> || + std::is_invocable_r_v, C, Runtime&>) && + !is_dynamic_task_v; /** -@brief determines if a callable is a cudaflow task +@brief determines if a callable is a static task -A cudaFlow task is a callable object constructible from -std::function or std::function. +A static task is a callable object constructible from std::function +or std::function. */ template -constexpr bool is_cudaflow_task_v = std::is_invocable_r_v || - std::is_invocable_r_v; +constexpr bool is_static_task_v = + (std::is_invocable_r_v || std::is_invocable_r_v) && + !is_condition_task_v && + !is_multi_condition_task_v && + !is_dynamic_task_v; // ---------------------------------------------------------------------------- // Task @@ -109,19 +137,23 @@ constexpr bool is_cudaflow_task_v = std::is_invocable_r_v || /** @class Task -@brief handle to a node in a task dependency graph - -A Task is handle to manipulate a node in a taskflow graph. 
-It provides a set of methods for users to access and modify the attributes of -the associated graph node without directly touching internal node data. +@brief class to create a task handle over a node in a taskflow graph +A task is a wrapper over a node in a taskflow graph. +It provides a set of methods for users to access and modify the attributes of +the associated node in the taskflow graph. +A task is very lightweight object (i.e., only storing a node pointer) that +can be trivially copied around, +and it does not own the lifetime of the associated node. */ class Task { friend class FlowBuilder; + friend class Runtime; friend class Taskflow; friend class TaskView; - + friend class Executor; + public: /** @@ -133,12 +165,12 @@ class Task { @brief constructs the task with the copy of the other task */ Task(const Task& other); - + /** @brief replaces the contents with a copy of the other task */ Task& operator = (const Task&); - + /** @brief replaces the contents with a null pointer */ @@ -153,12 +185,12 @@ class Task { @brief compares if two tasks are not associated with the same graph node */ bool operator != (const Task& rhs) const; - + /** @brief queries the name of the task */ const std::string& name() const; - + /** @brief queries the number of successors of the task */ @@ -168,7 +200,7 @@ class Task { @brief queries the number of predecessors of the task */ size_t num_dependents() const; - + /** @brief queries the number of strong dependents of the task */ @@ -178,7 +210,7 @@ class Task { @brief queries the number of weak dependents of the task */ size_t num_weak_dependents() const; - + /** @brief assigns a name to the task @@ -193,22 +225,24 @@ class Task { @tparam C callable type - @param callable callable to construct one of the static, dynamic, condition, and cudaFlow tasks + @param callable callable to construct a task @return @c *this */ template Task& work(C&& callable); - + /** @brief creates a module task from a taskflow - @param taskflow a taskflow object for the module + @tparam T object type + @param object a custom object that defines @c T::graph() method @return @c *this */ - Task& composed_of(Taskflow& taskflow); - + template + Task& composed_of(T& object); + /** @brief adds precedence links from this to other tasks @@ -220,11 +254,11 @@ class Task { */ template Task& precede(Ts&&... 
tasks); - + /** @brief adds precedence links from other tasks to this - @tparam Ts parameter pack + @tparam Ts parameter pack @param tasks one or multiple tasks @@ -242,7 +276,54 @@ class Task { @brief makes the task acquire this semaphore */ Task& acquire(Semaphore& semaphore); + + /** + @brief assigns pointer to user data + + @param data pointer to user data + + The following example shows how to attach user data to a task and + run the task iteratively while changing the data value: + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow("attach data to a task"); + + int data; + + // create a task and attach it the data + auto A = taskflow.placeholder(); + A.data(&data).work([A](){ + auto d = *static_cast(A.data()); + std::cout << "data is " << d << std::endl; + }); + + // run the taskflow iteratively with changing data + for(data = 0; data<10; data++){ + executor.run(taskflow).wait(); + } + @endcode + + @return @c *this + */ + Task& data(void* data); + + /** + @brief assigns a priority value to the task + + A priority value can be one of the following three levels, + tf::TaskPriority::HIGH (numerically equivalent to 0), + tf::TaskPriority::NORMAL (numerically equivalent to 1), and + tf::TaskPriority::LOW (numerically equivalent to 2). + The smaller the priority value, the higher the priority. + */ + Task& priority(TaskPriority p); + /** + @brief queries the priority value of the task + */ + TaskPriority priority() const; + /** @brief resets the task handle to null */ @@ -262,13 +343,13 @@ class Task { @brief queries if the task has a work assigned */ bool has_work() const; - + /** @brief applies an visitor callable to each successor of the task */ template void for_each_successor(V&& visitor) const; - + /** @brief applies an visitor callable to each dependents of the task */ @@ -279,7 +360,7 @@ class Task { @brief obtains a hash value of the underlying node */ size_t hash_value() const; - + /** @brief returns the task type */ @@ -290,8 +371,14 @@ class Task { */ void dump(std::ostream& ostream) const; + /** + @brief queries pointer to user data + */ + void* data() const; + + private: - + Task(Node*); Node* _node {nullptr}; @@ -322,8 +409,9 @@ Task& Task::succeed(Ts&&... 
tasks) { } // Function: composed_of -inline Task& Task::composed_of(Taskflow& tf) { - _node->_handle.emplace(&tf); +template +Task& Task::composed_of(T& object) { + _node->_handle.emplace(object); return *this; } @@ -358,7 +446,6 @@ inline Task& Task::name(const std::string& name) { // Function: acquire inline Task& Task::acquire(Semaphore& s) { if(!_node->_semaphores) { - //_node->_semaphores.emplace(); _node->_semaphores = std::make_unique(); } _node->_semaphores->to_acquire.push_back(&s); @@ -423,15 +510,15 @@ inline bool Task::has_work() const { // Function: task_type inline TaskType Task::type() const { switch(_node->_handle.index()) { - case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; - case Node::STATIC: return TaskType::STATIC; - case Node::DYNAMIC: return TaskType::DYNAMIC; - case Node::CONDITION: return TaskType::CONDITION; - case Node::MODULE: return TaskType::MODULE; - case Node::ASYNC: return TaskType::ASYNC; - case Node::SILENT_ASYNC: return TaskType::ASYNC; - case Node::CUDAFLOW: return TaskType::CUDAFLOW; - default: return TaskType::UNDEFINED; + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; } } @@ -467,6 +554,7 @@ inline void Task::dump(std::ostream& os) const { // Function: work template Task& Task::work(C&& c) { + if constexpr(is_static_task_v) { _node->_handle.emplace(std::forward(c)); } @@ -476,8 +564,8 @@ Task& Task::work(C&& c) { else if constexpr(is_condition_task_v) { _node->_handle.emplace(std::forward(c)); } - else if constexpr(is_cudaflow_task_v) { - _node->_handle.emplace(std::forward(c)); + else if constexpr(is_multi_condition_task_v) { + _node->_handle.emplace(std::forward(c)); } else { static_assert(dependent_false_v, "invalid task callable"); @@ -485,18 +573,42 @@ Task& Task::work(C&& c) { return *this; } +// Function: data +inline void* Task::data() const { + return _node->_data; +} + +// Function: data +inline Task& Task::data(void* data) { + _node->_data = data; + return *this; +} + +// Function: priority +inline Task& Task::priority(TaskPriority p) { + _node->_priority = static_cast(p); + return *this; +} + +// Function: priority +inline TaskPriority Task::priority() const { + return static_cast(_node->_priority); +} + // ---------------------------------------------------------------------------- // global ostream // ---------------------------------------------------------------------------- /** -@brief overload of ostream inserter operator for cudaTask +@brief overload of ostream inserter operator for Task */ inline std::ostream& operator << (std::ostream& os, const Task& task) { task.dump(os); return os; } +// ---------------------------------------------------------------------------- +// Task View // ---------------------------------------------------------------------------- /** @@ -505,7 +617,7 @@ inline std::ostream& operator << (std::ostream& os, const Task& task) { @brief class to access task information from the observer interface */ class TaskView { - + friend class Executor; public: @@ -514,7 +626,7 @@ class TaskView { @brief queries the name of the task */ const std::string& name() const; - + /** @brief queries the number of successors of the task 
*/ @@ -524,7 +636,7 @@ class TaskView { @brief queries the number of predecessors of the task */ size_t num_dependents() const; - + /** @brief queries the number of strong dependents of the task */ @@ -540,7 +652,7 @@ class TaskView { */ template void for_each_successor(V&& visitor) const; - + /** @brief applies an visitor callable to each dependents of the task */ @@ -551,14 +663,14 @@ class TaskView { @brief queries the task type */ TaskType type() const; - + /** @brief obtains a hash value of the underlying node */ size_t hash_value() const; - + private: - + TaskView(const Node&); TaskView(const TaskView&) = default; @@ -597,18 +709,18 @@ inline size_t TaskView::num_successors() const { // Function: type inline TaskType TaskView::type() const { switch(_node._handle.index()) { - case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; - case Node::STATIC: return TaskType::STATIC; - case Node::DYNAMIC: return TaskType::DYNAMIC; - case Node::CONDITION: return TaskType::CONDITION; - case Node::MODULE: return TaskType::MODULE; - case Node::ASYNC: return TaskType::ASYNC; - case Node::SILENT_ASYNC: return TaskType::ASYNC; - case Node::CUDAFLOW: return TaskType::CUDAFLOW; - default: return TaskType::UNDEFINED; + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; } } - + // Function: hash_value inline size_t TaskView::hash_value() const { return std::hash{}(&_node); @@ -618,7 +730,7 @@ inline size_t TaskView::hash_value() const { template void TaskView::for_each_successor(V&& visitor) const { for(size_t i=0; i<_node._successors.size(); ++i) { - visitor(TaskView(_node._successors[i])); + visitor(TaskView(*_node._successors[i])); } } @@ -626,7 +738,7 @@ void TaskView::for_each_successor(V&& visitor) const { template void TaskView::for_each_dependent(V&& visitor) const { for(size_t i=0; i<_node._dependents.size(); ++i) { - visitor(TaskView(_node._dependents[i])); + visitor(TaskView(*_node._dependents[i])); } } diff --git a/lib/taskflow/core/taskflow.hpp b/lib/taskflow/core/taskflow.hpp index 00b26f3..ff836f5 100644 --- a/lib/taskflow/core/taskflow.hpp +++ b/lib/taskflow/core/taskflow.hpp @@ -2,8 +2,8 @@ #include "flow_builder.hpp" -/** -@file core/taskflow.hpp +/** +@file taskflow/core/taskflow.hpp @brief taskflow include file */ @@ -12,47 +12,55 @@ namespace tf { // ---------------------------------------------------------------------------- /** -@class Taskflow +@class Taskflow -@brief main entry to create a task dependency graph +@brief class to create a taskflow object -A %taskflow manages a task dependency graph where each task represents a -callable object (e.g., @std_lambda, @std_function) and an edge represents a +A %taskflow manages a task dependency graph where each task represents a +callable object (e.g., @std_lambda, @std_function) and an edge represents a dependency between two tasks. A task is one of the following types: - - 1. static task: the callable constructible from - @c std::function - 2. dynamic task: the callable constructible from - @c std::function - 3. condition task: the callable constructible from - @c std::function - 4. module task: the task constructed from tf::Taskflow::composed_of - 5. 
%cudaFlow task: the callable constructible from
-     @c std::function<void(tf::cudaFlow&)> or
-     @c std::function<void(tf::cudaFlowCapturer&)>
+
+  1. static task         : the callable constructible from
+                           @c std::function<void()>
+  2. dynamic task        : the callable constructible from
+                           @c std::function<void(tf::Subflow&)>
+  3. condition task      : the callable constructible from
+                           @c std::function<int()>
+  4. multi-condition task: the callable constructible from
+                           @c %std::function<tf::SmallVector<int>()>
+  5. module task         : the task constructed from tf::Taskflow::composed_of

Each task is a basic computation unit and is run by one worker thread
from an executor.
-The following example creates a simple taskflow graph of four static tasks,
+The following example creates a simple taskflow graph of four static tasks,
@c A, @c B, @c C, and @c D, where
-@c A runs before @c B and @c C and
+@c A runs before @c B and @c C and
@c D runs after @c B and @c C.

@code{.cpp}
tf::Executor executor;
tf::Taskflow taskflow("simple");

-tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; });
+tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; });
tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; });
tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; });
tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; });

A.precede(B, C);  // A runs before B and C
D.succeed(B, C);  // D runs after B and C
-
-executor.run(taskflow).wait();
+
+executor.run(taskflow).wait();
@endcode

+The taskflow object itself is NOT thread-safe. You should not
+modify the graph while it is running, for example by adding new tasks,
+adding new dependencies, or moving the taskflow to another object.
+To minimize the overhead of task creation,
+our runtime leverages a global object pool to recycle
+tasks in a thread-safe manner.
+
Please refer to @ref Cookbook to learn more about each task type
and how to submit a taskflow to an executor.
*/
@@ -63,14 +71,20 @@ class Taskflow : public FlowBuilder {
  friend class FlowBuilder;

  struct Dumper {
-    std::stack<const Taskflow*> stack;
-    std::unordered_set<const Taskflow*> visited;
+    size_t id;
+    std::stack<std::pair<const Node*, const Graph*>> stack;
+    std::unordered_map<const Graph*, size_t> visited;
  };

  public:

    /**
    @brief constructs a taskflow with the given name
+
+    @code{.cpp}
+    tf::Taskflow taskflow("My Taskflow");
+    std::cout << taskflow.name();         // "My Taskflow"
+    @endcode
    */
    Taskflow(const std::string& name);

@@ -79,52 +93,140 @@ class Taskflow : public FlowBuilder {
    */
    Taskflow();

+    /**
+    @brief constructs a taskflow from a moved taskflow
+
+    Constructing a taskflow @c taskflow1 from a moved taskflow @c taskflow2 will
+    migrate the graph of @c taskflow2 to @c taskflow1.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    tf::Taskflow taskflow1(std::move(taskflow2));
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that @c taskflow2 should not be running in an executor
+    during the move operation, or the behavior is undefined.
+    */
+    Taskflow(Taskflow&& rhs);
+
+    /**
+    @brief move assignment operator
+
+    Moving a taskflow @c taskflow2 to another taskflow @c taskflow1 will destroy
+    the existing graph of @c taskflow1 and assign it the graph of @c taskflow2.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    taskflow1 = std::move(taskflow2);
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that both @c taskflow1 and @c taskflow2 should not be running
+    in an executor during the move operation, or the behavior is undefined.
+    */
+    Taskflow& operator = (Taskflow&& rhs);
+
    /**
    @brief default destructor

    When the destructor is called, all tasks and their associated data
    (e.g., captured data) will be destroyed.
- It is your responsibility to ensure all submitted execution of this + It is your responsibility to ensure all submitted execution of this taskflow have completed before destroying it. + For instance, the following code results in undefined behavior + since the executor may still be running the taskflow while + it is destroyed after the block. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow); + } + @endcode + + To fix the problem, we must wait for the execution to complete + before destroying the taskflow. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow).wait(); + } + @endcode */ ~Taskflow() = default; /** @brief dumps the taskflow to a DOT format through a std::ostream target + + @code{.cpp} + taskflow.dump(std::cout); // dump the graph to the standard output + + std::ofstream ofs("output.dot"); + taskflow.dump(ofs); // dump the graph to the file output.dot + @endcode + + For dynamically spawned tasks, such as module tasks, subflow tasks, + and GPU tasks, you need to run the taskflow first before you can + dump the entire graph. + + @code{.cpp} + tf::Task parent = taskflow.emplace([](tf::Subflow sf){ + sf.emplace([](){ std::cout << "child\n"; }); + }); + taskflow.dump(std::cout); // this dumps only the parent tasks + executor.run(taskflow).wait(); + taskflow.dump(std::cout); // this dumps both parent and child tasks + @endcode */ void dump(std::ostream& ostream) const; - + /** @brief dumps the taskflow to a std::string of DOT format + + This method is similar to tf::Taskflow::dump(std::ostream& ostream), + but returning a string of the graph in DOT format. */ std::string dump() const; - + /** @brief queries the number of tasks */ size_t num_tasks() const; - + /** @brief queries the emptiness of the taskflow + + An empty taskflow has no tasks. That is the return of + tf::Taskflow::num_tasks is zero. */ bool empty() const; /** @brief assigns a name to the taskflow + + @code{.cpp} + taskflow.name("assign another name"); + @endcode */ - void name(const std::string&); + void name(const std::string&); /** @brief queries the name of the taskflow + + @code{.cpp} + std::cout << "my name is: " << taskflow.name(); + @endcode */ - const std::string& name() const ; - + const std::string& name() const; + /** @brief clears the associated task dependency graph - + When you clear a taskflow, all tasks and their associated data - (e.g., captured data) will be destroyed. - You should never clean a taskflow while it is being run by an executor. + (e.g., captured data in task callables) will be destroyed. + The behavior of clearing a running taskflow is undefined. */ void clear(); @@ -144,23 +246,34 @@ class Taskflow : public FlowBuilder { template void for_each_task(V&& visitor) const; + /** + @brief returns a reference to the underlying graph object + + A graph object (of type tf::Graph) is the ultimate storage for the + task dependency graph and should only be used as an opaque + data structure to interact with the executor (e.g., composition). 
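+
+  For example (an added sketch; @c MyModule is a hypothetical user-defined
+  type), any object that exposes a @c graph() method can be composed
+  through tf::Taskflow::composed_of:
+
+  @code{.cpp}
+  struct MyModule {
+    tf::Taskflow taskflow;                           // owns the actual graph
+    tf::Graph& graph() { return taskflow.graph(); }  // expose it for composition
+  };
+
+  MyModule module;
+  tf::Taskflow top;
+  top.composed_of(module).name("module");            // composes the module's graph
+  @endcode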
+ */ + Graph& graph(); + private: - + + mutable std::mutex _mutex; + std::string _name; - - Graph _graph; - std::mutex _mtx; + Graph _graph; std::queue> _topologies; - - void _dump(std::ostream&, const Taskflow*) const; + + std::optional::iterator> _satellite; + + void _dump(std::ostream&, const Graph*) const; void _dump(std::ostream&, const Node*, Dumper&) const; - void _dump(std::ostream&, const Graph&, Dumper&) const; + void _dump(std::ostream&, const Graph*, Dumper&) const; }; // Constructor -inline Taskflow::Taskflow(const std::string& name) : +inline Taskflow::Taskflow(const std::string& name) : FlowBuilder {_graph}, _name {name} { } @@ -169,9 +282,35 @@ inline Taskflow::Taskflow(const std::string& name) : inline Taskflow::Taskflow() : FlowBuilder{_graph} { } +// Move constructor +inline Taskflow::Taskflow(Taskflow&& rhs) : FlowBuilder{_graph} { + + std::scoped_lock lock(rhs._mutex); + + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + + rhs._satellite.reset(); +} + +// Move assignment +inline Taskflow& Taskflow::operator = (Taskflow&& rhs) { + if(this != &rhs) { + std::scoped_lock lock(_mutex, rhs._mutex); + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + rhs._satellite.reset(); + } + return *this; +} + // Procedure: inline void Taskflow::clear() { - _graph.clear(); + _graph._clear(); } // Function: num_tasks @@ -194,6 +333,11 @@ inline const std::string& Taskflow::name() const { return _name; } +// Function: graph +inline Graph& Taskflow::graph() { + return _graph; +} + // Function: for_each_task template void Taskflow::for_each_task(V&& visitor) const { @@ -212,28 +356,40 @@ inline std::string Taskflow::dump() const { // Function: dump inline void Taskflow::dump(std::ostream& os) const { os << "digraph Taskflow {\n"; - _dump(os, this); + _dump(os, &_graph); os << "}\n"; } // Procedure: _dump -inline void Taskflow::_dump(std::ostream& os, const Taskflow* top) const { - +inline void Taskflow::_dump(std::ostream& os, const Graph* top) const { + Dumper dumper; - - dumper.stack.push(top); - dumper.visited.insert(top); + + dumper.id = 0; + dumper.stack.push({nullptr, top}); + dumper.visited[top] = dumper.id++; while(!dumper.stack.empty()) { - - auto f = dumper.stack.top(); + + auto [p, f] = dumper.stack.top(); dumper.stack.pop(); - - os << "subgraph cluster_p" << f << " {\nlabel=\"Taskflow: "; - if(f->_name.empty()) os << 'p' << f; - else os << f->_name; + + os << "subgraph cluster_p" << f << " {\nlabel=\""; + + // n-level module + if(p) { + os << 'm' << dumper.visited[f]; + } + // top-level taskflow graph + else { + os << "Taskflow: "; + if(_name.empty()) os << 'p' << this; + else os << _name; + } + os << "\";\n"; - _dump(os, f->_graph, dumper); + + _dump(os, f, dumper); os << "}\n"; } } @@ -252,60 +408,49 @@ inline void Taskflow::_dump( switch(node->_handle.index()) { case Node::CONDITION: + case Node::MULTI_CONDITION: os << "shape=diamond color=black fillcolor=aquamarine style=filled"; break; - case Node::CUDAFLOW: - os << " style=\"filled\"" - << " color=\"black\" fillcolor=\"purple\"" - << " fontcolor=\"white\"" - << " shape=\"folder\""; - break; - default: break; } os << "];\n"; - + for(size_t s=0; s_successors.size(); ++s) { - if(node->_handle.index() == Node::CONDITION) { + if(node->_is_conditioner()) { // case edge is dashed - os << 'p' << node << " -> p" << node->_successors[s] + os << 'p' << 
node << " -> p" << node->_successors[s] << " [style=dashed label=\"" << s << "\"];\n"; - } - else { + } else { os << 'p' << node << " -> p" << node->_successors[s] << ";\n"; } } - + // subflow join node - if(node->_parent && node->_successors.size() == 0) { + if(node->_parent && node->_parent->_handle.index() == Node::DYNAMIC && + node->_successors.size() == 0 + ) { os << 'p' << node << " -> p" << node->_parent << ";\n"; } + // node info switch(node->_handle.index()) { case Node::DYNAMIC: { - auto& sbg = std::get(node->_handle).subgraph; + auto& sbg = std::get_if(&node->_handle)->subgraph; if(!sbg.empty()) { os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: "; if(node->_name.empty()) os << 'p' << node; else os << node->_name; os << "\";\n" << "color=blue\n"; - _dump(os, sbg, dumper); + _dump(os, &sbg, dumper); os << "}\n"; } } break; - - case Node::CUDAFLOW: { - std::get(node->_handle).graph->dump( - os, node, node->_name - ); - } - break; default: break; @@ -314,10 +459,10 @@ inline void Taskflow::_dump( // Procedure: _dump inline void Taskflow::_dump( - std::ostream& os, const Graph& graph, Dumper& dumper + std::ostream& os, const Graph* graph, Dumper& dumper ) const { - - for(const auto& n : graph._nodes) { + + for(const auto& n : graph->_nodes) { // regular task if(n->_handle.index() != Node::MODULE) { @@ -325,22 +470,20 @@ inline void Taskflow::_dump( } // module task else { - - auto module = std::get(n->_handle).module; + //auto module = &(std::get_if(&n->_handle)->module); + auto module = &(std::get_if(&n->_handle)->graph); os << 'p' << n << "[shape=box3d, color=blue, label=\""; - if(n->_name.empty()) os << n; + if(n->_name.empty()) os << 'p' << n; else os << n->_name; - os << " [Taskflow: "; - if(module->_name.empty()) os << 'p' << module; - else os << module->_name; - os << "]\"];\n"; if(dumper.visited.find(module) == dumper.visited.end()) { - dumper.visited.insert(module); - dumper.stack.push(module); + dumper.visited[module] = dumper.id++; + dumper.stack.push({n, module}); } + os << " [m" << dumper.visited[module] << "]\"];\n"; + for(const auto s : n->_successors) { os << 'p' << n << "->" << 'p' << s << ";\n"; } @@ -355,12 +498,12 @@ inline void Taskflow::_dump( /** @class Future -@brief class to access the result of task execution +@brief class to access the result of an execution tf::Future is a derived class from std::future that will eventually hold the -execution result of a submitted taskflow (e.g., tf::Executor::run) -or an asynchronous task (e.g., tf::Executor::async). -In addition to base methods of std::future, +execution result of a submitted taskflow (tf::Executor::run) +or an asynchronous task (tf::Executor::async, tf::Executor::silent_async). +In addition to the base methods inherited from std::future, you can call tf::Future::cancel to cancel the execution of the running taskflow associated with this future object. 
The following example cancels a submission of a taskflow that contains @@ -371,7 +514,7 @@ tf::Executor executor; tf::Taskflow taskflow; for(int i=0; i<1000; i++) { - taskflow.emplace([](){ + taskflow.emplace([](){ std::this_thread::sleep_for(std::chrono::seconds(1)); }); } @@ -391,17 +534,14 @@ class Future : public std::future { friend class Executor; friend class Subflow; - + friend class Runtime; + using handle_t = std::variant< - std::monostate, std::weak_ptr, std::weak_ptr + std::monostate, std::weak_ptr >; - // variant index - constexpr static auto ASYNC = get_index_v, handle_t>; - constexpr static auto TASKFLOW = get_index_v, handle_t>; - public: - + /** @brief default constructor */ @@ -411,12 +551,12 @@ class Future : public std::future { @brief disabled copy constructor */ Future(const Future&) = delete; - + /** @brief default move constructor */ Future(Future&&) = default; - + /** @brief disabled copy assignment */ @@ -428,16 +568,21 @@ class Future : public std::future { Future& operator = (Future&&) = default; /** - @brief cancels the execution of the running taskflow associated with + @brief cancels the execution of the running taskflow associated with this future object @return @c true if the execution can be cancelled or @c false if the execution has already completed + + When you request a cancellation, the executor will stop scheduling + any tasks onwards. Tasks that are already running will continue to finish + (non-preemptive). + You can call tf::Future::wait to wait for the cancellation to complete. */ bool cancel(); private: - + handle_t _handle; template @@ -462,7 +607,7 @@ bool Future::cancel() { else { auto ptr = arg.lock(); if(ptr) { - ptr->_is_cancelled = true; + ptr->_is_cancelled.store(true, std::memory_order_relaxed); return true; } return false; @@ -472,7 +617,3 @@ bool Future::cancel() { } // end of namespace tf. --------------------------------------------------- - - - - diff --git a/lib/taskflow/core/topology.hpp b/lib/taskflow/core/topology.hpp index a9b8e51..b4d9eab 100644 --- a/lib/taskflow/core/topology.hpp +++ b/lib/taskflow/core/topology.hpp @@ -6,30 +6,25 @@ namespace tf { // class: TopologyBase class TopologyBase { - + friend class Executor; friend class Node; - + template friend class Future; protected: - bool _is_cancelled { false }; + std::atomic _is_cancelled { false }; }; // ---------------------------------------------------------------------------- -// class: AsyncTopology -class AsyncTopology : public TopologyBase { -}; - -// ---------------------------------------------------------------------------- - // class: Topology class Topology : public TopologyBase { - + friend class Executor; + friend class Runtime; public: @@ -42,7 +37,7 @@ class Topology : public TopologyBase { std::promise _promise; - std::vector _sources; + SmallVector _sources; std::function _pred; std::function _call; @@ -52,7 +47,7 @@ class Topology : public TopologyBase { // Constructor template -Topology::Topology(Taskflow& tf, P&& p, C&& c): +Topology::Topology(Taskflow& tf, P&& p, C&& c): _taskflow(tf), _pred {std::forward
(p)}, _call {std::forward(c)} { diff --git a/lib/taskflow/core/tsq.hpp b/lib/taskflow/core/tsq.hpp index 0a13630..e4ea76c 100644 --- a/lib/taskflow/core/tsq.hpp +++ b/lib/taskflow/core/tsq.hpp @@ -1,31 +1,115 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include "../utility/macros.hpp" +#include "../utility/traits.hpp" + +/** +@file tsq.hpp +@brief task queue include file +*/ namespace tf { + +// ---------------------------------------------------------------------------- +// Task Types +// ---------------------------------------------------------------------------- + +/** +@enum TaskPriority + +@brief enumeration of all task priority values + +A priority is an enumerated value of type @c unsigned. +Currently, %Taskflow defines three priority levels, +@c HIGH, @c NORMAL, and @c LOW, starting from 0, 1, to 2. +That is, the lower the value, the higher the priority. + +*/ +enum class TaskPriority : unsigned { + /** @brief value of the highest priority (i.e., 0) */ + HIGH = 0, + /** @brief value of the normal priority (i.e., 1) */ + NORMAL = 1, + /** @brief value of the lowest priority (i.e., 2) */ + LOW = 2, + /** @brief conventional value for iterating priority values */ + MAX = 3 +}; + + + +// ---------------------------------------------------------------------------- +// Task Queue +// ---------------------------------------------------------------------------- + + /** @class: TaskQueue -@tparam T data type (must be a pointer) +@tparam T data type (must be a pointer type) +@tparam TF_MAX_PRIORITY maximum level of the priority -@brief Lock-free unbounded single-producer multiple-consumer queue. +@brief class to create a lock-free unbounded single-producer multiple-consumer queue -This class implements the work stealing queue described in the paper, -"Correct and Efficient Work-Stealing for Weak Memory Models," -available at https://www.di.ens.fr/~zappa/readings/ppopp13.pdf. +This class implements the work-stealing queue described in the paper, +Correct and Efficient Work-Stealing for Weak Memory Models, +and extends it to include priority. Only the queue owner can perform pop and push operations, -while others can steal data from the queue. +while others can steal data from the queue simultaneously. +Priority starts from zero (highest priority) to the template value +`TF_MAX_PRIORITY-1` (lowest priority). +All operations are associated with priority values to indicate +the corresponding queues to which an operation is applied. + +The default template value, `TF_MAX_PRIORITY`, is `TaskPriority::MAX` +which applies only three priority levels to the task queue. + +@code{.cpp} +auto [A, B, C, D, E] = taskflow.emplace( + [] () { }, + [&] () { + std::cout << "Task B: " << counter++ << '\n'; // 0 + }, + [&] () { + std::cout << "Task C: " << counter++ << '\n'; // 2 + }, + [&] () { + std::cout << "Task D: " << counter++ << '\n'; // 1 + }, + [] () { } +); + +A.precede(B, C, D); +E.succeed(B, C, D); + +B.priority(tf::TaskPriority::HIGH); +C.priority(tf::TaskPriority::LOW); +D.priority(tf::TaskPriority::NORMAL); + +executor.run(taskflow).wait(); +@endcode + +In the above example, we have a task graph of five tasks, +@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D +can run in simultaneously when @c A finishes. +Since we only uses one worker thread in the executor, +we can deterministically run @c B first, then @c D, and @c C +in order of their priority values. 
+The output is as follows: + +@code{.shell-session} +Task B: 0 +Task D: 1 +Task C: 2 +@endcode + */ -template +template (TaskPriority::MAX)> class TaskQueue { - + + static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one"); static_assert(std::is_pointer_v, "T must be a pointer type"); struct Array { @@ -34,7 +118,7 @@ class TaskQueue { int64_t M; std::atomic* S; - explicit Array(int64_t c) : + explicit Array(int64_t c) : C {c}, M {c-1}, S {new std::atomic[static_cast(C)]} { @@ -47,10 +131,9 @@ class TaskQueue { int64_t capacity() const noexcept { return C; } - - template - void push(int64_t i, O&& o) noexcept { - S[i & M].store(std::forward(o), std::memory_order_relaxed); + + void push(int64_t i, T o) noexcept { + S[i & M].store(o, std::memory_order_relaxed); } T pop(int64_t i) noexcept { @@ -67,133 +150,208 @@ class TaskQueue { }; - std::atomic _top; - std::atomic _bottom; - std::atomic _array; - std::vector _garbage; + // Doubling the alignment by 2 seems to generate the most + // decent performance. + CachelineAligned> _top[TF_MAX_PRIORITY]; + CachelineAligned> _bottom[TF_MAX_PRIORITY]; + std::atomic _array[TF_MAX_PRIORITY]; + std::vector _garbage[TF_MAX_PRIORITY]; + + //std::atomic _cache {nullptr}; public: - + /** @brief constructs the queue with a given capacity @param capacity the capacity of the queue (must be power of 2) */ - explicit TaskQueue(int64_t capacity = 1024); + explicit TaskQueue(int64_t capacity = 512); /** @brief destructs the queue */ ~TaskQueue(); - + /** @brief queries if the queue is empty at the time of this call */ bool empty() const noexcept; - + + /** + @brief queries if the queue is empty at a specific priority value + */ + bool empty(unsigned priority) const noexcept; + /** @brief queries the number of items at the time of this call */ size_t size() const noexcept; + /** + @brief queries the number of items with the given priority + at the time of this call + */ + size_t size(unsigned priority) const noexcept; + /** @brief queries the capacity of the queue */ int64_t capacity() const noexcept; + /** + @brief queries the capacity of the queue at a specific priority value + */ + int64_t capacity(unsigned priority) const noexcept; + /** @brief inserts an item to the queue - Only the owner thread can insert an item to the queue. - The operation can trigger the queue to resize its capacity + @param item the item to push to the queue + @param priority priority value of the item to push (default = 0) + + Only the owner thread can insert an item to the queue. + The operation can trigger the queue to resize its capacity if more space is required. - - @tparam O data type - - @param item the item to perfect-forward to the queue */ - void push(T item); - + TF_FORCE_INLINE void push(T item, unsigned priority); + /** @brief pops out an item from the queue - Only the owner thread can pop out an item from the queue. - The return can be a nullptr if this operation failed (empty queue). + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). */ T pop(); - + + /** + @brief pops out an item with a specific priority value from the queue + + @param priority priority of the item to pop + + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). + */ + TF_FORCE_INLINE T pop(unsigned priority); + /** @brief steals an item from the queue Any threads can try to steal an item from the queue. 
- The return can be a nullptr if this operation failed (not necessary empty). + The return can be a @c nullptr if this operation failed (not necessary empty). */ T steal(); + + /** + @brief steals an item with a specific priority value from the queue + + @param priority priority of the item to steal + + Any threads can try to steal an item from the queue. + The return can be a @c nullptr if this operation failed (not necessary empty). + */ + T steal(unsigned priority); + + private: + TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t); }; // Constructor -template -TaskQueue::TaskQueue(int64_t c) { +template +TaskQueue::TaskQueue(int64_t c) { assert(c && (!(c & (c-1)))); - _top.store(0, std::memory_order_relaxed); - _bottom.store(0, std::memory_order_relaxed); - _array.store(new Array{c}, std::memory_order_relaxed); - _garbage.reserve(32); + unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ + _top[p].data.store(0, std::memory_order_relaxed); + _bottom[p].data.store(0, std::memory_order_relaxed); + _array[p].store(new Array{c}, std::memory_order_relaxed); + _garbage[p].reserve(32); + }); } // Destructor -template -TaskQueue::~TaskQueue() { - for(auto a : _garbage) { - delete a; +template +TaskQueue::~TaskQueue() { + unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ + for(auto a : _garbage[p]) { + delete a; + } + delete _array[p].load(); + }); +} + +// Function: empty +template +bool TaskQueue::empty() const noexcept { + for(unsigned i=0; i -bool TaskQueue::empty() const noexcept { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_relaxed); - return b <= t; +template +bool TaskQueue::empty(unsigned p) const noexcept { + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); + return (b <= t); +} + +// Function: size +template +size_t TaskQueue::size() const noexcept { + size_t s; + unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ? size(i) + s : size(i); }); + return s; } // Function: size -template -size_t TaskQueue::size() const noexcept { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_relaxed); +template +size_t TaskQueue::size(unsigned p) const noexcept { + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); return static_cast(b >= t ? 
b - t : 0); } // Function: push -template -void TaskQueue::push(T o) { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_acquire); - Array* a = _array.load(std::memory_order_relaxed); +template +TF_FORCE_INLINE void TaskQueue::push(T o, unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_acquire); + Array* a = _array[p].load(std::memory_order_relaxed); // queue is full if(a->capacity() - 1 < (b - t)) { - Array* tmp = a->resize(b, t); - _garbage.push_back(a); - std::swap(a, tmp); - _array.store(a, std::memory_order_relaxed); + a = resize_array(a, p, b, t); } a->push(b, o); std::atomic_thread_fence(std::memory_order_release); - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } // Function: pop -template -T TaskQueue::pop() { - int64_t b = _bottom.load(std::memory_order_relaxed) - 1; - Array* a = _array.load(std::memory_order_relaxed); - _bottom.store(b, std::memory_order_relaxed); +template +T TaskQueue::pop() { + for(unsigned i=0; i +TF_FORCE_INLINE T TaskQueue::pop(unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1; + Array* a = _array[p].load(std::memory_order_relaxed); + _bottom[p].data.store(b, std::memory_order_relaxed); std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t t = _top.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); T item {nullptr}; @@ -201,36 +359,48 @@ T TaskQueue::pop() { item = a->pop(b); if(t == b) { // the last item just got stolen - if(!_top.compare_exchange_strong(t, t+1, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { item = nullptr; } - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } } else { - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } return item; } // Function: steal -template -T TaskQueue::steal() { - int64_t t = _top.load(std::memory_order_acquire); - std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t b = _bottom.load(std::memory_order_acquire); +template +T TaskQueue::steal() { + for(unsigned i=0; i +T TaskQueue::steal(unsigned p) { + int64_t t = _top[p].data.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom[p].data.load(std::memory_order_acquire); + T item {nullptr}; if(t < b) { - Array* a = _array.load(std::memory_order_consume); + Array* a = _array[p].load(std::memory_order_consume); item = a->pop(t); - if(!_top.compare_exchange_strong(t, t+1, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { return nullptr; } } @@ -239,9 +409,33 @@ T TaskQueue::steal() { } // Function: capacity -template -int64_t TaskQueue::capacity() const noexcept { - return _array.load(std::memory_order_relaxed)->capacity(); +template +int64_t TaskQueue::capacity() const noexcept { + size_t s; + unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { + s = i ? 
capacity(i) + s : capacity(i);
+  });
+  return s;
 }

+// Function: capacity
+template <typename T, unsigned TF_MAX_PRIORITY>
+int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity(unsigned p) const noexcept {
+  return _array[p].load(std::memory_order_relaxed)->capacity();
+}
+
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_NO_INLINE typename TaskQueue<T, TF_MAX_PRIORITY>::Array*
+TaskQueue<T, TF_MAX_PRIORITY>::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) {
+
+  Array* tmp = a->resize(b, t);
+  _garbage[p].push_back(a);
+  std::swap(a, tmp);
+  _array[p].store(a, std::memory_order_release);
+  // Note: the relaxed order used in the original paper causes t-san to complain
+  //_array.store(a, std::memory_order_relaxed);
+  return a;
+}
+
 } // end of namespace tf -----------------------------------------------------

diff --git a/lib/taskflow/core/worker.hpp b/lib/taskflow/core/worker.hpp
index 61b7bc8..47fcf81 100644
--- a/lib/taskflow/core/worker.hpp
+++ b/lib/taskflow/core/worker.hpp
@@ -4,31 +4,100 @@
 #include "tsq.hpp"
 #include "notifier.hpp"

-/**
+/**
 @file worker.hpp
 @brief worker include file
 */

 namespace tf {

+// ----------------------------------------------------------------------------
+// Class Definition: Worker
+// ----------------------------------------------------------------------------
+
 /**
-@private
+@class Worker
+
+@brief class to create a worker in an executor
+
+The class is primarily used by the executor to perform the work-stealing
+algorithm. Users can access a worker object and alter its properties
+(e.g., changing the thread affinity in a POSIX-like system)
+using tf::WorkerInterface.
 */
-struct Worker {
+class Worker {

   friend class Executor;
   friend class WorkerView;

+  public:
+
+  /**
+  @brief queries the worker id associated with its parent executor
+
+  A worker id is an unsigned integer in the range [0, N),
+  where @c N is the number of workers spawned at the construction
+  time of the executor.
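+
+  As a sketch (assuming the executor API of this release), a task can map
+  its calling thread back to a worker id through tf::Executor::this_worker_id,
+  which returns -1 if the caller is not a worker of that executor:
+
+  @code{.cpp}
+  tf::Executor executor(4);
+  executor.silent_async([&](){
+    // runs on a worker thread, so the printed id lies in [0, 4)
+    std::cout << executor.this_worker_id() << '\n';
+  });
+  executor.wait_for_all();
+  @endcode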
+ */ + inline size_t id() const { return _id; } + + /** + @brief acquires a pointer access to the underlying thread + */ + inline std::thread* thread() const { return _thread; } + + /** + @brief queries the size of the queue (i.e., number of enqueued tasks to + run) associated with the worker + */ + inline size_t queue_size() const { return _wsq.size(); } + + /** + @brief queries the current capacity of the queue + */ + inline size_t queue_capacity() const { return static_cast(_wsq.capacity()); } + private: size_t _id; size_t _vtm; Executor* _executor; + std::thread* _thread; Notifier::Waiter* _waiter; std::default_random_engine _rdgen { std::random_device{}() }; TaskQueue _wsq; + Node* _cache; }; +// ---------------------------------------------------------------------------- +// Class Definition: PerThreadWorker +// ---------------------------------------------------------------------------- + +/** +@private +*/ +//struct PerThreadWorker { +// +// Worker* worker; +// +// PerThreadWorker() : worker {nullptr} {} +// +// PerThreadWorker(const PerThreadWorker&) = delete; +// PerThreadWorker(PerThreadWorker&&) = delete; +// +// PerThreadWorker& operator = (const PerThreadWorker&) = delete; +// PerThreadWorker& operator = (PerThreadWorker&&) = delete; +//}; + +/** +@private +*/ +//inline PerThreadWorker& this_worker() { +// thread_local PerThreadWorker worker; +// return worker; +//} + + // ---------------------------------------------------------------------------- // Class Definition: WorkerView // ---------------------------------------------------------------------------- @@ -44,22 +113,22 @@ when a worker runs a task, and the view object is only accessible from an observer derived from tf::ObserverInterface. */ class WorkerView { - + friend class Executor; - + public: - + /** - @brief queries the worker id associated with the executor + @brief queries the worker id associated with its parent executor A worker id is a unsigned integer in the range [0, N), where @c N is the number of workers spawned at the construction time of the executor. */ size_t id() const; - + /** - @brief queries the size of the queue (i.e., number of pending tasks to + @brief queries the size of the queue (i.e., number of pending tasks to run) associated with the worker */ size_t queue_size() const; @@ -98,6 +167,94 @@ inline size_t WorkerView::queue_capacity() const { } +// ---------------------------------------------------------------------------- +// Class Definition: WorkerInterface +// ---------------------------------------------------------------------------- + +/** +@class WorkerInterface + +@brief class to configure worker behavior in an executor + +The tf::WorkerInterface class lets users interact with the executor +to customize the worker behavior, +such as calling custom methods before and after a worker enters and leaves +the loop. +When you create an executor, it spawns a set of workers to run tasks. +The interaction between the executor and its spawned workers looks like +the following: + +for(size_t n=0; n +std::shared_ptr make_worker_interface(ArgsT&&... 
args) { + static_assert( + std::is_base_of_v, + "T must be derived from WorkerInterface" + ); + return std::make_shared(std::forward(args)...); +} + } // end of namespact tf ----------------------------------------------------- diff --git a/lib/taskflow/taskflow.hpp b/lib/taskflow/taskflow.hpp index c815c23..38ac741 100644 --- a/lib/taskflow/taskflow.hpp +++ b/lib/taskflow/taskflow.hpp @@ -1,26 +1,27 @@ #pragma once #include "core/executor.hpp" -#include "core/algorithm/critical.hpp" -#include "core/algorithm/for_each.hpp" -#include "core/algorithm/reduce.hpp" -#include "core/algorithm/sort.hpp" +#include "core/async.hpp" +#include "algorithm/critical.hpp" - -/** @dir taskflow +/** +@dir taskflow @brief root taskflow include dir */ -/** @dir taskflow/core +/** +@dir taskflow/core @brief taskflow core include dir */ -/** @dir taskflow/cuda -@brief taskflow CUDA include dir +/** +@dir taskflow/algorithm +@brief taskflow algorithms include dir */ -/** @dir taskflow/cuda/cublas -@brief taskflow cuBLAS include dir +/** +@dir taskflow/cuda +@brief taskflow CUDA include dir */ /** @@ -32,8 +33,8 @@ // TF_VERSION / 100 % 1000 is the minor version // TF_VERSION / 100000 is the major version -// current version: 3.1.0 -#define TF_VERSION 300100 +// current version: 3.6.0 +#define TF_VERSION 300600 #define TF_MAJOR_VERSION TF_VERSION/100000 #define TF_MINOR_VERSION TF_VERSION/100%1000 @@ -44,11 +45,19 @@ */ namespace tf { +/** +@private +*/ +namespace detail { } + + /** @brief queries the version information in a string format @c major.minor.patch + +Release notes are available here: https://taskflow.github.io/taskflow/Releases.html */ constexpr const char* version() { - return "3.1.0"; + return "3.6.0"; } diff --git a/lib/taskflow/utility/iterator.hpp b/lib/taskflow/utility/iterator.hpp index e2aa5b5..6441391 100644 --- a/lib/taskflow/utility/iterator.hpp +++ b/lib/taskflow/utility/iterator.hpp @@ -5,18 +5,18 @@ namespace tf { -template -constexpr std::enable_if_t>::value, size_t> -distance(T beg, T end, T step) { - return (end - beg + step + (step > 0 ? -1 : 1)) / step; -} - template constexpr std::enable_if_t>::value, bool> is_range_invalid(T beg, T end, T step) { - return ((step == 0 && beg != end) || - (beg < end && step <= 0) || + return ((step == 0 && beg != end) || + (beg < end && step <= 0) || (beg > end && step >= 0)); } +template +constexpr std::enable_if_t>::value, size_t> +distance(T beg, T end, T step) { + return (end - beg + step + (step > 0 ? 
-1 : 1)) / step; +} + } // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/utility/macros.hpp b/lib/taskflow/utility/macros.hpp new file mode 100644 index 0000000..e7598cf --- /dev/null +++ b/lib/taskflow/utility/macros.hpp @@ -0,0 +1,17 @@ +#pragma once + +#if defined(_MSC_VER) + #define TF_FORCE_INLINE __forceinline +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_FORCE_INLINE __attribute__((__always_inline__)) inline +#else + #define TF_FORCE_INLINE inline +#endif + +#if defined(_MSC_VER) + #define TF_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_NO_INLINE __attribute__((__noinline__)) +#else + #define TF_NO_INLINE +#endif diff --git a/lib/taskflow/utility/math.hpp b/lib/taskflow/utility/math.hpp index b195b23..f80053e 100644 --- a/lib/taskflow/utility/math.hpp +++ b/lib/taskflow/utility/math.hpp @@ -42,7 +42,7 @@ template >, void>* = nullptr > constexpr bool is_pow2(const T& x) { - return x && (!(x&(x-1))); + return x && (!(x&(x-1))); } //// finds the ceil of x divided by b @@ -77,7 +77,7 @@ RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) { } /** -@brief finds the pseudo median of a range of items using spreaded +@brief finds the pseudo median of a range of items using spreaded nine numbers */ template @@ -121,6 +121,30 @@ T unique_id() { return counter.fetch_add(1, std::memory_order_relaxed); } +/** +@brief updates an atomic variable with a maximum value +*/ +template +inline void atomic_max(std::atomic& v, const T& max_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev < max_v && + !v.compare_exchange_weak(prev, max_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + +/** +@brief updates an atomic variable with a minimum value +*/ +template +inline void atomic_min(std::atomic& v, const T& min_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev > min_v && + !v.compare_exchange_weak(prev, min_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + } // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/utility/object_pool.hpp b/lib/taskflow/utility/object_pool.hpp index a90478b..34d60fb 100644 --- a/lib/taskflow/utility/object_pool.hpp +++ b/lib/taskflow/utility/object_pool.hpp @@ -3,7 +3,7 @@ // // 2020/02/02 - modified by Tsung-Wei Huang // - new implementation motivated by Hoard -// +// // 2019/07/10 - modified by Tsung-Wei Huang // - replace raw pointer with smart pointer // @@ -28,7 +28,7 @@ namespace tf { // Class: ObjectPool // // The class implements an efficient thread-safe object pool motivated -// by the Hoard memory allocator algorithm. +// by the Hoard memory allocator algorithm. // Different from the normal memory allocator, object pool allocates // only one object at a time. 
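//
// A minimal usage sketch (illustrative only; `Foo` is a hypothetical type
// that embeds the `_object_pool_block` pointer recycle() relies on):
//
//   tf::ObjectPool<Foo> pool;
//   Foo* obj = pool.animate(/* Foo constructor arguments */);
//   pool.recycle(obj);  // runs ~Foo() and reclaims its slot
//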
// @@ -44,13 +44,13 @@ namespace tf { // M = 30 // F = 4 // W = (30+4-1)/4 = 8 -// +// // b0: 0, 1, 2, 3, 4, 5, 6, 7 // b1: 8, 9, 10, 11, 12, 13, 14, 15 // b2: 16, 17, 18, 19, 20, 21, 22, 23 // b3: 24, 25, 26, 27, 28, 29 // b4: 30 (anything equal to M) -// +// // Example scenario 2: // M = 32 // F = 4 @@ -62,14 +62,14 @@ namespace tf { // b4: 32 (anything equal to M) // template -class ObjectPool { - - // the data column must be sufficient to hold the pointer in freelist +class ObjectPool { + + // the data column must be sufficient to hold the pointer in freelist constexpr static size_t X = (std::max)(sizeof(T*), sizeof(T)); //constexpr static size_t X = sizeof(long double) + std::max(sizeof(T*), sizeof(T)); //constexpr static size_t M = (S - offsetof(Block, data)) / X; constexpr static size_t M = S / X; - constexpr static size_t F = 4; + constexpr static size_t F = 4; constexpr static size_t B = F + 1; constexpr static size_t W = (M + F - 1) / F; constexpr static size_t K = 4; @@ -81,7 +81,7 @@ class ObjectPool { static_assert( M >= 128, "block size S must be larger enough to pool at least 128 objects" ); - + struct Blocklist { Blocklist* prev; Blocklist* next; @@ -100,7 +100,7 @@ class ObjectPool { }; struct Block { - LocalHeap* heap; + std::atomic heap; Blocklist list_node; size_t i; size_t u; @@ -110,7 +110,7 @@ class ObjectPool { }; public: - + /** @brief constructs an object pool from a number of anticipated threads */ @@ -120,18 +120,18 @@ class ObjectPool { @brief destructs the object pool */ ~ObjectPool(); - + /** @brief acquires a pointer to a object constructed from a given argument list */ template T* animate(ArgsT&&... args); - + /** @brief recycles a object pointed by @c ptr and destroys it */ void recycle(T* ptr); - + size_t num_bins_per_local_heap() const; size_t num_objects_per_bin() const; size_t num_objects_per_block() const; @@ -141,7 +141,7 @@ class ObjectPool { size_t num_local_heaps() const; size_t num_global_heaps() const; size_t num_heaps() const; - + float emptiness_threshold() const; private: @@ -158,7 +158,7 @@ class ObjectPool { template constexpr size_t _offset_in_class(const Q P::*member) const; - + template constexpr P* _parent_class_of(Q*, const Q P::*member); @@ -194,7 +194,7 @@ class ObjectPool { void _for_each_block(Blocklist*, C&&); }; - + // ---------------------------------------------------------------------------- // ObjectPool definition // ---------------------------------------------------------------------------- @@ -224,18 +224,20 @@ ObjectPool::~ObjectPool() { // clear local heaps for(auto& h : _lheaps) { for(size_t i=0; i size_t ObjectPool::num_bins_per_local_heap() const { @@ -281,11 +283,11 @@ size_t ObjectPool::num_heaps() const { // Function: capacity template size_t ObjectPool::capacity() const { - + size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { n += M; }; @@ -302,9 +304,9 @@ template size_t ObjectPool::num_available_objects() const { size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { n += (M - _block_of(p)->u); }; @@ -318,11 +320,11 @@ size_t ObjectPool::num_available_objects() const { // Function: num_allocated_objects template size_t ObjectPool::num_allocated_objects() const { - + size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; 
p=p->next) { n += _block_of(p)->u; }; @@ -368,14 +370,14 @@ constexpr P* ObjectPool::_parent_class_of( // Function: _block_of template -constexpr typename ObjectPool::Block* +constexpr typename ObjectPool::Block* ObjectPool::_block_of(Blocklist* list) { return _parent_class_of(list, &Block::list_node); } // Function: _block_of template -constexpr typename ObjectPool::Block* +constexpr typename ObjectPool::Block* ObjectPool::_block_of(const Blocklist* list) const { return _parent_class_of(list, &Block::list_node); } @@ -389,7 +391,7 @@ void ObjectPool::_blocklist_init_head(Blocklist *list) { // Procedure: _blocklist_add_impl // Insert a new entry between two known consecutive entries. -// +// // This is only for internal list manipulation where we know // the prev/next entries already! template @@ -405,10 +407,10 @@ void ObjectPool::_blocklist_add_impl( // list_push_front - add a new entry // @curr: curr entry to be added // @head: list head to add it after -// +// // Insert a new entry after the specified head. // This is good for implementing stacks. -// +// template void ObjectPool::_blocklist_push_front( Blocklist *curr, Blocklist *head @@ -419,10 +421,10 @@ void ObjectPool::_blocklist_push_front( // list_add_tail - add a new entry // @curr: curr entry to be added // @head: list head to add it before -// +// // Insert a new entry before the specified head. // This is useful for implementing queues. -// +// template void ObjectPool::_blocklist_push_back( Blocklist *curr, Blocklist *head @@ -432,10 +434,10 @@ void ObjectPool::_blocklist_push_back( // Delete a list entry by making the prev/next entries // point to each other. -// +// // This is only for internal list manipulation where we know // the prev/next entries already! -// +// template void ObjectPool::_blocklist_del_impl( Blocklist * prev, Blocklist * next @@ -458,7 +460,7 @@ void ObjectPool::_blocklist_del(Blocklist *entry) { // list_replace - replace old entry by new one // @old : the element to be replaced // @curr : the new element to insert -// +// // If @old was empty, it will be overwritten. template void ObjectPool::_blocklist_replace( @@ -537,7 +539,7 @@ void ObjectPool::_for_each_block(Blocklist* head, C&& c) { c(_block_of(p)); } } - + // Procedure: _for_each_block_safe // Iterate each item of a list - safe to free template @@ -577,15 +579,15 @@ template T* ObjectPool::animate(ArgsT&&... args) { //std::cout << "construct a new item\n"; - + // my logically mapped heap - LocalHeap& h = _this_heap(); - + LocalHeap& h = _this_heap(); + Block* s {nullptr}; h.mutex.lock(); - - // scan the list of superblocks from most full to least + + // scan the list of superblocks from the most full to the least full int f = static_cast(F-1); for(; f>=0; f--) { if(!_blocklist_is_empty(&h.lists[f])) { @@ -593,16 +595,16 @@ T* ObjectPool::animate(ArgsT&&... args) { break; } } - + // no superblock found if(f == -1) { // check heap 0 for a superblock _gheap.mutex.lock(); if(!_blocklist_is_empty(&_gheap.list)) { - + s = _block_of(_gheap.list.next); - + //printf("get a superblock from global heap %lu\n", s->u); assert(s->u < M && s->heap == nullptr); f = static_cast(_bin(s->u + 1)); @@ -620,7 +622,8 @@ T* ObjectPool::animate(ArgsT&&... args) { //printf("create a new superblock\n"); _gheap.mutex.unlock(); f = 0; - s = static_cast(std::malloc(sizeof(Block))); + //s = static_cast(std::malloc(sizeof(Block))); + s = new Block(); if(s == nullptr) { throw std::bad_alloc(); @@ -636,7 +639,7 @@ T* ObjectPool::animate(ArgsT&&... 
args) { h.a = h.a + M; } } - + // the superblock must have at least one space //assert(s->u < M); //printf("%lu %lu %lu\n", h.u, h.a, s->u); @@ -647,9 +650,9 @@ T* ObjectPool::animate(ArgsT&&... args) { // take one item from the superblock T* mem = _allocate(s); - + int b = static_cast(_bin(s->u)); - + if(b != f) { //printf("move superblock from list[%d] to list[%d]\n", f, b); _blocklist_move_front(&s->list_node, &h.lists[b]); @@ -670,7 +673,7 @@ T* ObjectPool::animate(ArgsT&&... args) { return mem; } - + // Function: destruct template void ObjectPool::recycle(T* mem) { @@ -684,7 +687,7 @@ void ObjectPool::recycle(T* mem) { Block* s = static_cast(mem->_object_pool_block); mem->~T(); - + //printf("deallocate %p (s=%p) M=%lu W=%lu X=%lu\n", mem, s, M, W, X); // here we need a loop because when we lock the heap, @@ -692,8 +695,8 @@ void ObjectPool::recycle(T* mem) { bool sync = false; do { - auto h = s->heap; - + LocalHeap* h = s->heap.load(std::memory_order_relaxed); + // the block is in global heap if(h == nullptr) { std::lock_guard glock(_gheap.mutex); @@ -739,14 +742,14 @@ void ObjectPool::recycle(T* mem) { } } } while(!sync); - + //std::cout << "s.i " << s->i << '\n' // << "s.u " << s->u << '\n'; } - + // Function: _this_heap template -typename ObjectPool::LocalHeap& +typename ObjectPool::LocalHeap& ObjectPool::_this_heap() { // here we don't use thread local since object pool might be // created and destroyed multiple times @@ -760,16 +763,16 @@ ObjectPool::_this_heap() { // Function: _next_pow2 template -constexpr unsigned ObjectPool::_next_pow2(unsigned n) const { +constexpr unsigned ObjectPool::_next_pow2(unsigned n) const { if(n == 0) return 1; - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n++; - return n; -} + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n++; + return n; +} } // end namespace tf -------------------------------------------------------- diff --git a/lib/taskflow/utility/os.hpp b/lib/taskflow/utility/os.hpp index 433f6d8..23ac301 100644 --- a/lib/taskflow/utility/os.hpp +++ b/lib/taskflow/utility/os.hpp @@ -14,7 +14,7 @@ #define TF_OS_CNK 0 #define TF_OS_HURD 0 #define TF_OS_SOLARIS 0 -#define TF_OS_UNIX 0 /* disjunction of TF_OS_LINUX, TF_OS_DARWIN etc. */ +#define TF_OS_UNIX 0 #ifdef _WIN32 #undef TF_OS_WINDOWS @@ -80,7 +80,7 @@ TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD + \ TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_HURD + \ TF_OS_SOLARIS) -#error Unknown OS +#define TF_OS_UNKNOWN 1 #endif #if TF_OS_LINUX || TF_OS_DRAGONFLY || TF_OS_FREEBSD || TF_OS_NETBSD || \ @@ -89,14 +89,60 @@ #define TF_OS_UNIX 1 #endif + +//----------------------------------------------------------------------------- +// Cache line alignment +//----------------------------------------------------------------------------- +#if defined(__i386__) || defined(__x86_64__) + #define TF_CACHELINE_SIZE 64 +#elif defined(__powerpc64__) + // TODO + // This is the L1 D-cache line size of our Power7 machines. + // Need to check if this is appropriate for other PowerPC64 systems. + #define TF_CACHELINE_SIZE 128 +#elif defined(__arm__) + // Cache line sizes for ARM: These values are not strictly correct since + // cache line sizes depend on implementations, not architectures. + // There are even implementations with cache line sizes configurable + // at boot time. 
+ #if defined(__ARM_ARCH_5T__) + #define TF_CACHELINE_SIZE 32 + #elif defined(__ARM_ARCH_7A__) + #define TF_CACHELINE_SIZE 64 + #endif +#endif + +#ifndef TF_CACHELINE_SIZE +// A reasonable default guess. Note that overestimates tend to waste more +// space, while underestimates tend to waste more time. + #define TF_CACHELINE_SIZE 64 +#endif + + + +//----------------------------------------------------------------------------- +// pause +//----------------------------------------------------------------------------- +//#if __has_include () +// #define TF_HAS_MM_PAUSE 1 +// #include +//#endif + namespace tf { +// Struct: CachelineAligned +// Due to prefetch, we typically do 2x cacheline for the alignment. +template +struct CachelineAligned { + alignas (2*TF_CACHELINE_SIZE) T data; +}; + // Function: get_env inline std::string get_env(const std::string& str) { #ifdef _MSC_VER char *ptr = nullptr; size_t len = 0; - + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { std::string res(ptr, len); std::free(ptr); @@ -115,7 +161,7 @@ inline bool has_env(const std::string& str) { #ifdef _MSC_VER char *ptr = nullptr; size_t len = 0; - + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { std::string res(ptr, len); std::free(ptr); @@ -129,8 +175,12 @@ inline bool has_env(const std::string& str) { #endif } -// ---------------------------------------------------------------------------- - +// Procedure: relax_cpu +//inline void relax_cpu() { +//#ifdef TF_HAS_MM_PAUSE +// _mm_pause(); +//#endif +//} diff --git a/lib/taskflow/utility/serializer.hpp b/lib/taskflow/utility/serializer.hpp index 387ef43..aab00f2 100644 --- a/lib/taskflow/utility/serializer.hpp +++ b/lib/taskflow/utility/serializer.hpp @@ -1,6 +1,30 @@ #pragma once -#include "traits.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace tf { @@ -9,172 +33,172 @@ namespace tf { // ---------------------------------------------------------------------------- // std::basic_string -template +template struct is_std_basic_string : std::false_type {}; -template +template struct is_std_basic_string > : std::true_type {}; -template +template constexpr bool is_std_basic_string_v = is_std_basic_string::value; // std::array -template +template struct is_std_array : std::false_type {}; -template +template struct is_std_array > : std::true_type {}; -template +template constexpr bool is_std_array_v = is_std_array::value; // std::vector -template +template struct is_std_vector : std::false_type {}; -template +template struct is_std_vector > : std::true_type {}; -template +template constexpr bool is_std_vector_v = is_std_vector::value; // std::deque -template +template struct is_std_deque : std::false_type {}; -template +template struct is_std_deque > : std::true_type {}; -template +template constexpr bool is_std_deque_v = is_std_deque::value; // std::list -template +template struct is_std_list : std::false_type {}; -template +template struct is_std_list > : std::true_type {}; -template +template constexpr bool is_std_list_v = is_std_list::value; // std::forward_list -template +template struct is_std_forward_list : std::false_type {}; -template +template struct is_std_forward_list > : std::true_type {}; -template +template constexpr bool is_std_forward_list_v = is_std_forward_list::value; // std::map -template +template struct 
is_std_map : std::false_type {}; -template +template struct is_std_map > : std::true_type {}; -template +template constexpr bool is_std_map_v = is_std_map::value; // std::unordered_map -template +template struct is_std_unordered_map : std::false_type {}; -template +template struct is_std_unordered_map > : std::true_type {}; -template +template constexpr bool is_std_unordered_map_v = is_std_unordered_map::value; // std::set -template +template struct is_std_set : std::false_type {}; -template +template struct is_std_set > : std::true_type {}; -template +template constexpr bool is_std_set_v = is_std_set::value; // std::unordered_set -template +template struct is_std_unordered_set : std::false_type {}; -template +template struct is_std_unordered_set > : std::true_type {}; -template +template constexpr bool is_std_unordered_set_v = is_std_unordered_set::value; // std::variant -template +template struct is_std_variant : std::false_type {}; -template +template struct is_std_variant > : std::true_type {}; -template +template constexpr bool is_std_variant_v = is_std_variant::value; // std::optional -template +template struct is_std_optional : std::false_type {}; -template +template struct is_std_optional > : std::true_type {}; -template +template constexpr bool is_std_optional_v = is_std_optional::value; // std::unique_ptr -template +template struct is_std_unique_ptr : std::false_type {}; -template +template struct is_std_unique_ptr > : std::true_type {}; -template +template constexpr bool is_std_unique_ptr_v = is_std_unique_ptr::value; // std::shared_ptr -template +template struct is_std_shared_ptr : std::false_type {}; -template +template struct is_std_shared_ptr > : std::true_type {}; -template +template constexpr bool is_std_shared_ptr_v = is_std_shared_ptr::value; // std::duration template struct is_std_duration : std::false_type {}; -template +template struct is_std_duration> : std::true_type {}; -template +template constexpr bool is_std_duration_v = is_std_duration::value; // std::time_point -template +template struct is_std_time_point : std::false_type {}; -template +template struct is_std_time_point> : std::true_type {}; -template +template constexpr bool is_std_time_point_v = is_std_time_point::value; // std::tuple -template +template struct is_std_tuple : std::false_type {}; -template +template struct is_std_tuple> : std::true_type {}; -template +template constexpr bool is_std_tuple_v = is_std_tuple::value; //----------------------------------------------------------------------------- @@ -182,7 +206,7 @@ constexpr bool is_std_tuple_v = is_std_tuple::value; //----------------------------------------------------------------------------- // ExtractType: forward declaration -template +template struct ExtractType; // ExtractType_t: alias interface @@ -211,23 +235,23 @@ struct ExtractType > : ExtractType> { // ---------------------------------------------------------------------------- // Struct: SizeTag -// Class that wraps a given size item which can be customized. +// Class that wraps a given size item which can be customized. 
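//
// A sketch of the intended call pattern (make_size_tag is defined below;
// `ar` stands for a Serializer archive):
//
//   std::vector<int> v = {1, 2, 3};
//   ar(tf::make_size_tag(v.size()));  // archives the size ahead of the items
//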
template class SizeTag { - public: - + public: + using type = std::conditional_t, T, std::decay_t>; - + SizeTag(T&& item) : _item(std::forward(item)) {} - + SizeTag& operator = (const SizeTag&) = delete; inline const T& get() const {return _item;} template auto save(ArchiverT & ar) const { return ar(_item); } - + template auto load(ArchiverT & ar) { return ar(_item); } @@ -249,9 +273,9 @@ SizeTag make_size_tag(T&& t) { // Class: MapItem template class MapItem { - + public: - + using KeyType = std::conditional_t , KeyT, std::decay_t>; using ValueType = std::conditional_t , ValueT, std::decay_t>; @@ -263,7 +287,7 @@ class MapItem { template auto save(ArchiverT & ar) const { return ar(_key, _value); } - + template auto load(ArchiverT & ar) { return ar(_key, _value); } @@ -284,7 +308,7 @@ MapItem make_kv_pair(KeyT&& k, ValueT&& v) { // ---------------------------------------------------------------------------- template -constexpr auto is_default_serializable_v = +constexpr auto is_default_serializable_v = ( std::is_arithmetic_v || std::is_enum_v || is_std_basic_string_v || @@ -301,166 +325,169 @@ constexpr auto is_default_serializable_v = is_std_variant_v || is_std_optional_v || is_std_tuple_v || - is_std_array_v; + is_std_array_v +); // Class: Serializer -template +template class Serializer { public: - - Serializer(Device& device); - + + Serializer(Stream& stream); + template SizeType operator()(T&&... items); - + private: - Device& _device; - - template >, void>* = nullptr + > + SizeType _save(T&&); + + template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template > || - is_std_list_v>, + is_std_list_v>, void >* = nullptr > SizeType _save(T&&); - - template >, + is_std_forward_list_v>, void >* = nullptr > SizeType _save(T&&); - - template > || - is_std_unordered_map_v>, + is_std_unordered_map_v>, void >* = nullptr > SizeType _save(T&&); - - template > || - is_std_unordered_set_v>, + is_std_unordered_set_v>, void >* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr - > - SizeType _save(T&&); + + }; // Constructor -template -Serializer::Serializer(Device& device) : _device(device) { +template +Serializer::Serializer(Stream& stream) : _stream(stream) { } // Operator () -template +template template -SizeType Serializer::operator() (T&&... items) { +SizeType Serializer::operator() (T&&... 
items) { return (_save(std::forward(items)) + ...); } // arithmetic data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { - _device.write(reinterpret_cast(std::addressof(t)), sizeof(t)); +SizeType Serializer::_save(T&& t) { + _stream.write(reinterpret_cast(std::addressof(t)), sizeof(t)); return sizeof(t); } // std::basic_string -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; auto sz = _save(make_size_tag(t.size())); - _device.write( - reinterpret_cast(t.data()), + _stream.write( + reinterpret_cast(t.data()), t.size()*sizeof(typename U::value_type) ); return sz + t.size()*sizeof(typename U::value_type); } // std::vector -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; - + auto sz = _save(make_size_tag(t.size())); if constexpr (std::is_arithmetic_v) { - _device.write( - reinterpret_cast(t.data()), + _stream.write( + reinterpret_cast(t.data()), t.size() * sizeof(typename U::value_type) ); sz += t.size() * sizeof(typename U::value_type); @@ -474,12 +501,12 @@ SizeType Serializer::_save(T&& t) { } // std::list and std::deque -template -template +template > || is_std_list_v>, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& item : t) { sz += _save(item); @@ -488,11 +515,11 @@ SizeType Serializer::_save(T&& t) { } // std::forward_list -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(std::distance(t.begin(), t.end()))); for(auto&& item : t) { sz += _save(item); @@ -501,13 +528,13 @@ SizeType Serializer::_save(T&& t) { } // std::map and std::unordered_map -template +template template > || - is_std_unordered_map_v>, + is_std_unordered_map_v>, void >*> -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& [k, v] : t) { sz += _save(make_kv_pair(k, v)); @@ -516,13 +543,13 @@ SizeType Serializer::_save(T&& t) { } // std::set and std::unordered_set -template +template template > || - is_std_unordered_set_v>, + is_std_unordered_set_v>, void >*> -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& item : t) { sz += _save(item); @@ -531,39 +558,39 @@ SizeType Serializer::_save(T&& t) { } // enum data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; return _save(static_cast>(t)); } // duration data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return _save(t.count()); } // time point data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return _save(t.time_since_epoch()); } // optional data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { if(bool flag = t.has_value(); flag) { return _save(flag) + _save(*t); } @@ -573,35 +600,35 @@ SizeType Serializer::_save(T&& t) { } // variant type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { - return _save(t.index()) + +SizeType Serializer::_save(T&& t) { + return _save(t.index()) + 
std::visit([&] (auto&& arg){ return _save(arg);}, t); } // tuple type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return std::apply( [&] (auto&&... args) { - return (_save(std::forward(args)) + ... + 0); + return (_save(std::forward(args)) + ... + 0); }, std::forward(t) ); } // array -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; @@ -610,9 +637,9 @@ SizeType Serializer::_save(T&& t) { SizeType sz; if constexpr(std::is_arithmetic_v) { - _device.write(reinterpret_cast(t.data()), sizeof(t)); + _stream.write(reinterpret_cast(t.data()), sizeof(t)); sz = sizeof(t); - } + } else { sz = 0; for(auto&& item : t) { @@ -623,12 +650,12 @@ SizeType Serializer::_save(T&& t) { return sz; } -// custom save method -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return t.save(*this); } @@ -637,7 +664,7 @@ SizeType Serializer::_save(T&& t) { // ---------------------------------------------------------------------------- template -constexpr auto is_default_deserializable_v = +constexpr auto is_default_deserializable_v = std::is_arithmetic_v || std::is_enum_v || is_std_basic_string_v || @@ -657,199 +684,199 @@ constexpr auto is_default_deserializable_v = is_std_array_v; // Class: Deserializer -template +template class Deserializer { public: - - Deserializer(Device& device); - + + Deserializer(Stream& stream); + template SizeType operator()(T&&... items); - + private: - Device& _device; - + Stream& _stream; + // Function: _variant_helper template < - size_t I = 0, typename... ArgsT, + size_t I = 0, typename... ArgsT, std::enable_if_t* = nullptr > SizeType _variant_helper(size_t, std::variant&); - + // Function: _variant_helper template < - size_t I = 0, typename... ArgsT, + size_t I = 0, typename... ArgsT, std::enable_if_t* = nullptr > SizeType _variant_helper(size_t, std::variant&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template > || is_std_list_v> || - is_std_forward_list_v>, + is_std_forward_list_v>, void >* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); }; // Constructor -template -Deserializer::Deserializer(Device& device) : _device(device) { +template +Deserializer::Deserializer(Stream& stream) : _stream(stream) { } // Operator () -template +template template -SizeType Deserializer::operator() (T&&... items) { +SizeType Deserializer::operator() (T&&... 
items) { return (_load(std::forward(items)) + ...); } // Function: _variant_helper -template +template template *> -SizeType Deserializer::_variant_helper(size_t, std::variant&) { +SizeType Deserializer::_variant_helper(size_t, std::variant&) { return 0; } // Function: _variant_helper -template +template template *> -SizeType Deserializer::_variant_helper(size_t i, std::variant& v) { +SizeType Deserializer::_variant_helper(size_t i, std::variant& v) { if(i == 0) { using type = ExtractType_t>; if(v.index() != I) { static_assert( - std::is_default_constructible::value, + std::is_default_constructible::value, "Failed to archive variant (type should be default constructible T())" ); v = type(); } - return _load(std::get(v)); + return _load(*std::get_if(&v)); } return _variant_helper(i-1, v); } // arithmetic data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - _device.read(reinterpret_cast(std::addressof(t)), sizeof(t)); +SizeType Deserializer::_load(T&& t) { + _stream.read(reinterpret_cast(std::addressof(t)), sizeof(t)); return sizeof(t); } // std::basic_string -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::size_type num_chars; auto sz = _load(make_size_tag(num_chars)); t.resize(num_chars); - _device.read(reinterpret_cast(t.data()), num_chars*sizeof(typename U::value_type)); + _stream.read(reinterpret_cast(t.data()), num_chars*sizeof(typename U::value_type)); return sz + num_chars*sizeof(typename U::value_type); } // std::vector -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; - + typename U::size_type num_data; - + auto sz = _load(make_size_tag(num_data)); if constexpr(std::is_arithmetic_v) { t.resize(num_data); - _device.read(reinterpret_cast(t.data()), num_data * sizeof(typename U::value_type)); + _stream.read(reinterpret_cast(t.data()), num_data * sizeof(typename U::value_type)); sz += num_data * sizeof(typename U::value_type); - } + } else { t.resize(num_data); for(auto && v : t) { @@ -860,15 +887,15 @@ SizeType Deserializer::_load(T&& t) { } // std::list and std::deque -template -template +template > || is_std_list_v> || is_std_forward_list_v>, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; - + typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); @@ -879,21 +906,21 @@ SizeType Deserializer::_load(T&& t) { return sz; } -// std::map -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); - + t.clear(); auto hint = t.begin(); - + typename U::key_type k; typename U::mapped_type v; @@ -905,11 +932,11 @@ SizeType Deserializer::_load(T&& t) { } // std::unordered_map -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); @@ -924,17 +951,17 @@ SizeType Deserializer::_load(T&& t) { sz += _load(make_kv_pair(k, v)); t.emplace(std::move(k), std::move(v)); } - + return sz; } -// std::set -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; typename U::size_type 
num_data; @@ -942,46 +969,46 @@ SizeType Deserializer::_load(T&& t) { t.clear(); auto hint = t.begin(); - + typename U::key_type k; - for(size_t i=0; i -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; - + typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); t.clear(); t.reserve(num_data); - + typename U::key_type k; - for(size_t i=0; i -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; std::underlying_type_t k; auto sz = _load(k); @@ -990,11 +1017,11 @@ SizeType Deserializer::_load(T&& t) { } // duration data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::rep count; auto s = _load(count); @@ -1003,11 +1030,11 @@ SizeType Deserializer::_load(T&& t) { } // time point data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::duration elapsed; auto s = _load(elapsed); @@ -1016,12 +1043,12 @@ SizeType Deserializer::_load(T&& t) { } // optional data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; bool has_value; @@ -1033,53 +1060,53 @@ SizeType Deserializer::_load(T&& t) { s += _load(*t); } else { - t.reset(); + t.reset(); } return s; } // variant type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { std::decay_t idx; auto s = _load(idx); return s + _variant_helper(idx, t); } // tuple type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { return std::apply( [&] (auto&&... args) { - return (_load(std::forward(args)) + ... + 0); + return (_load(std::forward(args)) + ... 
+ 0); }, std::forward(t) ); } // array -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; static_assert(std::tuple_size::value > 0, "Array size can't be zero"); SizeType sz; - + if constexpr(std::is_arithmetic_v) { - _device.read(reinterpret_cast(t.data()), sizeof(t)); + _stream.read(reinterpret_cast(t.data()), sizeof(t)); sz = sizeof(t); - } + } else { sz = 0; for(auto && v : t) { @@ -1090,12 +1117,12 @@ SizeType Deserializer::_load(T&& t) { return sz; } -// custom save method -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { return t.load(*this); } diff --git a/lib/taskflow/utility/singleton.hpp b/lib/taskflow/utility/singleton.hpp index 01d521c..aab50bc 100644 --- a/lib/taskflow/utility/singleton.hpp +++ b/lib/taskflow/utility/singleton.hpp @@ -11,7 +11,7 @@ template class Singleton { public: - + /** @brief get a reference to the singleton object */ diff --git a/lib/taskflow/utility/small_vector.hpp b/lib/taskflow/utility/small_vector.hpp new file mode 100644 index 0000000..a42c264 --- /dev/null +++ b/lib/taskflow/utility/small_vector.hpp @@ -0,0 +1,1048 @@ +// small vector modified from llvm + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) + #define TF_LIKELY(x) (__builtin_expect((x), 1)) + #define TF_UNLIKELY(x) (__builtin_expect((x), 0)) +#else + #define TF_LIKELY(x) (x) + #define TF_UNLIKELY(x) (x) +#endif + +/** +@file small_vector.hpp +@brief small vector include file +*/ + +namespace tf { namespace detail { + +/** +@private +@brief NextCapacity - Returns the next power of two (in 64-bits) + that is strictly greater than A. Returns zero on overflow. + this function assumes A to be positive +*/ +inline uint64_t NextCapacity(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +}} // end of namespace tf::detail -------------------------------------------- + + +namespace tf { + +/** +@private +*/ +template +struct IsPod : std::integral_constant::value && + std::is_trivial::value> {}; + +/** +@private +*/ +class SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize){ + size_t CurSizeBytes = size_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. + if (NewCapacityInBytes < MinSizeInBytes) { + NewCapacityInBytes = MinSizeInBytes; + } + + void *NewElts; + if (BeginX == FirstEl) { + NewElts = std::malloc(NewCapacityInBytes); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + } + //assert(NewElts && "Out of memory"); + + this->EndX = (char*)NewElts+CurSizeBytes; + this->BeginX = NewElts; + this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; + } + +public: + /// This returns size()*sizeof(T). 
+ size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/** +@private +*/ +template struct SmallVectorStorage; + +/** +@private +*/ +template +class SmallVectorTemplateCommon : public SmallVectorBase { + + private: + template friend struct SmallVectorStorage; + + template + struct AlignedUnionType { + alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))]; + }; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + + // deprecated in c++23 + //typedef typename std::aligned_union<1, T>::type U; + typedef AlignedUnionType U; + + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + + public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef T value_type; + typedef T *iterator; + typedef const T *const_iterator; + + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + + typedef T &reference; + typedef const T &const_reference; + typedef T *pointer; + typedef const T *const_pointer; + + // forward iterator creation methods. + inline iterator begin() { return (iterator)this->BeginX; } + inline const_iterator begin() const { return (const_iterator)this->BeginX; } + inline iterator end() { return (iterator)this->EndX; } + inline const_iterator end() const { return (const_iterator)this->EndX; } + + protected: + + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + + public: + + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + inline size_type size() const { return end()-begin(); } + inline size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). 
+ const_pointer data() const { return const_pointer(begin()); } + + inline reference operator[](size_type idx) { + //assert(idx < size()); + return begin()[idx]; + } + + inline const_reference operator[](size_type idx) const { + //assert(idx < size()); + return begin()[idx]; + } + + reference front() { + //assert(!empty()); + return begin()[0]; + } + + const_reference front() const { + //assert(!empty()); + return begin()[0]; + } + + reference back() { + //assert(!empty()); + return end()[-1]; + } + + const_reference back() const { + //assert(!empty()); + return end()[-1]; + } +}; + +/** +@private +*/ +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +/** +@private +*/ +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(tf::detail::NextCapacity(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(std::malloc(NewCapacity*sizeof(T))); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + std::free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + +/** +@private +*/ +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. 
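+  /// (generic-iterator fallback; the pointer overload below lowers this
+  /// to a single memcpy for POD element types)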
+ template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/** +@private +*/ +template +class SmallVectorImpl : public SmallVectorTemplateBase::value> { + typedef SmallVectorTemplateBase::value> SuperClass; + + SmallVectorImpl(const SmallVectorImpl&) = delete; + +public: + typedef typename SuperClass::iterator iterator; + typedef typename SuperClass::const_iterator const_iterator; + typedef typename SuperClass::size_type size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase::value>(N*sizeof(T)) { + } + +public: + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + std::free(this->begin()); + } + + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. 
+ void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + //assert(I >= this->begin() && "Iterator to erase is out of bounds."); + //assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + //assert(S >= this->begin() && "Range to erase is out of bounds."); + //assert(S <= E && "Trying to erase invalid range."); + //assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. 
+ const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. 
+ for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... Args) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + //assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. 
+ this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) std::free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/** +@private +*/ +template +struct SmallVectorStorage { + /** + @private + */ + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; + +/** +@private +*/ +template struct SmallVectorStorage {}; + +/** +@private +*/ +template struct SmallVectorStorage {}; + +/** +@brief class to define a vector optimized for small array + +@tparam T data type +@tparam N threshold of the number of elements in the initial storage + +The class defines a C++ STL-styled vector (a variable-sized array) +optimized for the case when the array is small. +It contains some number of elements in-place, +which allows it to avoid heap allocation when the actual number of +elements is below that threshold. This allows normal @em small cases to be +fast without losing generality for large inputs. +All the methods in [std::vector](https://en.cppreference.com/w/cpp/container/vector) +can apply to this class. + +The class is stripped from the LLVM codebase. 
+*/ +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. + SmallVectorStorage Storage; + +public: + + /** + @brief constructs an empty vector + */ + SmallVector() : SmallVectorImpl(N) { + } + + /** + @brief constructs a vector with @c Size copies of elements with value @c value + */ + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + /** + @brief constructs a vector with the contents of the range + [S, E) + */ + template + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + //template + //explicit SmallVector(const tf::iterator_range &R) + // : SmallVectorImpl(N) { + // this->append(R.begin(), R.end()); + //} + + /** + @brief constructs a vector with the contents of the initializer list @c IL + */ + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + /** + @brief constructs the vector with the copy of the contents of @c RHS + */ + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + /** + @brief constructs the vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with a copy of the contents of @c RHS + */ + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief constructs a vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief replaces the contents with the copy of the contents of an initializer list @c IL + */ + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +static inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end tf namespace --------------------------------------------------------- + +namespace std { + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(tf::SmallVectorImpl &LHS, tf::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. 
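+  /// (this overload matches the concrete tf::SmallVector type and likewise
+  /// forwards to the member swap)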
+ template + inline void + swap(tf::SmallVector &LHS, tf::SmallVector &RHS) { + LHS.swap(RHS); + } +} // end of namespace std ---------------------------------------------------- + + diff --git a/lib/taskflow/utility/stream.hpp b/lib/taskflow/utility/stream.hpp index 6063f8c..320aa6c 100644 --- a/lib/taskflow/utility/stream.hpp +++ b/lib/taskflow/utility/stream.hpp @@ -8,7 +8,7 @@ namespace tf { // Procedure: ostreamize template void ostreamize(std::ostream& os, T&& token) { - os << std::forward(token); + os << std::forward(token); } // Procedure: ostreamize diff --git a/lib/taskflow/utility/traits.hpp b/lib/taskflow/utility/traits.hpp index 58b3839..196b147 100644 --- a/lib/taskflow/utility/traits.hpp +++ b/lib/taskflow/utility/traits.hpp @@ -14,23 +14,20 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include #include #include #include #include -#include +#include #include #include -#include +#include "os.hpp" namespace tf { @@ -38,14 +35,49 @@ namespace tf { // Traits //----------------------------------------------------------------------------- -// Struct: dependent_false -template -struct dependent_false { - static constexpr bool value = false; +//// Struct: dependent_false +//template +//struct dependent_false { +// static constexpr bool value = false; +//}; +// +//template +//constexpr auto dependent_false_v = dependent_false::value; + +template inline constexpr bool dependent_false_v = false; + +// ---------------------------------------------------------------------------- +// is_pod +//----------------------------------------------------------------------------- +template +struct is_pod { + static const bool value = std::is_trivial_v && + std::is_standard_layout_v; }; -template -constexpr auto dependent_false_v = dependent_false::value; +template +constexpr bool is_pod_v = is_pod::value; + +//----------------------------------------------------------------------------- +// NoInit +//----------------------------------------------------------------------------- + +template +struct NoInit { + + //static_assert(is_pod_v, "NoInit only supports POD type"); + + // constructor without initialization + NoInit () noexcept {} + + // implicit conversion T -> NoInit + constexpr NoInit (T value) noexcept : v{value} {} + + // implicit conversion NoInit -> T + constexpr operator T () const noexcept { return v; } + + T v; +}; //----------------------------------------------------------------------------- // Move-On-Copy @@ -59,8 +91,8 @@ struct MoC { MoC(const MoC& other) : object(std::move(other.object)) {} T& get() { return object; } - - mutable T object; + + mutable T object; }; template @@ -72,146 +104,14 @@ auto make_moc(T&& m) { // Visitors. //----------------------------------------------------------------------------- -// Overloadded. -template -struct Visitors : Ts... { - using Ts::operator()... ; -}; - -template -Visitors(Ts...) 
-> Visitors; - -// ---------------------------------------------------------------------------- -// Function Traits -// reference: https://github.com/ros2/rclcpp -// ---------------------------------------------------------------------------- - -template -struct tuple_tail; - -template -struct tuple_tail> { - using type = std::tuple; -}; - -// std::function -template -struct function_traits -{ - using arguments = typename tuple_tail< - typename function_traits::argument_tuple_type - >::type; - - static constexpr size_t arity = std::tuple_size_v; - - template - struct argument { - static_assert(N < arity, "error: invalid parameter index."); - using type = std::tuple_element_t; - }; - - template - using argument_t = typename argument::type; - - using return_type = typename function_traits::return_type; -}; - -// Free functions -template -struct function_traits { - - using return_type = R; - using argument_tuple_type = std::tuple; - - static constexpr size_t arity = sizeof...(Args); - - template - struct argument { - static_assert(N < arity, "error: invalid parameter index."); - using type = std::tuple_element_t>; - }; - - template - using argument_t = typename argument::type; -}; - -// function pointer -template -struct function_traits : function_traits { -}; - -// function reference -template -struct function_traits : function_traits { -}; - -// immutable lambda -template -struct function_traits - : function_traits -{}; - -// mutable lambda -template -struct function_traits - : function_traits -{}; - -/*// std::bind for object methods -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined _GLIBCXX_RELEASE // glibc++ (GNU C++ >= 7.1) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits(FArgs ...)>> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits< - std::_Binder> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; - -// std::bind for object const methods -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined _GLIBCXX_RELEASE // glibc++ (GNU C++ >= 7.1) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits(FArgs ...)>> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits< - std::_Binder> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; - -// std::bind for free functions -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; */ - -// decay to the raw type -template -struct function_traits : function_traits {}; - -template -struct function_traits : function_traits {}; - +//// Overloadded. +//template +//struct Visitors : Ts... { +// using Ts::operator()... ; +//}; +// +//template +//Visitors(Ts...) 
-> Visitors; // ---------------------------------------------------------------------------- // std::variant @@ -219,51 +119,21 @@ struct function_traits : function_traits {}; template struct get_index; -template +template struct get_index_impl {}; -template +template struct get_index_impl : std::integral_constant{}; -template +template struct get_index_impl : get_index_impl{}; -template +template struct get_index> : get_index_impl<0, T, Ts...>{}; template constexpr auto get_index_v = get_index::value; -// ---------------------------------------------------------------------------- -// is_pod -//----------------------------------------------------------------------------- -template -struct is_pod { - static const bool value = std::is_trivial_v && - std::is_standard_layout_v; -}; - -template -constexpr bool is_pod_v = is_pod::value; - -// ---------------------------------------------------------------------------- -// bit_cast -//----------------------------------------------------------------------------- -template -typename std::enable_if< - (sizeof(To) == sizeof(From)) && - std::is_trivially_copyable_v && - std::is_trivial_v, - // this implementation requires that To is trivially default constructible - To ->::type -// constexpr support needs compiler magic -bit_cast(const From &src) noexcept { - To dst; - std::memcpy(&dst, &src, sizeof(To)); - return dst; -} - // ---------------------------------------------------------------------------- // unwrap_reference // ---------------------------------------------------------------------------- @@ -293,7 +163,7 @@ struct stateful_iterator { using TB = std::decay_t>; using TE = std::decay_t>; - + static_assert(std::is_same_v, "decayed iterator types must match"); using type = TB; @@ -313,11 +183,11 @@ struct stateful_index { static_assert( std::is_integral_v, "decayed beg index must be an integral type" ); - + static_assert( std::is_integral_v, "decayed end index must be an integral type" ); - + static_assert( std::is_integral_v, "decayed step must be an integral type" ); @@ -333,6 +203,95 @@ struct stateful_index { template using stateful_index_t = typename stateful_index::type; +// ---------------------------------------------------------------------------- +// visit a tuple with a functor at runtime +// ---------------------------------------------------------------------------- + +template +void visit_tuple(Func func, Tuple& tup, size_t idx) { + if (N == idx) { + std::invoke(func, std::get(tup)); + return; + } + if constexpr (N + 1 < std::tuple_size_v) { + return visit_tuple(func, tup, idx); + } +} + +// ---------------------------------------------------------------------------- +// unroll loop +// ---------------------------------------------------------------------------- + +// Template unrolled looping construct. +template +struct Unroll { + template + static void eval(F f) { + f(beg); + Unroll::eval(f); + } +}; + +template +struct Unroll { + template + static void eval(F) { } +}; + +template +void unroll(F f) { + Unroll::eval(f); +} + +// ---------------------------------------------------------------------------- +// make types of variant unique +// ---------------------------------------------------------------------------- + +template +struct filter_duplicates { using type = T; }; + +template