From 9daa4303cdc03f6b90b72c369e6377c6beb75c39 Mon Sep 17 00:00:00 2001 From: Alex A Yermoshenko Date: Sun, 17 Dec 2023 19:21:21 +0100 Subject: [PATCH] Update taskflow to 3.6.0 (#93) Fixed potential data-race issues induced by incorrect memory order. Co-authored-by: Alex A. Yermoshenko --- lib/taskflow/algorithm/critical.hpp | 78 + lib/taskflow/algorithm/data_pipeline.hpp | 637 ++++++ lib/taskflow/algorithm/find.hpp | 547 +++++ lib/taskflow/algorithm/for_each.hpp | 173 ++ lib/taskflow/algorithm/launch.hpp | 58 + lib/taskflow/algorithm/partitioner.hpp | 543 +++++ lib/taskflow/algorithm/pipeline.hpp | 1663 ++++++++++++++ lib/taskflow/algorithm/reduce.hpp | 295 +++ lib/taskflow/algorithm/scan.hpp | 614 +++++ lib/taskflow/algorithm/sort.hpp | 648 ++++++ lib/taskflow/algorithm/transform.hpp | 199 ++ lib/taskflow/core/async.hpp | 396 ++++ lib/taskflow/core/async_task.hpp | 125 + lib/taskflow/core/declarations.hpp | 30 +- lib/taskflow/core/executor-module-opt.hpp | 2025 +++++++++++++++++ lib/taskflow/core/executor.hpp | 2501 +++++++++++++++------ lib/taskflow/core/flow_builder.hpp | 1891 +++++++++------- lib/taskflow/core/graph.hpp | 862 +++++-- lib/taskflow/core/notifier.hpp | 4 +- lib/taskflow/core/observer.hpp | 377 +++- lib/taskflow/core/semaphore.hpp | 39 +- lib/taskflow/core/task.hpp | 284 ++- lib/taskflow/core/taskflow.hpp | 365 ++- lib/taskflow/core/topology.hpp | 19 +- lib/taskflow/core/tsq.hpp | 380 +++- lib/taskflow/core/worker.hpp | 175 +- lib/taskflow/taskflow.hpp | 37 +- lib/taskflow/utility/iterator.hpp | 16 +- lib/taskflow/utility/macros.hpp | 17 + lib/taskflow/utility/math.hpp | 28 +- lib/taskflow/utility/object_pool.hpp | 145 +- lib/taskflow/utility/os.hpp | 62 +- lib/taskflow/utility/serializer.hpp | 609 ++--- lib/taskflow/utility/singleton.hpp | 2 +- lib/taskflow/utility/small_vector.hpp | 1048 +++++++++ lib/taskflow/utility/stream.hpp | 2 +- lib/taskflow/utility/traits.hpp | 339 ++- lib/taskflow/utility/uuid.hpp | 60 +- 38 files changed, 14599 insertions(+), 2694 deletions(-) create mode 100644 lib/taskflow/algorithm/critical.hpp create mode 100644 lib/taskflow/algorithm/data_pipeline.hpp create mode 100644 lib/taskflow/algorithm/find.hpp create mode 100644 lib/taskflow/algorithm/for_each.hpp create mode 100644 lib/taskflow/algorithm/launch.hpp create mode 100644 lib/taskflow/algorithm/partitioner.hpp create mode 100644 lib/taskflow/algorithm/pipeline.hpp create mode 100644 lib/taskflow/algorithm/reduce.hpp create mode 100644 lib/taskflow/algorithm/scan.hpp create mode 100644 lib/taskflow/algorithm/sort.hpp create mode 100644 lib/taskflow/algorithm/transform.hpp create mode 100644 lib/taskflow/core/async.hpp create mode 100644 lib/taskflow/core/async_task.hpp create mode 100644 lib/taskflow/core/executor-module-opt.hpp create mode 100644 lib/taskflow/utility/macros.hpp create mode 100644 lib/taskflow/utility/small_vector.hpp diff --git a/lib/taskflow/algorithm/critical.hpp b/lib/taskflow/algorithm/critical.hpp new file mode 100644 index 0000000..c781d28 --- /dev/null +++ b/lib/taskflow/algorithm/critical.hpp @@ -0,0 +1,78 @@ +#pragma once + +#include "../core/task.hpp" + +/** +@file critical.hpp +@brief critical include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// CriticalSection +// ---------------------------------------------------------------------------- + +/** +@class CriticalSection + +@brief class to create a critical region of limited workers to run tasks + +tf::CriticalSection is a warpper over 
tf::Semaphore and is specialized for +limiting the maximum concurrency over a set of tasks. +A critical section starts with an initial count representing that limit. +When a task is added to the critical section, +the task acquires and releases the semaphore internal to the critical section. +This design avoids explicit call of tf::Task::acquire and tf::Task::release. +The following example creates a critical section of one worker and adds +the five tasks to the critical section. + +@code{.cpp} +tf::Executor executor(8); // create an executor of 8 workers +tf::Taskflow taskflow; + +// create a critical section of 1 worker +tf::CriticalSection critical_section(1); + +tf::Task A = taskflow.emplace([](){ std::cout << "A" << std::endl; }); +tf::Task B = taskflow.emplace([](){ std::cout << "B" << std::endl; }); +tf::Task C = taskflow.emplace([](){ std::cout << "C" << std::endl; }); +tf::Task D = taskflow.emplace([](){ std::cout << "D" << std::endl; }); +tf::Task E = taskflow.emplace([](){ std::cout << "E" << std::endl; }); + +critical_section.add(A, B, C, D, E); + +executor.run(taskflow).wait(); +@endcode + +*/ +class CriticalSection : public Semaphore { + + public: + + /** + @brief constructs a critical region of a limited number of workers + */ + explicit CriticalSection(size_t max_workers = 1); + + /** + @brief adds a task into the critical region + */ + template + void add(Tasks...tasks); +}; + +inline CriticalSection::CriticalSection(size_t max_workers) : + Semaphore {max_workers} { +} + +template +void CriticalSection::add(Tasks... tasks) { + (tasks.acquire(*this), ...); + (tasks.release(*this), ...); +} + + +} // end of namespace tf. --------------------------------------------------- + + diff --git a/lib/taskflow/algorithm/data_pipeline.hpp b/lib/taskflow/algorithm/data_pipeline.hpp new file mode 100644 index 0000000..0393548 --- /dev/null +++ b/lib/taskflow/algorithm/data_pipeline.hpp @@ -0,0 +1,637 @@ +#pragma once + +#include "pipeline.hpp" + + +namespace tf { + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipe +// ---------------------------------------------------------------------------- + +/** +@class DataPipe + +@brief class to create a stage in a data-parallel pipeline + +A data pipe represents a stage of a data-parallel pipeline. +A data pipe can be either @em parallel direction or @em serial direction +(specified by tf::PipeType) and is associated with a callable to invoke +by the pipeline scheduler. + +You need to use the template function, tf::make_data_pipe, to create +a data pipe. The input and output types of a tf::DataPipe should be decayed types +(though the library will always decay them for you using `std::decay`) +to allow internal storage to work. +The data will be passed by reference to your callable, at which you can take +it by copy or reference. 
+ +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input) {return std::to_string(input + 100);} +); +@endcode + +In addition to the data, you callable can take an additional reference +of tf::Pipeflow in the second argument to probe the runtime information +for a stage task, such as its line number and token number: + +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template +class DataPipe { + + template + friend class DataPipeline; + + public: + + /** + @brief callable type of the data pipe + */ + using callable_t = C; + + /** + @brief input type of the data pipe + */ + using input_t = Input; + + /** + @brief output type of the data pipe + */ + using output_t = Output; + + /** + @brief default constructor + */ + DataPipe() = default; + + /** + @brief constructs a data pipe + + You should use the helper function, tf::make_data_pipe, + to create a DataPipe object, especially when you need tf::DataPipe + to automatically deduct the lambda type. + */ + DataPipe(PipeType d, callable_t&& callable) : + _type{d}, _callable{std::forward(callable)} { + } + + /** + @brief queries the type of the data pipe + + A data pipe can be either parallel (tf::PipeType::PARALLEL) or serial + (tf::PipeType::SERIAL). + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the data pipe + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the data pipe + + @tparam U callable type + @param callable a callable object constructible from the callable type + of this data pipe + + Assigns a new callable to the pipe using universal forwarding. + */ + template + void callable(U&& callable) { + _callable = std::forward(callable); + } + + private: + + PipeType _type; + + callable_t _callable; +}; + +/** +@brief function to construct a data pipe (tf::DataPipe) + +@tparam Input input data type +@tparam Output output data type +@tparam C callable type + +tf::make_data_pipe is a helper function to create a data pipe (tf::DataPipe) +in a data-parallel pipeline (tf::DataPipeline). +The first argument specifies the direction of the data pipe, +either tf::PipeType::SERIAL or tf::PipeType::PARALLEL, +and the second argument is a callable to invoke by the pipeline scheduler. +Input and output data types are specified via template parameters, +which will always be decayed by the library to its original form +for storage purpose. +The callable must take the input data type in its first argument +and returns a value of the output data type. + +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input) { + return std::to_string(input + 100); + } +); +@endcode + +The callable can additionally take a reference of tf::Pipeflow, +which allows you to query the runtime information of a stage task, +such as its line number and token number. 
+ +@code{.cpp} +tf::make_data_pipe( + tf::PipeType::SERIAL, + [](int& input, tf::Pipeflow& pf) { + printf("token=%lu, line=%lu\n", pf.token(), pf.line()); + return std::to_string(input + 100); + } +); +@endcode + +*/ +template +auto make_data_pipe(PipeType d, C&& callable) { + return DataPipe(d, std::forward(callable)); +} + +// ---------------------------------------------------------------------------- +// Class Definition: DataPipeline +// ---------------------------------------------------------------------------- + +/** +@class DataPipeline + +@brief class to create a data-parallel pipeline scheduling framework + +@tparam Ps data pipe types + +Similar to tf::Pipeline, a tf::DataPipeline is a composable graph object +for users to create a data-parallel pipeline scheduling framework +using a module task in a taskflow. +The only difference is that tf::DataPipeline provides a data abstraction +for users to quickly express dataflow in a pipeline. +The following example creates a data-parallel pipeline of three stages +that generate dataflow from `void` to `int`, `std::string`, `float`, and `void`. + +@code{.cpp} +#include +#include + +int main() { + + // data flow => void -> int -> std::string -> float -> void + tf::Taskflow taskflow("pipeline"); + tf::Executor executor; + + const size_t num_lines = 4; + + tf::DataPipeline pl(num_lines, + tf::make_data_pipe(tf::PipeType::SERIAL, [&](tf::Pipeflow& pf) -> int{ + if(pf.token() == 5) { + pf.stop(); + return 0; + } + else { + return pf.token(); + } + }), + tf::make_data_pipe(tf::PipeType::SERIAL, [](int& input) { + return std::to_string(input + 100); + }), + tf::make_data_pipe(tf::PipeType::SERIAL, [](std::string& input) { + std::cout << input << std::endl; + }) + ); + + // build the pipeline graph using composition + taskflow.composed_of(pl).name("pipeline"); + + // dump the pipeline graph structure (with composition) + taskflow.dump(std::cout); + + // run the pipeline + executor.run(taskflow).wait(); + + return 0; +} +@endcode + +The pipeline schedules five tokens over four parallel lines in a circular fashion, +as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode +*/ +template +class DataPipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + + public: + + /** + @brief internal storage type for each data token (default std::variant) + */ + using data_t = unique_variant_t, + std::monostate, + std::decay_t>... + >>; + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + DataPipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a data-parallel pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a data-parallel pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes stored in a std::tuple. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. 
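+
+  A minimal sketch of this constructor (the two pipes below are illustrative
+  placeholders, not part of the library; class template argument deduction is
+  assumed to pick up the pipe types from the tuple):
+
+  @code{.cpp}
+  auto pipes = std::make_tuple(
+    tf::make_data_pipe<void, int>(tf::PipeType::SERIAL, [](tf::Pipeflow& pf) {
+      if(pf.token() == 5) {
+        pf.stop();
+        return 0;
+      }
+      return static_cast<int>(pf.token());
+    }),
+    tf::make_data_pipe<int, void>(tf::PipeType::SERIAL, [](int& i) {
+      std::cout << i << '\n';
+    })
+  );
+
+  // construct a data-parallel pipeline of four lines from the tuple of pipes
+  tf::DataPipeline pl(4, std::move(pipes));
+  @endcode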
+ */ + DataPipeline(size_t num_lines, std::tuple&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. + */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of this pipeline. + */ + Graph& graph(); + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple _pipes; + std::array _meta; + std::vector> _lines; + std::vector _tasks; + std::vector _pipeflows; + std::vector> _buffer; + + template + auto _gen_meta(std::tuple&&, std::index_sequence); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); +}; + +// constructor +template +DataPipeline::DataPipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template +DataPipeline::DataPipeline(size_t num_lines, std::tuple&& ps) : + _pipes {std::forward>(ps)}, + _meta {_gen_meta( + std::forward>(ps), std::make_index_sequence{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines), + _buffer (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template +template +auto DataPipeline::_gen_meta(std::tuple&& ps, std::index_sequence) { + return std::array{PipeMeta{std::get(ps).type()}...}; +} + +// Function: num_lines +template +size_t DataPipeline::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template +constexpr size_t DataPipeline::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template +size_t DataPipeline::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template +Graph& DataPipeline::graph() { + return _graph; +} + +// Function: reset +template +void DataPipeline::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template +void DataPipeline::_on_pipe(Pipeflow& pf, Runtime&) { + + visit_tuple([&](auto&& pipe){ + + using data_pipe_t = std::decay_t; + using callable_t = typename 
data_pipe_t::callable_t; + using input_t = std::decay_t; + using output_t = std::decay_t; + + // first pipe + if constexpr (std::is_invocable_v) { + // [](tf::Pipeflow&) -> void {}, i.e., we only have one pipe + if constexpr (std::is_void_v) { + pipe._callable(pf); + // [](tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable(pf); + } + } + // other pipes without pipeflow in the second argument + else if constexpr (std::is_invocable_v >) { + // [](input_t&) -> void {}, i.e., the last pipe + if constexpr (std::is_void_v) { + pipe._callable(std::get(_buffer[pf._line].data)); + // [](input_t&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get(_buffer[pf._line].data) + ); + } + } + // other pipes with pipeflow in the second argument + else if constexpr (std::is_invocable_v) { + // [](input_t&, tf::Pipeflow&) -> void {} + if constexpr (std::is_void_v) { + pipe._callable(std::get(_buffer[pf._line].data), pf); + // [](input_t&, tf::Pipeflow&) -> output_t {} + } else { + _buffer[pf._line].data = pipe._callable( + std::get(_buffer[pf._line].data), pf + ); + } + } + //else if constexpr(std::is_invocable_v) { + // pipe._callable(pf, rt); + //} + else { + static_assert(dependent_false_v, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _build +template +void DataPipeline::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _lines[pf->_line][pf->_pipe].join_counter.store( + static_cast(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + if (pf->_pipe == 0) { + pf->_token = _num_tokens; + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + ++_num_tokens; + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/lib/taskflow/algorithm/find.hpp b/lib/taskflow/algorithm/find.hpp new file mode 100644 index 
0000000..ab0d801 --- /dev/null +++ b/lib/taskflow/algorithm/find.hpp @@ -0,0 +1,547 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: find_if_loop +template +TF_FORCE_INLINE bool find_if_loop( + std::atomic& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE bool find_if_not_loop( + std::atomic& offset, + Iterator& beg, + size_t& prev_e, + size_t curr_b, + size_t curr_e, + Predicate&& predicate +) { + + // early prune + if(offset.load(std::memory_order_relaxed) < curr_b) { + return true; + } + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE auto make_find_if_task( + B first, E last, T& result, UOP predicate, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic offset(N); + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, &offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_find_if_not_task +template +TF_FORCE_INLINE auto make_find_if_not_task( + B first, E last, T& result, UOP predicate, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, predicate, &result, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::find_if_not(beg, end, predicate); + return; + } + + if(N < W) { + W = N; + } + + std::atomic offset(N); + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, + [N, W, beg, &predicate, &offset, &next, &part] () mutable { + part.loop_until(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + return detail::find_if_not_loop( + offset, beg, prev_e, curr_b, curr_e, predicate + ); + } + ); + } + ); + } + + // update the result iterator by the offset + result = std::next(beg, offset.load(std::memory_order_relaxed)); + }; +} + +// Function: make_min_element_task +template +TF_FORCE_INLINE auto make_min_element_task( + B first, E last, T& result, C comp, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::min_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T smallest = comp(*beg1, *beg2) ? beg1 : beg2; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + if(comp(*beg, *result)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T smallest = comp(*beg1, *beg2) ? beg1 : beg2; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mutex); + if(comp(*smallest, *result)) { + result = smallest; + } + } + ); + } + }; +} + +// Function: make_max_element_task +template +TF_FORCE_INLINE auto make_max_element_task(B first, E last, T& result, C comp, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=first, e=last, &result, comp, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + result = std::max_element(beg, end, comp); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mutex; + + // initialize the result to the first element + result = beg++; + N--; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T largest = comp(*beg1, *beg2) ? beg2 : beg1; + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, + [beg, N, W, &next, &comp, &mutex, &result, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mutex); + if(comp(*result, *beg)) { + result = beg; + } + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T largest = comp(*beg1, *beg2) ? beg2 : beg1; + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mutex); + if(comp(*result, *largest)) { + result = largest; + } + } + ); + } + }; +} + +} // namespace detail -------------------------------------------------------- + +// Function: find_if +template +Task tf::FlowBuilder::find_if(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(detail::make_find_if_task( + first, last, result, predicate, std::forward
<P>
(part) + )); +} + +// Function: find_if_not +template +Task tf::FlowBuilder::find_if_not(B first, E last, T& result, UOP predicate, P&& part) { + return emplace(detail::make_find_if_not_task( + first, last, result, predicate, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// min_element +// ---------------------------------------------------------------------------- + +// Function: min_element +template +Task FlowBuilder::min_element(B first, E last, T& result, C comp, P&& part) { + return emplace(detail::make_min_element_task( + first, last, result, comp, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// max_element +// ---------------------------------------------------------------------------- + +// Function: max_element +template +Task FlowBuilder::max_element(B first, E last, T& result, C comp, P&& part) { + return emplace(detail::make_max_element_task( + first, last, result, comp, std::forward
<P>
(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/for_each.hpp b/lib/taskflow/algorithm/for_each.hpp new file mode 100644 index 0000000..d15958a --- /dev/null +++ b/lib/taskflow/algorithm/for_each.hpp @@ -0,0 +1,173 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: make_for_each_task +template +TF_FORCE_INLINE auto make_for_each_task(B beg, E end, C c, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return [b=beg, e=end, c, part=std::forward
<P>
(part)] (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::for_each(beg, end, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x = curr_b; x +TF_FORCE_INLINE auto make_for_each_index_task(B beg, E end, S inc, C c, P&& part){ + + using namespace std::string_literals; + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using S_t = std::decay_t>; + + return [b=beg, e=end, a=inc, c, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + S_t inc = a; + + size_t W = rt.executor().num_workers(); + size_t N = distance(beg, end, inc); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(size_t x=0; x, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w(curr_b) * inc + beg; + for(size_t x=curr_b; x next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&](size_t curr_b, size_t curr_e) { + auto idx = static_cast(curr_b) * inc + beg; + for(size_t x=curr_b; x +Task FlowBuilder::for_each(B beg, E end, C c, P&& part) { + return emplace( + detail::make_for_each_task(beg, end, c, std::forward
<P>
(part)) + ); +} + +// ---------------------------------------------------------------------------- +// for_each_index +// ---------------------------------------------------------------------------- + +// Function: for_each_index +template +Task FlowBuilder::for_each_index(B beg, E end, S inc, C c, P&& part){ + return emplace( + detail::make_for_each_index_task(beg, end, inc, c, std::forward
<P>
(part)) + ); +} + + +} // end of namespace tf ----------------------------------------------------- + diff --git a/lib/taskflow/algorithm/launch.hpp b/lib/taskflow/algorithm/launch.hpp new file mode 100644 index 0000000..363223e --- /dev/null +++ b/lib/taskflow/algorithm/launch.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf { + +// Function: launch_loop +template +TF_FORCE_INLINE void launch_loop( + size_t N, + size_t W, + Runtime& rt, + std::atomic& next, + P&& part, + Loop&& loop +) { + + //static_assert(std::is_lvalue_reference_v, ""); + + using namespace std::string_literals; + + for(size_t w=0; w +TF_FORCE_INLINE void launch_loop( + size_t W, + size_t w, + Runtime& rt, + Loop&& loop +) { + using namespace std::string_literals; + if(w == W-1) { + loop(); + } + else { + rt.silent_async_unchecked("loop-"s + std::to_string(w), loop); + } +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/partitioner.hpp b/lib/taskflow/algorithm/partitioner.hpp new file mode 100644 index 0000000..4a253fa --- /dev/null +++ b/lib/taskflow/algorithm/partitioner.hpp @@ -0,0 +1,543 @@ +// reference: +// - gomp: https://github.com/gcc-mirror/gcc/blob/master/libgomp/iter.c +// - komp: https://github.com/llvm-mirror/openmp/blob/master/runtime/src/kmp_dispatch.cpp + +#pragma once + +/** +@file partitioner.hpp +@brief partitioner include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Partitioner Base +// ---------------------------------------------------------------------------- + +/** +@class PartitionerBase + +@brief class to derive a partitioner for scheduling parallel algorithms + +The class provides base methods to derive a partitioner that can be used +to schedule parallel iterations (e.g., tf::Taskflow::for_each). + +An partitioner defines the scheduling method for running parallel algorithms, +such tf::Taskflow::for_each, tf::Taskflow::reduce, and so on. +By default, we provide the following partitioners: + ++ tf::GuidedPartitioner to enable guided scheduling algorithm of adaptive chunk size ++ tf::DynamicPartitioner to enable dynamic scheduling algorithm of equal chunk size ++ tf::StaticPartitioner to enable static scheduling algorithm of static chunk size ++ tf::RandomPartitioner to enable random scheduling algorithm of random chunk size + +Depending on applications, partitioning algorithms can impact the performance +a lot. +For example, if a parallel-iteration workload contains a regular work unit per +iteration, tf::StaticPartitioner can deliver the best performance. +On the other hand, if the work unit per iteration is irregular and unbalanced, +tf::GuidedPartitioner or tf::DynamicPartitioner can outperform tf::StaticPartitioner. +In most situations, tf::GuidedPartitioner can deliver decent performance and +is thus used as our default partitioner. 
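+
+A minimal sketch of choosing a partitioner for the same loop (the workload
+lambda is a placeholder; the two tasks are sequenced to avoid writing the
+same data concurrently):
+
+@code{.cpp}
+std::vector<int> data(100000, 1);
+
+tf::Taskflow taskflow;
+tf::Executor executor;
+
+// regular work per iteration: a static partitioner usually performs best
+tf::Task a = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i += 1; }, tf::StaticPartitioner()
+);
+
+// irregular work per iteration: a guided partitioner adapts its chunk size
+tf::Task b = taskflow.for_each(
+  data.begin(), data.end(), [](int& i){ i += 1; }, tf::GuidedPartitioner()
+);
+
+// run the two parallel-for tasks one after another on the same data
+a.precede(b);
+
+executor.run(taskflow).wait();
+@endcode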
+*/ +class PartitionerBase { + + public: + + /** + @brief default constructor + */ + PartitionerBase() = default; + + /** + @brief construct a partitioner with the given chunk size + */ + explicit PartitionerBase(size_t chunk_size) : _chunk_size {chunk_size} {} + + /** + @brief query the chunk size of this partitioner + */ + size_t chunk_size() const { return _chunk_size; } + + /** + @brief update the chunk size of this partitioner + */ + void chunk_size(size_t cz) { _chunk_size = cz; } + + protected: + + /** + @brief chunk size + */ + size_t _chunk_size{0}; +}; + +// ---------------------------------------------------------------------------- +// Guided Partitioner +// ---------------------------------------------------------------------------- + +/** +@class GuidedPartitioner + +@brief class to construct a guided partitioner for scheduling parallel algorithms + +The size of a partition is proportional to the number of unassigned iterations +divided by the number of workers, +and the size will gradually decrease to the given chunk size. +The last partition may be smaller than the chunk size. +*/ +class GuidedPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + GuidedPartitioner() : PartitionerBase{1} {} + + /** + @brief construct a guided partitioner with the given chunk size + */ + explicit GuidedPartitioner(size_t sz) : PartitionerBase (sz) {} + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + + size_t p1 = 2 * W * (chunk_size + 1); + float p2 = 0.5f / static_cast(W); + size_t curr_b = next.load(std::memory_order_relaxed); + + while(curr_b < N) { + + size_t r = N - curr_b; + + // fine-grained + if(r < p1) { + while(1) { + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(curr_b >= N) { + return; + } + func(curr_b, std::min(curr_b + chunk_size, N)); + } + break; + } + // coarse-grained + else { + size_t q = static_cast(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + //size_t curr_e = (q <= r) ? curr_b + q : N; + size_t curr_e = std::min(curr_b + q, N); + if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, + std::memory_order_relaxed)) { + func(curr_b, curr_e); + curr_b = next.load(std::memory_order_relaxed); + } + } + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + + size_t p1 = 2 * W * (chunk_size + 1); + float p2 = 0.5f / static_cast(W); + size_t curr_b = next.load(std::memory_order_relaxed); + + while(curr_b < N) { + + size_t r = N - curr_b; + + // fine-grained + if(r < p1) { + while(1) { + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + if(curr_b >= N) { + return; + } + if(func(curr_b, std::min(curr_b + chunk_size, N))) { + return; + } + } + break; + } + // coarse-grained + else { + size_t q = static_cast(p2 * r); + if(q < chunk_size) { + q = chunk_size; + } + //size_t curr_e = (q <= r) ? 
curr_b + q : N; + size_t curr_e = std::min(curr_b + q, N); + if(next.compare_exchange_strong(curr_b, curr_e, std::memory_order_relaxed, + std::memory_order_relaxed)) { + if(func(curr_b, curr_e)) { + return; + } + curr_b = next.load(std::memory_order_relaxed); + } + } + } + } +}; + +// ---------------------------------------------------------------------------- +// Dynamic Partitioner +// ---------------------------------------------------------------------------- + +/** +@class DynamicPartitioner + +@brief class to construct a dynamic partitioner for scheduling parallel algorithms + +The partitioner splits iterations into many partitions each of size equal to +the given chunk size. +Different partitions are distributed dynamically to workers +without any specific order. +*/ +class DynamicPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + DynamicPartitioner() : PartitionerBase{1} {}; + + /** + @brief construct a dynamic partitioner with the given chunk size + */ + explicit DynamicPartitioner(size_t sz) : PartitionerBase (sz) {} + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + func(curr_b, std::min(curr_b + chunk_size, N)); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t, + std::atomic& next, + F&& func + ) const { + + size_t chunk_size = (_chunk_size == 0) ? size_t{1} : _chunk_size; + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + if(func(curr_b, std::min(curr_b + chunk_size, N))) { + return; + } + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } +}; + +// ---------------------------------------------------------------------------- +// Static Partitioner +// ---------------------------------------------------------------------------- + +/** +@class StaticPartitioner + +@brief class to construct a dynamic partitioner for scheduling parallel algorithms + +The partitioner divides iterations into chunks and distributes chunks +to workers in order. +If the chunk size is not specified (default @c 0), the partitioner resorts to a chunk size +that equally distributes iterations into workers. + +@code{.cpp} +std::vector data = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} +taskflow.for_each( + data.begin(), data.end(), [](int i){}, StaticPartitioner(0) +); +executor.run(taskflow).run(); +@endcode +*/ +class StaticPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + StaticPartitioner() : PartitionerBase{0} {}; + + /** + @brief construct a dynamic partitioner with the given chunk size + */ + explicit StaticPartitioner(size_t sz) : PartitionerBase(sz) {} + + /** + @brief queries the adjusted chunk size + + Returns the given chunk size if it is not zero, or returns + N/W + (w < N%W), where @c N is the number of iterations, + @c W is the number of workers, and @c w is the worker ID. + */ + size_t adjusted_chunk_size(size_t N, size_t W, size_t w) const { + return _chunk_size ? 
_chunk_size : N/W + (w < N%W); + } + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + size_t curr_b, + size_t chunk_size, + F&& func + ) { + size_t stride = W * chunk_size; + while(curr_b < N) { + size_t curr_e = std::min(curr_b + chunk_size, N); + func(curr_b, curr_e); + curr_b += stride; + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + size_t curr_b, + size_t chunk_size, + F&& func + ) { + size_t stride = W * chunk_size; + while(curr_b < N) { + size_t curr_e = std::min(curr_b + chunk_size, N); + if(func(curr_b, curr_e)) { + return; + } + curr_b += stride; + } + } +}; + +// ---------------------------------------------------------------------------- +// RandomPartitioner +// ---------------------------------------------------------------------------- + +/** +@class RandomPartitioner + +@brief class to construct a random partitioner for scheduling parallel algorithms + +Similar to tf::DynamicPartitioner, +the partitioner splits iterations into many partitions but each with a random +chunk size in the range, c = [alpha * N * W, beta * N * W]. +By default, @c alpha is 0.01 and @c beta is 0.5, respectively. + +*/ +class RandomPartitioner : public PartitionerBase { + + public: + + /** + @brief default constructor + */ + RandomPartitioner() = default; + + /** + @brief constructs a random partitioner + */ + RandomPartitioner(size_t cz) : PartitionerBase(cz) {} + + /** + @brief constructs a random partitioner with the given parameters + */ + RandomPartitioner(float alpha, float beta) : _alpha {alpha}, _beta {beta} {} + + /** + @brief queries the @c alpha value + */ + float alpha() const { return _alpha; } + + /** + @brief queries the @c beta value + */ + float beta() const { return _beta; } + + /** + @brief queries the range of chunk size + + @param N number of iterations + @param W number of workers + */ + std::pair chunk_size_range(size_t N, size_t W) const { + + size_t b1 = static_cast(_alpha * N * W); + size_t b2 = static_cast(_beta * N * W); + + if(b1 > b2) { + std::swap(b1, b2); + } + + b1 = std::max(b1, size_t{1}); + b2 = std::max(b2, b1 + 1); + + return {b1, b2}; + } + + // -------------------------------------------------------------------------- + // scheduling methods + // -------------------------------------------------------------------------- + + /** + @private + */ + template , void>* = nullptr + > + void loop( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + + while(curr_b < N) { + func(curr_b, std::min(curr_b + chunk_size, N)); + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + /** + @private + */ + template , void>* = nullptr + > + void loop_until( + size_t N, + size_t W, + std::atomic& next, + F&& func + ) const { + + auto [b1, b2] = chunk_size_range(N, W); + + std::default_random_engine engine {std::random_device{}()}; + std::uniform_int_distribution dist(b1, b2); + + size_t chunk_size = dist(engine); + size_t curr_b = next.fetch_add(chunk_size, 
std::memory_order_relaxed); + + while(curr_b < N) { + if(func(curr_b, std::min(curr_b + chunk_size, N))){ + return; + } + chunk_size = dist(engine); + curr_b = next.fetch_add(chunk_size, std::memory_order_relaxed); + } + } + + private: + + float _alpha {0.01f}; + float _beta {0.5f}; + +}; + +/** +@brief default partitioner set to tf::GuidedPartitioner + +Guided partitioner can achieve decent performance for most parallel algorithms, +especially for those with irregular and unbalanced workload per iteration. +*/ +using DefaultPartitioner = GuidedPartitioner; + +/** +@brief determines if a type is a partitioner + +A partitioner is a derived type from tf::PartitionerBase. +*/ +template +inline constexpr bool is_partitioner_v = std::is_base_of::value; + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/lib/taskflow/algorithm/pipeline.hpp b/lib/taskflow/algorithm/pipeline.hpp new file mode 100644 index 0000000..5442d56 --- /dev/null +++ b/lib/taskflow/algorithm/pipeline.hpp @@ -0,0 +1,1663 @@ +#pragma once + +#include "../taskflow.hpp" + +/** +@file pipeline.hpp +@brief pipeline include file +*/ + +namespace tf { + + +// ---------------------------------------------------------------------------- +// Structure Definition: DeferredPipeflow +// ---------------------------------------------------------------------------- +// For example: +// 12.defer(7); 12.defer(16); +// _____ +// | | +// v | +// 7 12 16 +// | ^ +// |____ | +// +// DeferredPipeflow dpf of 12 : +// dpf._token = 12; +// dpf._num_deferrals = 1; +// dpf._dependents = std::list{7,16}; +// dpf._dependent_satellites has following two entries +// {key: 7, value: dpf._dependents.begin()} +// {key: 16, value: dpf._dependents.begin()+1} +// +/** @private */ +class DeferredPipeflow { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + public: + + DeferredPipeflow() = default; + DeferredPipeflow(const DeferredPipeflow&) = delete; + DeferredPipeflow(DeferredPipeflow&&) = delete; + + DeferredPipeflow(size_t t, size_t n, std::unordered_set&& dep) : + _token{t}, _num_deferrals{n}, _dependents{std::move(dep)} { + } + + DeferredPipeflow& operator = (const DeferredPipeflow&) = delete; + DeferredPipeflow& operator = (DeferredPipeflow&&) = delete; + + private: + + // token id + size_t _token; + + // number of deferrals + size_t _num_deferrals; + + // dependents + // For example, + // 12.defer(7); 12.defer(16) + // _dependents = {7, 16} + std::unordered_set _dependents; +}; + + + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeflow +// ---------------------------------------------------------------------------- + +/** +@class Pipeflow + +@brief class to create a pipeflow object used by the pipe callable + +Pipeflow represents a scheduling token in the pipeline scheduling +framework. A pipeflow is created by the pipeline scheduler at runtime to +pass to the pipe callable. Users can query the present statistics +of that scheduling token, including the line identifier, pipe identifier, +and token identifier, and build their application algorithms based on +these statistics. +At the first stage, users can explicitly call the stop method +to stop the pipeline scheduler. 
+ +@code{.cpp} +tf::Pipe{tf::PipeType::SERIAL, [](tf::Pipeflow& pf){ + std::cout << "token id=" << pf.token() + << " at line=" << pf.line() + << " at pipe=" << pf.pipe() + << '\n'; +}}; +@endcode + +Pipeflow can only be created privately by the tf::Pipeline and +be used through the pipe callable. +*/ +class Pipeflow { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + template + friend class DataPipeline; + + public: + + /** + @brief default constructor + */ + Pipeflow() = default; + + /** + @brief queries the line identifier of the present token + */ + size_t line() const { + return _line; + } + + /** + @brief queries the pipe identifier of the present token + */ + size_t pipe() const { + return _pipe; + } + + /** + @brief queries the token identifier + */ + size_t token() const { + return _token; + } + + /** + @brief stops the pipeline scheduling + + Only the first pipe can call this method to stop the pipeline. + Calling stop from other pipes will throw exception. + */ + void stop() { + if(_pipe != 0) { + TF_THROW("only the first pipe can stop the token"); + } + _stop = true; + } + + /** + @brief queries the number of deferrals + */ + size_t num_deferrals() const { + return _num_deferrals; + } + + /** + @brief pushes token in _dependents + + Only the first pipe can call this method to defer the current + scheduling token to the given token. + */ + void defer(size_t token) { + if(_pipe != 0) { + TF_THROW("only the first pipe can defer the current scheduling token"); + } + _dependents.insert(token); + } + + private: + + // Regular data + size_t _line; + size_t _pipe; + size_t _token; + bool _stop; + + // Data field for token dependencies + size_t _num_deferrals; + std::unordered_set _dependents; + +}; + +// ---------------------------------------------------------------------------- +// Class Definition: PipeType +// ---------------------------------------------------------------------------- + +/** +@enum PipeType + +@brief enumeration of all pipe types +*/ +enum class PipeType : int { + /** @brief parallel type */ + PARALLEL = 1, + /** @brief serial type */ + SERIAL = 2 +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipe +// ---------------------------------------------------------------------------- + +/** +@class Pipe + +@brief class to create a pipe object for a pipeline stage + +@tparam C callable type + +A pipe represents a stage of a pipeline. A pipe can be either +@em parallel direction or @em serial direction (specified by tf::PipeType) +and is coupled with a callable to invoke by the pipeline scheduler. +The callable must take a referenced tf::Pipeflow object in the first argument: + +@code{.cpp} +Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} +@endcode + +The pipeflow object is used to query the statistics of a scheduling token +in the pipeline, such as pipe, line, and token numbers. +*/ +template > +class Pipe { + + template + friend class Pipeline; + + template + friend class ScalablePipeline; + + public: + + /** + @brief alias of the callable type + */ + using callable_t = C; + + /** + @brief default constructor + */ + Pipe() = default; + + /** + @brief constructs the pipe object + + @param d pipe type (tf::PipeType) + @param callable callable type + + The constructor constructs a pipe with the given direction + (tf::PipeType::SERIAL or tf::PipeType::PARALLEL) and the given callable. + The callable must take a referenced tf::Pipeflow object in the first argument. 
+ + @code{.cpp} + Pipe{PipeType::SERIAL, [](tf::Pipeflow&){}} + @endcode + + When creating a pipeline, the direction of the first pipe must be serial + (tf::PipeType::SERIAL). + */ + Pipe(PipeType d, C&& callable) : + _type{d}, _callable{std::forward(callable)} { + } + + /** + @brief queries the type of the pipe + + Returns the type of the callable. + */ + PipeType type() const { + return _type; + } + + /** + @brief assigns a new type to the pipe + + @param type a tf::PipeType variable + */ + void type(PipeType type) { + _type = type; + } + + /** + @brief assigns a new callable to the pipe + + @tparam U callable type + @param callable a callable object constructible from std::function + + Assigns a new callable to the pipe with universal forwarding. + */ + template + void callable(U&& callable) { + _callable = std::forward(callable); + } + + private: + + PipeType _type; + + C _callable; +}; + +// ---------------------------------------------------------------------------- +// Class Definition: Pipeline +// ---------------------------------------------------------------------------- + +/** +@class Pipeline + +@brief class to create a pipeline scheduling framework + +@tparam Ps pipe types + +A pipeline is a composable graph object for users to create a +pipeline scheduling framework using a module task in a taskflow. +Unlike the conventional pipeline programming frameworks (e.g., Intel TBB), +%Taskflow's pipeline algorithm does not provide any data abstraction, +which often restricts users from optimizing data layouts in their applications, +but a flexible framework for users to customize their application data +atop our pipeline scheduling. +The following code creates a pipeline of four parallel lines to schedule +tokens through three serial pipes: + +@code{.cpp} +tf::Taskflow taskflow; +tf::Executor executor; + +const size_t num_lines = 4; +const size_t num_pipes = 3; + +// create a custom data buffer +std::array, num_lines> buffer; + +// create a pipeline graph of four concurrent lines and three serial pipes +tf::Pipeline pipeline(num_lines, + // first pipe must define a serial direction + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf) { + // generate only 5 scheduling tokens + if(pf.token() == 5) { + pf.stop(); + } + // save the token id into the buffer + else { + buffer[pf.line()][pf.pipe()] = pf.token(); + } + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer] (tf::Pipeflow& pf) { + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }}, + tf::Pipe{tf::PipeType::SERIAL, [&buffer](tf::Pipeflow& pf){ + // propagate the previous result to this pipe by adding one + buffer[pf.line()][pf.pipe()] = buffer[pf.line()][pf.pipe()-1] + 1; + }} +); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pipeline) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// run the pipeline +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, as depicted below: + +@code{.shell-session} +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +@endcode + +At each pipe stage, the program propagates the result to 
the next pipe +by adding one to the result stored in a custom data storage, @c buffer. +The pipeline scheduler will generate five scheduling tokens and then stop. + +Internally, tf::Pipeline uses std::tuple to store the given sequence of pipes. +The definition of each pipe can be different, completely decided by the compiler +to optimize the object layout. +After a pipeline is constructed, it is not possible to change its pipes. +If applications need to change these pipes, please use tf::ScalablePipeline. +*/ +template +class Pipeline { + + static_assert(sizeof...(Ps)>0, "must have at least one pipe"); + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + /** + @private + */ + struct PipeMeta { + PipeType type; + }; + + public: + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a list of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, Ps&&... ps); + + /** + @brief constructs a pipeline object + + @param num_lines the number of parallel lines + @param ps a tuple of pipes + + Constructs a pipeline of up to @c num_lines parallel lines to schedule + tokens through the given linear chain of pipes. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + */ + Pipeline(size_t num_lines, std::tuple&& ps); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + constexpr size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resetting the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero as if the pipeline was just + constructed. + */ + void reset(); + + /** + @brief queries the number of generated tokens in the pipeline + + The number represents the total scheduling tokens that has been + generated by the pipeline so far. + */ + size_t num_tokens() const noexcept; + + /** + @brief obtains the graph object associated with the pipeline construct + + This method is primarily used as an opaque data structure for creating + a module task of the this pipeline. + */ + Graph& graph(); + + + private: + + Graph _graph; + + size_t _num_tokens; + + std::tuple _pipes; + std::array _meta; + std::vector> _lines; + std::vector _tasks; + std::vector _pipeflows; + + // queue of ready tokens (paired with their deferral times) + // For example, + // when 12 does not have any dependents, + // we put 12 in _ready_tokens queue + // Assume num_deferrals of 12 is 1, + // we push pair{12, 1} in the queue + std::queue> _ready_tokens; + + // unordered_map of token dependencies + // For example, + // 12.defer(16); 13.defer(16); + // _token_dependencies has the following entry + // {key: 16, value: std::vector{12, 13}}. 
+ std::unordered_map> _token_dependencies; + + // unordered_map of deferred tokens + // For example, + // 12.defer(16); 13.defer(16); + // _deferred_tokens has the following two entries + // {key: 12, DeferredPipeflow of 12} and + // {key: 13, DeferredPipeflow of 13} + std::unordered_map _deferred_tokens; + + // variable to keep track of the longest deferred tokens + // For example, + // 2.defer(16) + // 5.defer(19) + // 5.defer(17), + // _longest_deferral will be 19 - after token 19 the pipeline + // has almost zero cost on handling deferred pipeflow + size_t _longest_deferral = 0; + + template + auto _gen_meta(std::tuple&&, std::index_sequence); + + void _on_pipe(Pipeflow&, Runtime&); + void _build(); + void _check_dependents(Pipeflow&); + void _construct_deferred_tokens(Pipeflow&); + void _resolve_token_dependencies(Pipeflow&); +}; + +// constructor +template +Pipeline::Pipeline(size_t num_lines, Ps&&... ps) : + _pipes {std::make_tuple(std::forward(ps)...)}, + _meta {PipeMeta{ps.type()}...}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// constructor +template +Pipeline::Pipeline(size_t num_lines, std::tuple&& ps) : + _pipes {std::forward>(ps)}, + _meta {_gen_meta( + std::forward>(ps), std::make_index_sequence{} + )}, + _lines (num_lines), + _tasks (num_lines + 1), + _pipeflows (num_lines) { + + if(num_lines == 0) { + TF_THROW("must have at least one line"); + } + + if(std::get<0>(_pipes).type() != PipeType::SERIAL) { + TF_THROW("first pipe must be serial"); + } + + reset(); + _build(); +} + +// Function: _get_meta +template +template +auto Pipeline::_gen_meta(std::tuple&& ps, std::index_sequence) { + return std::array{PipeMeta{std::get(ps).type()}...}; +} + +// Function: num_lines +template +size_t Pipeline::num_lines() const noexcept { + return _pipeflows.size(); +} + +// Function: num_pipes +template +constexpr size_t Pipeline::num_pipes() const noexcept { + return sizeof...(Ps); +} + +// Function: num_tokens +template +size_t Pipeline::num_tokens() const noexcept { + return _num_tokens; +} + +// Function: graph +template +Graph& Pipeline::graph() { + return _graph; +} + +// Function: reset +template +void Pipeline::reset() { + + _num_tokens = 0; + + for(size_t l = 0; l(_meta[f].type), std::memory_order_relaxed + ); + } + } + + for(size_t f=1; f(_meta[0].type) - 1, std::memory_order_relaxed + ); + } +} + +// Procedure: _on_pipe +template +void Pipeline::_on_pipe(Pipeflow& pf, Runtime& rt) { + visit_tuple([&](auto&& pipe){ + using callable_t = typename std::decay_t::callable_t; + if constexpr (std::is_invocable_v) { + pipe._callable(pf); + } + else if constexpr(std::is_invocable_v) { + pipe._callable(pf, rt); + } + else { + static_assert(dependent_false_v, "un-supported pipe callable type"); + } + }, _pipes, pf._pipe); +} + +// Procedure: _check_dependents +// Check and remove invalid dependents after on_pipe +// For example, users may defer a pipeflow to multiple tokens, +// and we need to remove invalid tokens. 
+// 12.defer(7); // valid only if 7 is deferred, or invalid otherwise +// 12.defer(16); // 16 is valid +template +void Pipeline::_check_dependents(Pipeflow& pf) { + //if (pf._dependents.size()) { + ++pf._num_deferrals; + + for (auto it = pf._dependents.begin(); it != pf._dependents.end();) { + + // valid (e.g., 12.defer(16)) + if (*it >= _num_tokens) { + _token_dependencies[*it].push_back(pf._token); + _longest_deferral = std::max(_longest_deferral, *it); + ++it; + } + // valid or invalid (e.g., 12.defer(7)) + else { + auto pit = _deferred_tokens.find(*it); + + // valid (e.g., 7 is deferred) + if (pit != _deferred_tokens.end()) { + _token_dependencies[*it].push_back(pf._token); + ++it; + } + + // invalid (e.g., 7 is finished - this this 12.defer(7) is dummy) + else { + it = pf._dependents.erase(it); + } + } + } +} + +// Procedure: _construct_deferred_tokens +// Construct a data structure for a deferred token +// +// For example, +// 12.defer(7); 12.defer(16); +// After _check_dependents, 12 needs to be deferred, +// so we will construct a data structure for 12 using hashmap: +// {key: 12, value: DeferredPipeflow of 12} +template +void Pipeline::_construct_deferred_tokens(Pipeflow& pf) { + + //auto res = _deferred_tokens.emplace( + // pf._token, DeferredPipeflow{pf._token, pf._num_deferrals, std::move(pf._dependents)} + //); + + // construct the deferred pipeflow with zero copy + //auto res = _deferred_tokens.emplace( + _deferred_tokens.emplace( + std::piecewise_construct, + std::forward_as_tuple(pf._token), + std::forward_as_tuple( + pf._token, pf._num_deferrals, std::move(pf._dependents) + ) + ); + + //assert(res.second == true); +} + +// Procedure: _resolve_token_dependencies +// Resolve dependencies for tokens that defer to current token +// +// For example, +// 12.defer(16); +// 13.defer(16); +// _token_dependencies will have the entry +// {key: 16, value: std::vector{12, 13}} +// +// When 16 finishes, we need to remove 16 from 12's and 13's +// individual_dependents +template +void Pipeline::_resolve_token_dependencies(Pipeflow& pf) { + + if (auto it = _token_dependencies.find(pf._token); + it != _token_dependencies.end()) { + + // iterate tokens that defer to pf._token + // (e.g., 12 and 13) + for(size_t target : it->second) { + + auto dpf = _deferred_tokens.find(target); + + assert(dpf != _deferred_tokens.end()); + + // erase pf._token from target's _dependents + // (e.g., remove 16 from 12's dependents) + dpf->second._dependents.erase(pf._token); + // dpf->second._dependent_satellites[pf._token] + //); + + // target has no dependents + if (dpf->second._dependents.empty()) { + + // push target into _ready_tokens queue + _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals); + //_ready_tokens.push( + // std::make_pair(dpf->second._token, dpf->second._num_deferrals) + //); + + // erase target from _deferred_tokens + _deferred_tokens.erase(dpf); + } + } + + // remove pf._token from _token_dependencies + // (e.g., remove the entry + // {key: 16, value: std::vector{12, 13}} from _token_dependencies) + _token_dependencies.erase(it); + } +} + +// Procedure: _build +template +void Pipeline::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + 
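+ // Each pass through this label executes one (line, pipe) stage.
+ // Before running the stage, re-arm this slot's join counter so the
+ // next token travelling through it sees the right dependency count.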
_lines[pf->_line][pf->_pipe].join_counter.store( + static_cast(_meta[pf->_pipe].type), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. + if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_meta[c_f].type == PipeType::SERIAL && + _lines[n_l][c_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_lines[pf->_line][n_f].join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + // downward dependency + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + // forward dependency + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +// ---------------------------------------------------------------------------- +// Class Definition: ScalablePipeline +// ---------------------------------------------------------------------------- + +/** +@class ScalablePipeline + +@brief class to create a scalable pipeline object + +@tparam P type of the iterator to a range of pipes + +A scalable pipeline is a composable graph object for users to create a +pipeline scheduling framework using a module task in a taskflow. +Unlike tf::Pipeline that instantiates all pipes upon the construction time, +tf::ScalablePipeline allows variable assignments of pipes using range iterators. +Users can also reset a scalable pipeline to a different range of pipes +between runs. 
The following code creates a scalable pipeline of four +parallel lines to schedule tokens through three serial pipes in a custom storage, +then resetting the pipeline to a new range of five serial pipes: + +@code{.cpp} +tf::Taskflow taskflow("pipeline"); +tf::Executor executor; + +const size_t num_lines = 4; + +// create data storage +std::array buffer; + +// define the pipe callable +auto pipe_callable = [&buffer] (tf::Pipeflow& pf) mutable { + switch(pf.pipe()) { + // first stage generates only 5 scheduling tokens and saves the + // token number into the buffer. + case 0: { + if(pf.token() == 5) { + pf.stop(); + } + else { + printf("stage 1: input token = %zu\n", pf.token()); + buffer[pf.line()] = pf.token(); + } + return; + } + break; + + // other stages propagate the previous result to this pipe and + // increment it by one + default: { + printf( + "stage %zu: input buffer[%zu] = %d\n", pf.pipe(), pf.line(), buffer[pf.line()] + ); + buffer[pf.line()] = buffer[pf.line()] + 1; + } + break; + } +}; + +// create a vector of three pipes +std::vector< tf::Pipe> > pipes; + +for(size_t i=0; i<3; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} + +// create a pipeline of four parallel lines based on the given vector of pipes +tf::ScalablePipeline pl(num_lines, pipes.begin(), pipes.end()); + +// build the pipeline graph using composition +tf::Task init = taskflow.emplace([](){ std::cout << "ready\n"; }) + .name("starting pipeline"); +tf::Task task = taskflow.composed_of(pl) + .name("pipeline"); +tf::Task stop = taskflow.emplace([](){ std::cout << "stopped\n"; }) + .name("pipeline stopped"); + +// create task dependency +init.precede(task); +task.precede(stop); + +// dump the pipeline graph structure (with composition) +taskflow.dump(std::cout); + +// run the pipeline +executor.run(taskflow).wait(); + +// reset the pipeline to a new range of five pipes and starts from +// the initial state (i.e., token counts from zero) +for(size_t i=0; i<2; i++) { + pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable); +} +pl.reset(pipes.begin(), pipes.end()); + +executor.run(taskflow).wait(); +@endcode + +The above example creates a pipeline graph that schedules five tokens over +four parallel lines in a circular fashion, first going through three serial pipes +and then five serial pipes: + +@code{.shell-session} +# initial construction of three serial pipes +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o +| | | +v v v +o -> o -> o + +# resetting to a new range of five serial pipes +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +| | | | | +v v v v v +o -> o -> o -> o -> o +@endcode + +Each pipe has the same type of `%tf::Pipe<%std::function>` +and is kept in a vector that is amenable to change. +We construct the scalable pipeline using two range iterators pointing to the +beginning and the end of the vector. +At each pipe stage, the program propagates the result to the next pipe +by adding one to the result stored in a custom data storage, @c buffer. +The pipeline scheduler will generate five scheduling tokens and then stop. + +A scalable pipeline is move-only. +*/ +template +class ScalablePipeline { + + /** + @private + */ + struct Line { + std::atomic join_counter; + }; + + public: + + /** + @brief pipe type + */ + using pipe_t = typename std::iterator_traits
<P>
::value_type; + + /** + @brief default constructor + */ + ScalablePipeline() = default; + + /** + @brief constructs an empty scalable pipeline object + + @param num_lines the number of parallel lines + + An empty scalable pipeline does not have any pipes. + The pipeline needs to be reset to a valid range of pipes + before running. + */ + ScalablePipeline(size_t num_lines); + + /** + @brief constructs a scalable pipeline object + + @param num_lines the number of parallel lines + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + Constructs a pipeline from the given range of pipes specified in + [first, last) using @c num_lines parallel lines. + The first pipe must define a serial direction (tf::PipeType::SERIAL) + or an exception will be thrown. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. + */ + ScalablePipeline(size_t num_lines, P first, P last); + + /** + @brief disabled copy constructor + */ + ScalablePipeline(const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Constructs a pipeline from the given @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline(ScalablePipeline&& rhs); + + /** + @brief disabled copy assignment operator + */ + ScalablePipeline& operator = (const ScalablePipeline&) = delete; + + /** + @brief move constructor + + Replaces the contents with those of @c rhs using move semantics + (i.e. the data in @c rhs is moved into this pipeline). + After the move, @c rhs is in a state as if it is just constructed. + The behavior is undefined if @c rhs is running during the move. + */ + ScalablePipeline& operator = (ScalablePipeline&& rhs); + + /** + @brief queries the number of parallel lines + + The function returns the number of parallel lines given by the user + upon the construction of the pipeline. + The number of lines represents the maximum parallelism this pipeline + can achieve. + */ + size_t num_lines() const noexcept; + + /** + @brief queries the number of pipes + + The Function returns the number of pipes given by the user + upon the construction of the pipeline. + */ + size_t num_pipes() const noexcept; + + /** + @brief resets the pipeline + + Resets the pipeline to the initial state. After resetting a pipeline, + its token identifier will start from zero. + */ + void reset(); + + /** + @brief resets the pipeline with a new range of pipes + + @param first iterator to the beginning of the range + @param last iterator to the end of the range + + The member function assigns the pipeline to a new range of pipes + specified in [first, last) and resets the pipeline to the + initial state. After resetting a pipeline, its token identifier will + start from zero. + + Internally, the scalable pipeline copies the iterators + from the specified range. Those pipe callables pointed to by + these iterators must remain valid during the execution of the pipeline. 
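+
+ For example, reusing the names from the class-level example above
+ (@c pipes, @c pl, @c executor, and @c taskflow), one can grow the pipe
+ vector between runs and rebind the pipeline to it:
+
+ @code{.cpp}
+ pipes.emplace_back(tf::PipeType::SERIAL, pipe_callable);
+ pl.reset(pipes.begin(), pipes.end());
+ executor.run(taskflow).wait();
+ @endcode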
+ */
+ void reset(P first, P last);
+
+ /**
+ @brief resets the pipeline to a new number of lines and a
+ new range of pipes
+
+ @param num_lines number of parallel lines
+ @param first iterator to the beginning of the range
+ @param last iterator to the end of the range
+
+ The member function resets the pipeline to a new number of
+ parallel lines and a new range of pipes specified in
+ [first, last), as if the pipeline were just constructed.
+ After resetting a pipeline, its token identifier will start from zero.
+
+ Internally, the scalable pipeline copies the iterators
+ from the specified range. Those pipe callables pointed to by
+ these iterators must remain valid during the execution of the pipeline.
+ */
+ void reset(size_t num_lines, P first, P last);
+
+ /**
+ @brief queries the number of generated tokens in the pipeline
+
+ The number represents the total scheduling tokens that have been
+ generated by the pipeline so far.
+ */
+ size_t num_tokens() const noexcept;
+
+ /**
+ @brief obtains the graph object associated with the pipeline construct
+
+ This method is primarily used as an opaque data structure for creating
+ a module task of this pipeline.
+ */
+ Graph& graph();
+
+ private:
+
+ Graph _graph;
+
+ size_t _num_tokens{0};
+
+ std::vector<P> _pipes;
+ std::vector<Task> _tasks;
+ std::vector<Pipeflow> _pipeflows;
+ std::unique_ptr<Line[]> _lines;
+
+ // token deferral bookkeeping (mirrors the corresponding tf::Pipeline members)
+ std::queue<std::pair<size_t, size_t>> _ready_tokens;
+ std::unordered_map<size_t, std::vector<size_t>> _token_dependencies;
+ std::unordered_map<size_t, DeferredPipeflow> _deferred_tokens;
+ size_t _longest_deferral = 0;
+
+ void _check_dependents(Pipeflow&);
+ void _construct_deferred_tokens(Pipeflow&);
+ void _resolve_token_dependencies(Pipeflow&);
+
+ void _on_pipe(Pipeflow&, Runtime&);
+ void _build();
+
+ Line& _line(size_t, size_t);
+};
+
+// constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(size_t num_lines) :
+ _tasks (num_lines + 1),
+ _pipeflows (num_lines) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ _build();
+}
+
+// constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(size_t num_lines, P first, P last) :
+ _tasks (num_lines + 1),
+ _pipeflows (num_lines) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ reset(first, last);
+ _build();
+}
+
+// move constructor
+template <typename P>
+ScalablePipeline<P>
::ScalablePipeline(ScalablePipeline&& rhs) :
+ _graph {std::move(rhs._graph)},
+ _num_tokens {rhs._num_tokens},
+ _pipes {std::move(rhs._pipes)},
+ _tasks {std::move(rhs._tasks)},
+ _pipeflows {std::move(rhs._pipeflows)},
+ _lines {std::move(rhs._lines)},
+ _ready_tokens {std::move(rhs._ready_tokens)},
+ _token_dependencies {std::move(rhs._token_dependencies)},
+ _deferred_tokens {std::move(rhs._deferred_tokens)},
+ _longest_deferral {rhs._longest_deferral} {
+
+ rhs._longest_deferral = 0;
+ rhs._num_tokens = 0;
+}
+
+// move assignment operator
+template <typename P>
+ScalablePipeline<P>
& ScalablePipeline<P>
::operator = (ScalablePipeline&& rhs) {
+ _graph = std::move(rhs._graph);
+ _num_tokens = rhs._num_tokens;
+ _pipes = std::move(rhs._pipes);
+ _tasks = std::move(rhs._tasks);
+ _pipeflows = std::move(rhs._pipeflows);
+ _lines = std::move(rhs._lines);
+ rhs._num_tokens = 0;
+ _ready_tokens = std::move(rhs._ready_tokens);
+ _token_dependencies = std::move(rhs._token_dependencies);
+ _deferred_tokens = std::move(rhs._deferred_tokens);
+ _longest_deferral = rhs._longest_deferral;
+ rhs._longest_deferral = 0;
+ return *this;
+}
+
+// Function: num_lines
+template <typename P>
+size_t ScalablePipeline<P>
::num_lines() const noexcept {
+ return _pipeflows.size();
+}
+
+// Function: num_pipes
+template <typename P>
+size_t ScalablePipeline<P>
::num_pipes() const noexcept {
+ return _pipes.size();
+}
+
+// Function: num_tokens
+template <typename P>
+size_t ScalablePipeline<P>
::num_tokens() const noexcept {
+ return _num_tokens;
+}
+
+// Function: graph
+template <typename P>
+Graph& ScalablePipeline<P>
::graph() {
+ return _graph;
+}
+
+// Function: _line
+template <typename P>
+typename ScalablePipeline<P>
::Line& ScalablePipeline<P>
::_line(size_t l, size_t p) {
+ return _lines[l*num_pipes() + p];
+}
+
+template <typename P>
+void ScalablePipeline<P>
::reset(size_t num_lines, P first, P last) {
+
+ if(num_lines == 0) {
+ TF_THROW("must have at least one line");
+ }
+
+ _graph.clear();
+ _tasks.resize(num_lines + 1);
+ _pipeflows.resize(num_lines);
+
+ reset(first, last);
+
+ _build();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>
::reset(P first, P last) {
+
+ size_t num_pipes = static_cast<size_t>(std::distance(first, last));
+
+ if(num_pipes == 0) {
+ TF_THROW("pipeline cannot be empty");
+ }
+
+ if(first->type() != PipeType::SERIAL) {
+ TF_THROW("first pipe must be serial");
+ }
+
+ _pipes.resize(num_pipes);
+
+ size_t i=0;
+ for(auto itr = first; itr != last; itr++) {
+ _pipes[i++] = itr;
+ }
+
+ _lines = std::make_unique<Line[]>(num_lines() * _pipes.size());
+
+ reset();
+}
+
+// Function: reset
+template <typename P>
+void ScalablePipeline<P>
::reset() {
+
+ _num_tokens = 0;
+
+ for(size_t l = 0; l<num_lines(); l++) {
+ _pipeflows[l]._pipe = 0;
+ _pipeflows[l]._line = l;
+ }
+
+ _line(0, 0).join_counter.store(0, std::memory_order_relaxed);
+
+ for(size_t l=1; l<num_lines(); l++) {
+ for(size_t f=1; f<num_pipes(); f++) {
+ _line(l, f).join_counter.store(
+ static_cast<size_t>(_pipes[f]->type()), std::memory_order_relaxed
+ );
+ }
+ }
+
+ for(size_t f=1; f<num_pipes(); f++) {
+ _line(0, f).join_counter.store(1, std::memory_order_relaxed);
+ }
+
+ for(size_t l=1; l<num_lines(); l++) {
+ _line(l, 0).join_counter.store(
+ static_cast<size_t>(_pipes[0]->type()) - 1, std::memory_order_relaxed
+ );
+ }
+
+ assert(_ready_tokens.empty() == true);
+ _token_dependencies.clear();
+ _deferred_tokens.clear();
+}
+
+// Procedure: _on_pipe
+template <typename P>
+void ScalablePipeline<P>
::_on_pipe(Pipeflow& pf, Runtime& rt) {
+
+ using callable_t = typename pipe_t::callable_t;
+
+ if constexpr (std::is_invocable_v<callable_t, Pipeflow&>) {
+ _pipes[pf._pipe]->_callable(pf);
+ }
+ else if constexpr(std::is_invocable_v<callable_t, Pipeflow&, Runtime&>) {
+ _pipes[pf._pipe]->_callable(pf, rt);
+ }
+ else {
+ static_assert(dependent_false_v<callable_t>, "un-supported pipe callable type");
+ }
+}
+
+template <typename P>
+void ScalablePipeline<P>
::_check_dependents(Pipeflow& pf) {
+ ++pf._num_deferrals;
+
+ for (auto it = pf._dependents.begin(); it != pf._dependents.end();) {
+
+ // valid (e.g., 12.defer(16))
+ if (*it >= _num_tokens) {
+ _token_dependencies[*it].push_back(pf._token);
+ _longest_deferral = std::max(_longest_deferral, *it);
+ ++it;
+ }
+ // valid or invalid (e.g., 12.defer(7))
+ else {
+ auto pit = _deferred_tokens.find(*it);
+
+ // valid (e.g., 7 is deferred)
+ if (pit != _deferred_tokens.end()) {
+ _token_dependencies[*it].push_back(pf._token);
+ ++it;
+ }
+
+ // invalid (e.g., 7 has already finished, so this dependency is dropped)
+ else {
+ it = pf._dependents.erase(it);
+ }
+ }
+ }
+}
+
+// Procedure: _construct_deferred_tokens
+// Construct a data structure for a deferred token
+template <typename P>
+void ScalablePipeline<P>
::_construct_deferred_tokens(Pipeflow& pf) {
+
+ // construct the deferred pipeflow with zero copy
+ _deferred_tokens.emplace(
+ std::piecewise_construct,
+ std::forward_as_tuple(pf._token),
+ std::forward_as_tuple(
+ pf._token, pf._num_deferrals, std::move(pf._dependents)
+ )
+ );
+}
+
+// Procedure: _resolve_token_dependencies
+// Resolve dependencies for tokens that defer to current token
+template <typename P>
+void ScalablePipeline<P>
::_resolve_token_dependencies(Pipeflow& pf) {
+
+ if (auto it = _token_dependencies.find(pf._token);
+ it != _token_dependencies.end()) {
+
+ // iterate tokens that defer to pf._token
+ for(size_t target : it->second) {
+
+ auto dpf = _deferred_tokens.find(target);
+
+ assert(dpf != _deferred_tokens.end());
+
+ // erase pf._token from target's _dependents
+ dpf->second._dependents.erase(pf._token);
+
+ // target has no dependents
+ if (dpf->second._dependents.empty()) {
+ _ready_tokens.emplace(dpf->second._token, dpf->second._num_deferrals);
+ _deferred_tokens.erase(dpf);
+ }
+ }
+
+ _token_dependencies.erase(it);
+ }
+}
+
+// Procedure: _build
+template <typename P>
+void ScalablePipeline<P>
::_build() { + + using namespace std::literals::string_literals; + + FlowBuilder fb(_graph); + + // init task + _tasks[0] = fb.emplace([this]() { + return static_cast(_num_tokens % num_lines()); + }).name("cond"); + + // line task + for(size_t l = 0; l < num_lines(); l++) { + + _tasks[l + 1] = fb.emplace([this, l] (tf::Runtime& rt) mutable { + + auto pf = &_pipeflows[l]; + + pipeline: + + _line(pf->_line, pf->_pipe).join_counter.store( + static_cast(_pipes[pf->_pipe]->type()), std::memory_order_relaxed + ); + + // First pipe does all jobs of initialization and token dependencies + if (pf->_pipe == 0) { + // _ready_tokens queue is not empty + // substitute pf with the token at the front of the queue + if (!_ready_tokens.empty()) { + pf->_token = _ready_tokens.front().first; + pf->_num_deferrals = _ready_tokens.front().second; + _ready_tokens.pop(); + } + else { + pf->_token = _num_tokens; + pf->_num_deferrals = 0; + } + + handle_token_dependency: + + if (pf->_stop = false, _on_pipe(*pf, rt); pf->_stop == true) { + // here, the pipeline is not stopped yet because other + // lines of tasks may still be running their last stages + return; + } + + if (_num_tokens == pf->_token) { + ++_num_tokens; + } + + if (pf->_dependents.empty() == false){ + // check if the pf->_dependents have valid dependents + _check_dependents(*pf); + + // tokens in pf->_dependents are all valid dependents + if (pf->_dependents.size()) { + + // construct a data structure for pf in _deferred_tokens + _construct_deferred_tokens(*pf); + goto pipeline; + } + + // tokens in pf->_dependents are invalid dependents + // directly goto on_pipe on the same line + else { + goto handle_token_dependency; + } + } + + // Every token within the deferral range needs to check + // if it can resolve dependencies on other tokens. 
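+ // (e.g., if 2.defer(16) was recorded earlier, token 16 reaching this
+ // point erases itself from token 2's dependent set; once that set is
+ // empty, token 2 is moved into _ready_tokens and rescheduled)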
+ if (pf->_token <= _longest_deferral) { + _resolve_token_dependencies(*pf); + } + } + else { + _on_pipe(*pf, rt); + } + + size_t c_f = pf->_pipe; + size_t n_f = (pf->_pipe + 1) % num_pipes(); + size_t n_l = (pf->_line + 1) % num_lines(); + + pf->_pipe = n_f; + + // ---- scheduling starts here ---- + // Notice that the shared variable f must not be changed after this + // point because it can result in data race due to the following + // condition: + // + // a -> b + // | | + // v v + // c -> d + // + // d will be spawned by either c or b, so if c changes f but b spawns d + // then data race on f will happen + + std::array retval; + size_t n = 0; + + // downward dependency + if(_pipes[c_f]->type() == PipeType::SERIAL && + _line(n_l, c_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 1; + } + + // forward dependency + if(_line(pf->_line, n_f).join_counter.fetch_sub( + 1, std::memory_order_acq_rel) == 1 + ) { + retval[n++] = 0; + } + + // notice that the task index starts from 1 + switch(n) { + case 2: { + rt.schedule(_tasks[n_l+1]); + goto pipeline; + } + case 1: { + if (retval[0] == 1) { + pf = &_pipeflows[n_l]; + } + goto pipeline; + } + } + }).name("rt-"s + std::to_string(l)); + + _tasks[0].precede(_tasks[l+1]); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + diff --git a/lib/taskflow/algorithm/reduce.hpp b/lib/taskflow/algorithm/reduce.hpp new file mode 100644 index 0000000..64869dc --- /dev/null +++ b/lib/taskflow/algorithm/reduce.hpp @@ -0,0 +1,295 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: make_reduce_task +template +TF_FORCE_INLINE auto make_reduce_task(B beg, E end, T& init, O bop, P&& part) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=beg, e=end, &r=init, bop, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(r, *beg++)); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+2](size_t curr_b, size_t curr_e) mutable { + + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + + for(size_t x=curr_b; x lock(mtx); + r = bop(r, sum); + + }); + } + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + launch_loop(N, W, rt, next, part, [=, &bop, &mtx, &next, &r, &part] () mutable { + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mtx); + r = bop(r, *beg); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(*beg1, *beg2); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mtx); + r = bop(r, sum); + }); + } + }; +} + +// Function: make_transform_reduce_task +template +TF_FORCE_INLINE auto make_transform_reduce_task( + B beg, E end, T& init, BOP bop, UOP uop, P&& part +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using namespace std::string_literals; + + return + [b=beg, e=end, &r=init, bop, uop, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + for(; beg!=end; r = bop(std::move(r), uop(*beg++))); + return; + } + + if(N < W) { + W = N; + } + + std::mutex mtx; + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + + size_t chunk_size; + + for(size_t w=0, curr_b=0; w lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + //auto beg1 = beg++; + //auto beg2 = beg++; + //T sum = bop(uop(*beg1), uop(*beg2)); + + T sum = (chunk_size == 1) ? uop(*beg++) : bop(uop(*beg++), uop(*beg++)); + + // loop reduce + part.loop(N, W, curr_b, chunk_size, + [&, prev_e=curr_b+(chunk_size == 1 ? 1 : 2)] + (size_t curr_b, size_t curr_e) mutable { + if(curr_b > prev_e) { + std::advance(beg, curr_b - prev_e); + } + else { + curr_b = prev_e; + } + for(size_t x=curr_b; x lock(mtx); + r = bop(std::move(r), std::move(sum)); + + }); + } + + rt.join(); + } + // dynamic partitioner + else { + std::atomic next(0); + + launch_loop(N, W, rt, next, part, [=, &bop, &uop, &mtx, &next, &r, &part] () mutable { + + // pre-reduce + size_t s0 = next.fetch_add(2, std::memory_order_relaxed); + + if(s0 >= N) { + return; + } + + std::advance(beg, s0); + + if(N - s0 == 1) { + std::lock_guard lock(mtx); + r = bop(std::move(r), uop(*beg)); + return; + } + + auto beg1 = beg++; + auto beg2 = beg++; + + T sum = bop(uop(*beg1), uop(*beg2)); + + // loop reduce + part.loop(N, W, next, + [&, prev_e=s0+2](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + for(size_t x=curr_b; x lock(mtx); + r = bop(std::move(r), std::move(sum)); + }); + } + }; +} + +} // end of namespace detail ------------------------------------------------- + +// ---------------------------------------------------------------------------- +// default reduction +// ---------------------------------------------------------------------------- + +// Function: reduce +template +Task FlowBuilder::reduce(B beg, E end, T& init, O bop, P&& part) { + return emplace(detail::make_reduce_task( + beg, end, init, bop, std::forward
<P>
(part) + )); +} + +// ---------------------------------------------------------------------------- +// default transform and reduction +// ---------------------------------------------------------------------------- + +// Function: transform_reduce +template +Task FlowBuilder::transform_reduce( + B beg, E end, T& init, BOP bop, UOP uop, P&& part +) { + return emplace(detail::make_transform_reduce_task( + beg, end, init, bop, uop, std::forward
<P>
(part) + )); +} + +} // end of namespace tf ----------------------------------------------------- + + + + diff --git a/lib/taskflow/algorithm/scan.hpp b/lib/taskflow/algorithm/scan.hpp new file mode 100644 index 0000000..cccb205 --- /dev/null +++ b/lib/taskflow/algorithm/scan.hpp @@ -0,0 +1,614 @@ +#pragma once + +#include "launch.hpp" + +namespace tf { + +namespace detail { + +// Function: scan_loop +template +TF_FORCE_INLINE void scan_loop( + tf::Runtime& rt, + std::atomic& counter, + BufferT& buf, + B&& bop, + Iterator d_beg, + size_t W, + size_t w, + size_t chunk_size +){ + // whoever finishes the last performs global scan + if(counter.fetch_add(1, std::memory_order_acq_rel) == W-1) { + for(size_t i=1; i +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + //auto orig_d_beg = d_beg; + //ExecutionPolicy policy; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_inclusive_scan_task(B first, E last, D d_first, BOP bop, T init) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::inclusive_scan(s_beg, s_end, d_beg, bop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_inclusive_scan_task( + B first, E last, D d_first, BOP bop, UOP uop, T init 
+) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_inclusive_scan(s_beg, s_end, d_beg, bop, uop, init); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + // set up the initial value for the first worker + buf[0].data = std::move(init); + + size_t Q = N/W; + size_t R = N%W; + + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::exclusive_scan(s_beg, s_end, d_beg, init, bop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w +TF_FORCE_INLINE auto make_transform_exclusive_scan_task( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + using D_t = std::decay_t>; + using value_type = typename std::iterator_traits::value_type; + using namespace std::string_literals; + + return [=] (Runtime& rt) mutable { + + // fetch the stateful values + B_t s_beg = first; + E_t s_end = last; + D_t d_beg = d_first; + + if(s_beg == s_end) { + return; + } + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(s_beg, s_end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= 2) { + std::transform_exclusive_scan(s_beg, s_end, d_beg, init, bop, uop); + return; + } + + if(N < W) { + W = N; + } + + std::vector> buf(W); + std::atomic counter(0); + + size_t Q = N/W; + size_t R = N%W; + + // fetch the init value + auto s_beg_temp = s_beg; + for(size_t w=0, curr_b=0, chunk_size; w +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop) { + return emplace(detail::make_inclusive_scan_task( + first, last, d_first, bop + )); +} + +// Function: inclusive_scan +template +Task FlowBuilder::inclusive_scan(B first, E last, D d_first, BOP bop, T init) { + return emplace(detail::make_inclusive_scan_task( + first, last, d_first, bop, init + )); +} + +// ---------------------------------------------------------------------------- +// Transform Inclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_inclusive_scan +template +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop +) { + return emplace(detail::make_transform_inclusive_scan_task( + first, last, d_first, bop, uop + )); +} + 
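+// A usage sketch for the scan algorithms above (illustrative only; the
+// taskflow object and the containers are assumed to exist at the call site):
+//
+//   std::vector<int> in{1, 2, 3}, out(3);
+//   taskflow.transform_inclusive_scan(
+//     in.begin(), in.end(), out.begin(), std::plus<int>{},
+//     [](int x){ return 2 * x; }
+//   );
+//   // after executor.run(taskflow).wait(): out == {2, 6, 12}
+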
+// Function: transform_inclusive_scan +template +Task FlowBuilder::transform_inclusive_scan( + B first, E last, D d_first, BOP bop, UOP uop, T init +) { + return emplace(detail::make_transform_inclusive_scan_task( + first, last, d_first, bop, uop, init + )); +} + +// ---------------------------------------------------------------------------- +// Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: exclusive_scan +template +Task FlowBuilder::exclusive_scan(B first, E last, D d_first, T init, BOP bop) { + return emplace(detail::make_exclusive_scan_task( + first, last, d_first, init, bop + )); +} + +// ---------------------------------------------------------------------------- +// Transform Exclusive Scan +// ---------------------------------------------------------------------------- + +// Function: transform_exclusive_scan +template +Task FlowBuilder::transform_exclusive_scan( + B first, E last, D d_first, T init, BOP bop, UOP uop +) { + return emplace(detail::make_transform_exclusive_scan_task( + first, last, d_first, init, bop, uop + )); +} + +} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/algorithm/sort.hpp b/lib/taskflow/algorithm/sort.hpp new file mode 100644 index 0000000..a4fdf3c --- /dev/null +++ b/lib/taskflow/algorithm/sort.hpp @@ -0,0 +1,648 @@ +#pragma once + +#include "../core/async.hpp" + +namespace tf { + +// threshold whether or not to perform parallel sort +template +constexpr size_t parallel_sort_cutoff() { + + //using value_type = std::decay_t())>; + using value_type = typename std::iterator_traits::value_type; + + constexpr size_t object_size = sizeof(value_type); + + if constexpr(std::is_same_v) { + return 65536 / sizeof(std::string); + } + else { + if constexpr(object_size < 16) return 4096; + else if constexpr(object_size < 32) return 2048; + else if constexpr(object_size < 64) return 1024; + else if constexpr(object_size < 128) return 768; + else if constexpr(object_size < 256) return 512; + else if constexpr(object_size < 512) return 256; + else return 128; + } +} + +// ---------------------------------------------------------------------------- +// pattern-defeating quick sort (pdqsort) +// https://github.com/orlp/pdqsort/ +// ---------------------------------------------------------------------------- + +template +inline T* align_cacheline(T* p) { +#if defined(UINTPTR_MAX) && __cplusplus >= 201103L + std::uintptr_t ip = reinterpret_cast(p); +#else + std::size_t ip = reinterpret_cast(p); +#endif + ip = (ip + cacheline_size - 1) & -cacheline_size; + return reinterpret_cast(ip); +} + +template +inline void swap_offsets( + Iter first, Iter last, + unsigned char* offsets_l, unsigned char* offsets_r, + size_t num, bool use_swaps +) { + typedef typename std::iterator_traits::value_type T; + if (use_swaps) { + // This case is needed for the descending distribution, where we need + // to have proper swapping for pdqsort to remain O(n). + for (size_t i = 0; i < num; ++i) { + std::iter_swap(first + offsets_l[i], last - offsets_r[i]); + } + } else if (num > 0) { + Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0]; + T tmp(std::move(*l)); *l = std::move(*r); + for (size_t i = 1; i < num; ++i) { + l = first + offsets_l[i]; *r = std::move(*l); + r = last - offsets_r[i]; *l = std::move(*r); + } + *r = std::move(tmp); + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. 
+template +void insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first to avoid 2 moves for an element + // already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + *shift = std::move(tmp); + } + } +} + +// Sorts [begin, end) using insertion sort with the given comparison function. +// Assumes *(begin - 1) is an element smaller than or equal to any element +// in [begin, end). +template +void unguarded_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + if (begin == end) { + return; + } + + for (RandItr cur = begin + 1; cur != end; ++cur) { + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + } + } +} + +// Attempts to use insertion sort on [begin, end). +// Will return false if more than +// partial_insertion_sort_limit elements were moved, +// and abort sorting. Otherwise it will successfully sort and return true. +template +bool partial_insertion_sort(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + using D = typename std::iterator_traits::difference_type; + + // When we detect an already sorted partition, attempt an insertion sort + // that allows this amount of element moves before giving up. + constexpr auto partial_insertion_sort_limit = D{8}; + + if (begin == end) return true; + + auto limit = D{0}; + + for (RandItr cur = begin + 1; cur != end; ++cur) { + + if (limit > partial_insertion_sort_limit) { + return false; + } + + RandItr shift = cur; + RandItr shift_1 = cur - 1; + + // Compare first so we can avoid 2 moves + // for an element already positioned correctly. + if (comp(*shift, *shift_1)) { + T tmp = std::move(*shift); + + do { + *shift-- = std::move(*shift_1); + }while (shift != begin && comp(tmp, *--shift_1)); + + *shift = std::move(tmp); + limit += cur - shift; + } + } + + return true; +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal +// to the pivot are put in the right-hand partition. Returns the position of the pivot after +// partitioning and whether the passed sequence already was correctly partitioned. Assumes the +// pivot is a median of at least 3 elements and that [begin, end) is at least +// insertion_sort_threshold long. Uses branchless partitioning. +template +std::pair partition_right_branchless(Iter begin, Iter end, Compare comp) { + + typedef typename std::iterator_traits::value_type T; + + constexpr size_t block_size = 64; + constexpr size_t cacheline_size = 64; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot (the median of 3 guarantees + // this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. We have to guard this search if + // there was no element before *first. 
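+ // (if the pivot happens to be the smallest element, no strictly smaller
+ // element exists, and an unguarded --last would run off the front)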
+ if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while ( !comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition are the same element, + // the passed in sequence already was correctly partitioned. + bool already_partitioned = first >= last; + if (!already_partitioned) { + std::iter_swap(first, last); + ++first; + + // The following branchless partitioning is derived from "BlockQuicksort: How Branch + // Mispredictions don't affect Quicksort" by Stefan Edelkamp and Armin Weiss, but + // heavily micro-optimized. + unsigned char offsets_l_storage[block_size + cacheline_size]; + unsigned char offsets_r_storage[block_size + cacheline_size]; + unsigned char* offsets_l = align_cacheline(offsets_l_storage); + unsigned char* offsets_r = align_cacheline(offsets_r_storage); + + Iter offsets_l_base = first; + Iter offsets_r_base = last; + size_t num_l, num_r, start_l, start_r; + num_l = num_r = start_l = start_r = 0; + + while (first < last) { + // Fill up offset blocks with elements that are on the wrong side. + // First we determine how much elements are considered for each offset block. + size_t num_unknown = last - first; + size_t left_split = num_l == 0 ? (num_r == 0 ? num_unknown / 2 : num_unknown) : 0; + size_t right_split = num_r == 0 ? (num_unknown - left_split) : 0; + + // Fill the offset blocks. + if (left_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } else { + for (size_t i = 0; i < left_split;) { + offsets_l[num_l] = i++; num_l += !comp(*first, pivot); ++first; + } + } + + if (right_split >= block_size) { + for (size_t i = 0; i < block_size;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } else { + for (size_t i = 0; i < right_split;) { + offsets_r[num_r] = ++i; num_r += comp(*--last, pivot); + } + } + + // Swap elements and update block sizes and first/last boundaries. + size_t num = std::min(num_l, num_r); + swap_offsets( + offsets_l_base, offsets_r_base, + offsets_l + start_l, offsets_r + start_r, + num, num_l == num_r + ); + num_l -= num; num_r -= num; + start_l += num; start_r += num; + + if (num_l == 0) { + start_l = 0; + offsets_l_base = first; + } + + if (num_r == 0) { + start_r = 0; + offsets_r_base = last; + } + } + + // We have now fully identified [first, last)'s proper position. Swap the last elements. 
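+ // At most one of the two offset blocks can still hold unswapped entries
+ // here, since each loop iteration drains the smaller block to zero.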
+ if (num_l) { + offsets_l += start_l; + while (num_l--) std::iter_swap(offsets_l_base + offsets_l[num_l], --last); + first = last; + } + if (num_r) { + offsets_r += start_r; + while (num_r--) std::iter_swap(offsets_r_base - offsets_r[num_r], first), ++first; + last = first; + } + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Partitions [begin, end) around pivot *begin using comparison function comp. +// Elements equal to the pivot are put in the right-hand partition. +// Returns the position of the pivot after partitioning and whether the passed +// sequence already was correctly partitioned. +// Assumes the pivot is a median of at least 3 elements and that [begin, end) +// is at least insertion_sort_threshold long. +template +std::pair partition_right(Iter begin, Iter end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + // Move pivot into local for speed. + T pivot(std::move(*begin)); + + Iter first = begin; + Iter last = end; + + // Find the first element greater than or equal than the pivot + // (the median of 3 guarantees/ this exists). + while (comp(*++first, pivot)); + + // Find the first element strictly smaller than the pivot. + // We have to guard this search if there was no element before *first. + if (first - 1 == begin) while (first < last && !comp(*--last, pivot)); + else while (!comp(*--last, pivot)); + + // If the first pair of elements that should be swapped to partition + // are the same element, the passed in sequence already was correctly + // partitioned. + bool already_partitioned = first >= last; + + // Keep swapping pairs of elements that are on the wrong side of the pivot. + // Previously swapped pairs guard the searches, + // which is why the first iteration is special-cased above. + while (first < last) { + std::iter_swap(first, last); + while (comp(*++first, pivot)); + while (!comp(*--last, pivot)); + } + + // Put the pivot in the right place. + Iter pivot_pos = first - 1; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return std::make_pair(pivot_pos, already_partitioned); +} + +// Similar function to the one above, except elements equal to the pivot +// are put to the left of the pivot and it doesn't check or return +// if the passed sequence already was partitioned. +// Since this is rarely used (the many equal case), +// and in that case pdqsort already has O(n) performance, +// no block quicksort is applied here for simplicity. 
+template +RandItr partition_left(RandItr begin, RandItr end, Compare comp) { + + using T = typename std::iterator_traits::value_type; + + T pivot(std::move(*begin)); + + RandItr first = begin; + RandItr last = end; + + while (comp(pivot, *--last)); + + if (last + 1 == end) { + while (first < last && !comp(pivot, *++first)); + } + else { + while (!comp(pivot, *++first)); + } + + while (first < last) { + std::iter_swap(first, last); + while (comp(pivot, *--last)); + while (!comp(pivot, *++first)); + } + + RandItr pivot_pos = last; + *begin = std::move(*pivot_pos); + *pivot_pos = std::move(pivot); + + return pivot_pos; +} + +template +void parallel_pdqsort( + tf::Runtime& rt, + Iter begin, Iter end, Compare comp, + int bad_allowed, bool leftmost = true +) { + + // Partitions below this size are sorted sequentially + constexpr auto cutoff = parallel_sort_cutoff(); + + // Partitions below this size are sorted using insertion sort + constexpr auto insertion_sort_threshold = 24; + + // Partitions above this size use Tukey's ninther to select the pivot. + constexpr auto ninther_threshold = 128; + + //using diff_t = typename std::iterator_traits::difference_type; + + // Use a while loop for tail recursion elimination. + while (true) { + + //diff_t size = end - begin; + size_t size = end - begin; + + // Insertion sort is faster for small arrays. + if (size < insertion_sort_threshold) { + if (leftmost) { + insertion_sort(begin, end, comp); + } + else { + unguarded_insertion_sort(begin, end, comp); + } + return; + } + + if(size <= cutoff) { + std::sort(begin, end, comp); + return; + } + + // Choose pivot as median of 3 or pseudomedian of 9. + //diff_t s2 = size / 2; + size_t s2 = size >> 1; + if (size > ninther_threshold) { + sort3(begin, begin + s2, end - 1, comp); + sort3(begin + 1, begin + (s2 - 1), end - 2, comp); + sort3(begin + 2, begin + (s2 + 1), end - 3, comp); + sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp); + std::iter_swap(begin, begin + s2); + } + else { + sort3(begin + s2, begin, end - 1, comp); + } + + // If *(begin - 1) is the end of the right partition + // of a previous partition operation, there is no element in [begin, end) + // that is smaller than *(begin - 1). + // Then if our pivot compares equal to *(begin - 1) we change strategy, + // putting equal elements in the left partition, + // greater elements in the right partition. + // We do not have to recurse on the left partition, + // since it's sorted (all equal). + if (!leftmost && !comp(*(begin - 1), *begin)) { + begin = partition_left(begin, end, comp) + 1; + continue; + } + + // Partition and get results. + const auto pair = Branchless ? partition_right_branchless(begin, end, comp) : + partition_right(begin, end, comp); + + const auto pivot_pos = pair.first; + const auto already_partitioned = pair.second; + + // Check for a highly unbalanced partition. + //diff_t l_size = pivot_pos - begin; + //diff_t r_size = end - (pivot_pos + 1); + const size_t l_size = pivot_pos - begin; + const size_t r_size = end - (pivot_pos + 1); + const bool highly_unbalanced = l_size < size / 8 || r_size < size / 8; + + // If we got a highly unbalanced partition we shuffle elements + // to break many patterns. + if (highly_unbalanced) { + // If we had too many bad partitions, switch to heapsort + // to guarantee O(n log n). 
+ if (--bad_allowed == 0) { + std::make_heap(begin, end, comp); + std::sort_heap(begin, end, comp); + return; + } + + if (l_size >= insertion_sort_threshold) { + std::iter_swap(begin, begin + l_size / 4); + std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4); + if (l_size > ninther_threshold) { + std::iter_swap(begin + 1, begin + (l_size / 4 + 1)); + std::iter_swap(begin + 2, begin + (l_size / 4 + 2)); + std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1)); + std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2)); + } + } + + if (r_size >= insertion_sort_threshold) { + std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4)); + std::iter_swap(end - 1, end - r_size / 4); + if (r_size > ninther_threshold) { + std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4)); + std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4)); + std::iter_swap(end - 2, end - (1 + r_size / 4)); + std::iter_swap(end - 3, end - (2 + r_size / 4)); + } + } + } + // decently balanced + else { + // sequence try to use insertion sort. + if (already_partitioned && + partial_insertion_sort(begin, pivot_pos, comp) && + partial_insertion_sort(pivot_pos + 1, end, comp) + ) { + return; + } + } + + // Sort the left partition first using recursion and + // do tail recursion elimination for the right-hand partition. + rt.silent_async( + [&rt, begin, pivot_pos, comp, bad_allowed, leftmost] () mutable { + parallel_pdqsort( + rt, begin, pivot_pos, comp, bad_allowed, leftmost + ); + } + ); + begin = pivot_pos + 1; + leftmost = false; + } +} + +// ---------------------------------------------------------------------------- +// 3-way quick sort +// ---------------------------------------------------------------------------- + +// 3-way quick sort +template +void parallel_3wqsort(tf::Runtime& rt, RandItr first, RandItr last, C compare) { + + using namespace std::string_literals; + + constexpr auto cutoff = parallel_sort_cutoff(); + + sort_partition: + + if(static_cast(last - first) < cutoff) { + std::sort(first, last+1, compare); + return; + } + + auto m = pseudo_median_of_nine(first, last, compare); + + if(m != first) { + std::iter_swap(first, m); + } + + auto l = first; + auto r = last; + auto f = std::next(first, 1); + bool is_swapped_l = false; + bool is_swapped_r = false; + + while(f <= r) { + if(compare(*f, *l)) { + is_swapped_l = true; + std::iter_swap(l, f); + l++; + f++; + } + else if(compare(*l, *f)) { + is_swapped_r = true; + std::iter_swap(r, f); + r--; + } + else { + f++; + } + } + + if(l - first > 1 && is_swapped_l) { + //rt.emplace([&](tf::Runtime& rtl) mutable { + // parallel_3wqsort(rtl, first, l-1, compare); + //}); + rt.silent_async([&rt, first, l, &compare] () mutable { + parallel_3wqsort(rt, first, l-1, compare); + }); + } + + if(last - r > 1 && is_swapped_r) { + //rt.emplace([&](tf::Runtime& rtr) mutable { + // parallel_3wqsort(rtr, r+1, last, compare); + //}); + //rt.silent_async([&rt, r, last, &compare] () mutable { + // parallel_3wqsort(rt, r+1, last, compare); + //}); + first = r+1; + goto sort_partition; + } + + //rt.join(); +} + +// ---------------------------------------------------------------------------- +// tf::Taskflow::sort +// ---------------------------------------------------------------------------- + +// Function: sort +template +Task FlowBuilder::sort(B beg, E end, C cmp) { + + Task task = emplace([b=beg, e=end, cmp] (Runtime& rt) mutable { + + using B_t = std::decay_t>; + using E_t = std::decay_t>; + + // fetch the iterator values + B_t beg = b; + E_t end = e; + + 
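+ // empty ranges return immediately; ranges below parallel_sort_cutoff()
+ // (or a single-worker executor) are sorted in place with std::sort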
if(beg == end) {
+      return;
+    }
+
+    size_t W = rt._executor.num_workers();
+    size_t N = std::distance(beg, end);
+
+    // only myself - no need to spawn another graph
+    if(W <= 1 || N <= parallel_sort_cutoff<B_t>()) {
+      std::sort(beg, end, cmp);
+      return;
+    }
+
+    //parallel_3wqsort(rt, beg, end-1, cmp);
+    parallel_pdqsort<B_t, C, is_std_compare_v<std::decay_t<C>> &&
+      std::is_arithmetic_v<typename std::iterator_traits<B_t>::value_type>
+    >(rt, beg, end, cmp, log2(end - beg));
+
+    rt.join();
+  });
+
+  return task;
+}
+
+// Function: sort
+template <typename B, typename E>
+Task FlowBuilder::sort(B beg, E end) {
+  using value_type = std::decay_t<decltype(*std::declval<B>())>;
+  return sort(beg, end, std::less<value_type>{});
+}
+
+} // namespace tf ------------------------------------------------------------
+
diff --git a/lib/taskflow/algorithm/transform.hpp b/lib/taskflow/algorithm/transform.hpp
new file mode 100644
index 0000000..4c87887
--- /dev/null
+++ b/lib/taskflow/algorithm/transform.hpp
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "launch.hpp"
+
+namespace tf {
+
+namespace detail {
+
+// Function: make_transform_task
+template <typename B, typename E, typename O, typename C, typename P>
+TF_FORCE_INLINE auto make_transform_task(
+  B first1, E last1, O d_first, C c, P&& part
+) {
+
+  using namespace std::string_literals;
+
+  using B_t = std::decay_t<unwrap_ref_decay_t<B>>;
+  using E_t = std::decay_t<unwrap_ref_decay_t<E>>;
+  using O_t = std::decay_t<unwrap_ref_decay_t<O>>;
+
+  return
+  [first1, last1, d_first, c, part=std::forward<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B_t beg = first1; + E_t end = last1; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg, end); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg, end, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + + launch_loop(N, W, rt, next, part, [=, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg, curr_b - prev_e); + std::advance(d_beg, curr_b - prev_e); + for(size_t x = curr_b; x>, void>* = nullptr +> +TF_FORCE_INLINE auto make_transform_task( + B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part +) { + + using namespace std::string_literals; + + using B1_t = std::decay_t>; + using E1_t = std::decay_t>; + using B2_t = std::decay_t>; + using O_t = std::decay_t>; + + return + [first1, last1, first2, d_first, c, part=std::forward
<P>
(part)] + (Runtime& rt) mutable { + + // fetch the stateful values + B1_t beg1 = first1; + E1_t end1 = last1; + B2_t beg2 = first2; + O_t d_beg = d_first; + + size_t W = rt.executor().num_workers(); + size_t N = std::distance(beg1, end1); + + // only myself - no need to spawn another graph + if(W <= 1 || N <= part.chunk_size()) { + std::transform(beg1, end1, beg2, d_beg, c); + return; + } + + if(N < W) { + W = N; + } + + // static partitioner + if constexpr(std::is_same_v, StaticPartitioner>) { + size_t chunk_size; + for(size_t w=0, curr_b=0; w next(0); + launch_loop(N, W, rt, next, part, [=, &c, &next, &part] () mutable { + part.loop(N, W, next, + [&, prev_e=size_t{0}](size_t curr_b, size_t curr_e) mutable { + std::advance(beg1, curr_b - prev_e); + std::advance(beg2, curr_b - prev_e); + std::advance(d_beg, curr_b - prev_e); + for(size_t x = curr_b; x +Task FlowBuilder::transform(B first1, E last1, O d_first, C c, P&& part) { + return emplace( + detail::make_transform_task(first1, last1, d_first, c, std::forward
<P>
(part))
+  );
+}
+
+// ----------------------------------------------------------------------------
+// transform2
+// ----------------------------------------------------------------------------
+
+// Function: transform
+template <
+  typename B1, typename E1, typename B2, typename O, typename C, typename P,
+  std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>*
+>
+Task FlowBuilder::transform(
+  B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part
+) {
+
+  return emplace(detail::make_transform_task(
+    first1, last1, first2, d_first, c, std::forward<P>
(part) + )); +} + + +} // end of namespace tf ----------------------------------------------------- + + + diff --git a/lib/taskflow/core/async.hpp b/lib/taskflow/core/async.hpp new file mode 100644 index 0000000..69788c6 --- /dev/null +++ b/lib/taskflow/core/async.hpp @@ -0,0 +1,396 @@ +#pragma once + +#include "executor.hpp" + +// https://hackmd.io/@sysprog/concurrency-atomics + +namespace tf { + +// ---------------------------------------------------------------------------- +// Async +// ---------------------------------------------------------------------------- + +// Function: async +template +auto Executor::async(const std::string& name, F&& f) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(f)) + ); + + _schedule_async_task(node); + + return fu; +} + +// Function: async +template +auto Executor::async(F&& f) { + return async("", std::forward(f)); +} + +// ---------------------------------------------------------------------------- +// Silent Async +// ---------------------------------------------------------------------------- + +// Function: silent_async +template +void Executor::silent_async(const std::string& name, F&& f) { + + _increment_topology(); + + auto node = node_pool.animate( + name, 0, nullptr, nullptr, 0, + std::in_place_type_t{}, std::forward(f) + ); + + _schedule_async_task(node); +} + +// Function: silent_async +template +void Executor::silent_async(F&& f) { + silent_async("", std::forward(f)); +} + +// ---------------------------------------------------------------------------- +// Async Helper Methods +// ---------------------------------------------------------------------------- + +// Function: _make_promised_async +template +auto Executor::_make_promised_async(std::promise&& p, F&& func) { + return [p=make_moc(std::move(p)), func=std::forward(func)]() mutable { + if constexpr(std::is_same_v) { + func(); + p.object.set_value(); + } + else { + p.object.set_value(func()); + } + }; +} + +// Procedure: _schedule_async_task +inline void Executor::_schedule_async_task(Node* node) { + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + // from runtime + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); + } + // from executor + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// ---------------------------------------------------------------------------- +// Silent Dependent Async +// ---------------------------------------------------------------------------- + +// Function: silent_dependent_async +template ...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, Tasks&&... tasks) { + return silent_dependent_async("", std::forward(func), std::forward(tasks)...); +} + +// Function: silent_dependent_async +template ...>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +){ + + _increment_topology(); + + size_t num_dependents = sizeof...(Tasks); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, std::forward(func) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(node.get(), tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return AsyncTask(std::move(node)); +} + +// Function: silent_dependent_async +template , AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async(F&& func, I first, I last) { + return silent_dependent_async("", std::forward(func), first, last); +} + +// Function: silent_dependent_async +template , AsyncTask>, void>* +> +tf::AsyncTask Executor::silent_dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + size_t num_dependents = std::distance(first, last); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, std::forward(func) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + for(; first != last; first++){ + _process_async_dependent(node.get(), *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return AsyncTask(std::move(node)); +} + +// ---------------------------------------------------------------------------- +// Dependent Async +// ---------------------------------------------------------------------------- + +// Function: dependent_async +template ...>, void>* +> +auto Executor::dependent_async(F&& func, Tasks&&... tasks) { + return dependent_async("", std::forward(func), std::forward(tasks)...); +} + +// Function: dependent_async +template ...>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, Tasks&&... 
tasks +) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + size_t num_dependents = sizeof...(tasks); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(func)) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + if constexpr(sizeof...(Tasks) > 0) { + (_process_async_dependent(node.get(), tasks, num_dependents), ...); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return std::make_pair(AsyncTask(std::move(node)), std::move(fu)); +} + +// Function: dependent_async +template , AsyncTask>, void>* +> +auto Executor::dependent_async(F&& func, I first, I last) { + return dependent_async("", std::forward(func), first, last); +} + +// Function: dependent_async +template , AsyncTask>, void>* +> +auto Executor::dependent_async( + const std::string& name, F&& func, I first, I last +) { + + _increment_topology(); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; + + size_t num_dependents = std::distance(first, last); + + std::shared_ptr node( + node_pool.animate( + name, 0, nullptr, nullptr, num_dependents, + std::in_place_type_t{}, + _make_promised_async(std::move(p), std::forward(func)) + ), + [&](Node* ptr){ node_pool.recycle(ptr); } + ); + + { + std::scoped_lock lock(_asyncs_mutex); + _asyncs.insert(node); + } + + for(; first != last; first++) { + _process_async_dependent(node.get(), *first, num_dependents); + } + + if(num_dependents == 0) { + _schedule_async_task(node.get()); + } + + return std::make_pair(AsyncTask(std::move(node)), std::move(fu)); +} + +// ---------------------------------------------------------------------------- +// Dependent Async Helper Functions +// ---------------------------------------------------------------------------- + +// Procedure: _process_async_dependent +inline void Executor::_process_async_dependent( + Node* node, tf::AsyncTask& task, size_t& num_dependents +) { + + std::shared_ptr dep; + { + std::scoped_lock lock(_asyncs_mutex); + if(auto itr = _asyncs.find(task._node); itr != _asyncs.end()){ + dep = *itr; + } + } + + // if the dependent task exists + if(dep) { + auto& state = std::get_if(&(dep->_handle))->state; + + add_dependent: + + auto target = Node::AsyncState::UNFINISHED; + + // acquires the lock + if(state.compare_exchange_weak(target, Node::AsyncState::LOCKED, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + dep->_successors.push_back(node); + state.store(Node::AsyncState::UNFINISHED, std::memory_order_release); + } + // dep's state is FINISHED, which means dep finished its callable already + // thus decrement the node's join counter by 1 + else if (target == Node::AsyncState::FINISHED) { + // decrement the counter needs to be the order of acquire and release + // to synchronize with the worker + num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + } + // another worker adding an async task that shares the same dependent + else { + goto add_dependent; + } + } + else { + num_dependents = node->_join_counter.fetch_sub(1, std::memory_order_acq_rel) - 1; + } +} + +// Procedure: _tear_down_dependent_async +inline void Executor::_tear_down_dependent_async(Worker& worker, Node* node) { + + // this async task comes from Executor + auto& state = std::get_if(&(node->_handle))->state; + auto 
target = Node::AsyncState::UNFINISHED;
+
+  while(!state.compare_exchange_weak(target, Node::AsyncState::FINISHED,
+                                     std::memory_order_acq_rel,
+                                     std::memory_order_relaxed)) {
+    target = Node::AsyncState::UNFINISHED;
+  }
+
+  // spawn successors whenever their dependencies are resolved
+  worker._cache = nullptr;
+  for(size_t i=0; i<node->_successors.size(); ++i) {
+    //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+    if(auto s = node->_successors[i];
+      s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1
+    ) {
+      if(worker._cache) {
+        _schedule(worker, worker._cache);
+      }
+      worker._cache = s;
+    }
+  }
+
+  // remove myself from the asyncs using extraction to avoid calling
+  // ~Node inside the lock
+  typename std::unordered_set<std::shared_ptr<Node>>::node_type extracted;
+  {
+    std::shared_ptr<Node> ptr(node, [](Node*){});
+    std::scoped_lock lock(_asyncs_mutex);
+    extracted = _asyncs.extract(ptr);
+    // assert(extracted.empty() == false);
+  }
+
+  _decrement_topology_and_notify();
+}
+
+
+
+
+
+} // end of namespace tf -----------------------------------------------------
+
diff --git a/lib/taskflow/core/async_task.hpp b/lib/taskflow/core/async_task.hpp
new file mode 100644
index 0000000..7c92d8e
--- /dev/null
+++ b/lib/taskflow/core/async_task.hpp
@@ -0,0 +1,125 @@
+#pragma once
+
+#include "graph.hpp"
+
+/**
+@file async_task.hpp
+@brief asynchronous task include file
+*/
+
+namespace tf {
+
+// ----------------------------------------------------------------------------
+// AsyncTask
+// ----------------------------------------------------------------------------
+
+/**
+@brief class to create a dependent asynchronous task
+
+A tf::AsyncTask is a lightweight handle that retains @em shared ownership
+of a dependent async task created by an executor.
+This shared ownership ensures that the async task remains alive when
+adding it to the dependency list of another async task,
+thus avoiding the classical [ABA problem](https://en.wikipedia.org/wiki/ABA_problem).
+
+@code{.cpp}
+// main thread retains shared ownership of async task A
+tf::AsyncTask A = executor.silent_dependent_async([](){});
+
+// task A remains alive (i.e., at least one ref count by the main thread)
+// when being added to the dependency list of async task B
+tf::AsyncTask B = executor.silent_dependent_async([](){}, A);
+@endcode
+
+Currently, tf::AsyncTask is implemented on top of the C++ smart pointer
+std::shared_ptr and is considered cheap to copy or move as long as only
+a handful of objects own it.
+When a worker completes an async task, it removes the task from the executor,
+decrementing the number of shared owners by one.
+If that counter reaches zero, the task is destroyed.
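+
+The following example, a minimal sketch, checks whether a handle refers to
+a task and explicitly releases its shared ownership:
+
+@code{.cpp}
+tf::AsyncTask task = executor.silent_dependent_async([](){});
+assert(task.empty() == false);  // the handle refers to a task
+task.reset();                   // give up this handle's shared ownership
+assert(task.empty() == true);   // the handle no longer refers to a task
+@endcode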
+*/
+class AsyncTask {

+  friend class FlowBuilder;
+  friend class Runtime;
+  friend class Taskflow;
+  friend class TaskView;
+  friend class Executor;
+
+  public:
+
+    /**
+    @brief constructs an empty task handle
+    */
+    AsyncTask() = default;
+
+    /**
+    @brief destroys the managed asynchronous task if this is the last owner
+    */
+    ~AsyncTask() = default;
+
+    /**
+    @brief constructs a task that shares ownership of @c rhs
+    */
+    AsyncTask(const AsyncTask& rhs) = default;
+
+    /**
+    @brief move-constructs a task from @c rhs
+    */
+    AsyncTask(AsyncTask&& rhs) = default;
+
+    /**
+    @brief shares ownership of the task managed by @c rhs
+    */
+    AsyncTask& operator = (const AsyncTask& rhs) = default;
+
+    /**
+    @brief move-assigns the task from @c rhs
+    */
+    AsyncTask& operator = (AsyncTask&& rhs) = default;
+
+    /**
+    @brief checks if the task stores a non-null shared pointer
+    */
+    bool empty() const;
+
+    /**
+    @brief releases the ownership
+    */
+    void reset();
+
+    /**
+    @brief obtains a hash value of the underlying node
+    */
+    size_t hash_value() const;
+
+  private:
+
+    AsyncTask(std::shared_ptr<Node>);
+
+    std::shared_ptr<Node> _node;
+};
+
+// Constructor
+inline AsyncTask::AsyncTask(std::shared_ptr<Node> ptr) : _node {std::move(ptr)} {
+}
+
+// Function: empty
+inline bool AsyncTask::empty() const {
+  return _node == nullptr;
+}
+
+// Function: reset
+inline void AsyncTask::reset() {
+  _node.reset();
+}
+
+// Function: hash_value
+inline size_t AsyncTask::hash_value() const {
+  return std::hash<std::shared_ptr<Node>>{}(_node);
+}
+
+} // end of namespace tf ----------------------------------------------------
+
+
+
diff --git a/lib/taskflow/core/declarations.hpp b/lib/taskflow/core/declarations.hpp
index b7f1b24..dd89ab3 100644
--- a/lib/taskflow/core/declarations.hpp
+++ b/lib/taskflow/core/declarations.hpp
@@ -2,19 +2,23 @@
 namespace tf {
 
+// ----------------------------------------------------------------------------
 // taskflow
+// ----------------------------------------------------------------------------
 class AsyncTopology;
 class Node;
 class Graph;
 class FlowBuilder;
 class Semaphore;
 class Subflow;
+class Runtime;
 class Task;
 class TaskView;
 class Taskflow;
 class Topology;
 class TopologyBase;
 class Executor;
+class Worker;
 class WorkerView;
 class ObserverInterface;
 class ChromeTracingObserver;
@@ -24,17 +28,29 @@ class TFProfManager;
 
 template <typename T>
 class Future;
 
+template <typename... Ps>
+class Pipeline;
+
+// ----------------------------------------------------------------------------
 // cudaFlow
-class cudaNode;
-class cudaGraph;
+// ----------------------------------------------------------------------------
+class cudaFlowNode;
+class cudaFlowGraph;
 class cudaTask;
 class cudaFlow;
 class cudaFlowCapturer;
-class cudaFlowCapturerBase;
-class cudaCapturingBase;
-class cudaSequentialCapturing;
-class cudaRoundRobinCapturing;
-class cublasFlowCapturer;
+class cudaFlowOptimizerBase;
+class cudaFlowLinearOptimizer;
+class cudaFlowSequentialOptimizer;
+class cudaFlowRoundRobinOptimizer;
+
+// ----------------------------------------------------------------------------
+// syclFlow
+// ----------------------------------------------------------------------------
+class syclNode;
+class syclGraph;
+class syclTask;
+class syclFlow;
 
 } // end of namespace tf -----------------------------------------------------
diff --git a/lib/taskflow/core/executor-module-opt.hpp b/lib/taskflow/core/executor-module-opt.hpp
new file mode 100644
index 0000000..0e2b1ee
--- /dev/null
+++ b/lib/taskflow/core/executor-module-opt.hpp
@@ -0,0 +1,2025 @@
+#pragma once
+
+#include 
"observer.hpp" +#include "taskflow.hpp" + +/** +@file executor.hpp +@brief executor include file +*/ + +namespace tf { + +// ---------------------------------------------------------------------------- +// Executor Definition +// ---------------------------------------------------------------------------- + +/** @class Executor + +@brief class to create an executor for running a taskflow graph + +An executor manages a set of worker threads to run one or multiple taskflows +using an efficient work-stealing scheduling algorithm. + +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. +*/ +class Executor { + + friend class FlowBuilder; + friend class Subflow; + friend class Runtime; + + public: + + /** + @brief constructs the executor with @c N worker threads + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + */ + explicit Executor(size_t N = std::thread::hardware_concurrency()); + + /** + @brief destructs the executor + + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); + + /** + @brief runs a taskflow once + + @param taskflow a tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run(Taskflow& taskflow); + + /** + @brief runs a moved taskflow once + + @param taskflow a moved tf::Taskflow object + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + tf::Future run(Taskflow&& taskflow); + + /** + @brief runs a taskflow once and invoke a callback upon completion + + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); + + /** + @brief runs a moved taskflow once and invoke a callback upon completion + + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); + + /** + @brief runs a taskflow for @c N times + + @param taskflow a tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. 
+ This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
+ */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief wait for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. + + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. 
+ + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + /** + @brief runs a given function asynchronously + + @tparam F callable type + @tparam ArgsT parameter types + + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates an asynchronous task to launch the given + function on the given arguments. + Unlike std::async, the return here is a @em tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future> future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& f, ArgsT&&... args); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + @tparam ArgsT parameter types + + @param name name of the asynchronous task + @param f callable object to call + @param args parameters to pass to the callable + + @return a tf::Future that will holds the result of the execution + + The method creates a named asynchronous task to launch the given + function on the given arguments. + Naming an asynchronous task is primarily used for profiling and visualizing + the task execution timeline. + Unlike std::async, the return here is a tf::Future that holds + an optional object to the result. + If the asynchronous task is cancelled before it runs, the return is + a @c std::nullopt, or the value returned by the callable. + + @code{.cpp} + tf::Future> future = executor.named_async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + @endcode + + This member function is thread-safe. + */ + template + auto named_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief similar to tf::Executor::async but does not return a future object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& f, ArgsT&&... 
args); + + /** + @brief similar to tf::Executor::named_async but does not return a future object + + This member function is more efficient than tf::Executor::named_async + and is encouraged to use when there is no data returned. + + @code{.cpp} + executor.named_silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + @endcode + + This member function is thread-safe. + */ + template + void named_silent_async(const std::string& name, F&& f, ArgsT&&... args); + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + private: + + std::condition_variable _topology_cv; + std::mutex _taskflow_mutex; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + + size_t _num_topologies {0}; + + std::unordered_map _wids; + std::vector _workers; + std::vector _threads; + std::list _taskflows; + + Notifier _notifier; + + TaskQueue _wsq; + + std::atomic _num_actives {0}; + std::atomic _num_thieves {0}; + std::atomic _done {0}; + + std::unordered_set> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _worker_loop(Worker&); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _consume_task(Worker&, Node*); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector&); + void _schedule(const SmallVector&); + void _set_up_topology(Worker*, Topology*); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_invoke(Worker&, Node*); + void _cancel_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _invoke_dynamic_task_external(Worker&, Node*, Graph&, bool); + void _invoke_dynamic_task_internal(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector&); + void _invoke_module_task(Worker&, Node*, bool&); + void _invoke_module_task_internal(Worker&, Node*, Graph&, bool&); + void _invoke_async_task(Worker&, Node*); + void _invoke_silent_async_task(Worker&, Node*); + void _invoke_cudaflow_task(Worker&, Node*); + void _invoke_syclflow_task(Worker&, Node*); + void _invoke_runtime_task(Worker&, Node*); + + template , void>* = nullptr + > + void _invoke_cudaflow_task_entry(Node*, C&&); + + template , void>* = nullptr + > + void 
_invoke_syclflow_task_entry(Node*, C&&, Q&); +}; + +// Constructor +inline Executor::Executor(size_t N) : + _workers {N}, + _notifier {N} { + + if(N == 0) { + TF_THROW("no cpu workers to execute taskflows"); + } + + _spawn(N); + + // instantite the default observer if requested + if(has_env(TF_ENABLE_PROFILER)) { + TFProfManager::get()._manage(make_observer()); + } +} + +// Destructor +inline Executor::~Executor() { + + // wait for all topologies to complete + wait_for_all(); + + // shut down the scheduler + _done = true; + + _notifier.notify(true); + + for(auto& t : _threads){ + t.join(); + } +} + +// Function: num_workers +inline size_t Executor::num_workers() const noexcept { + return _workers.size(); +} + +// Function: num_topologies +inline size_t Executor::num_topologies() const { + return _num_topologies; +} + +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); +} + +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; +} + +// Function: named_async +template +auto Executor::named_async(const std::string& name, F&& f, ArgsT&&... args) { + + _increment_topology(); + + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else{ + _schedule(node); + } + + return fu; +} + +// Function: async +template +auto Executor::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward(f), std::forward(args)...); +} + +// Function: named_silent_async +template +void Executor::named_silent_async( + const std::string& name, F&& f, ArgsT&&... args +) { + + _increment_topology(); + + Node* node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + + if(auto w = _this_worker(); w) { + _schedule(*w, node); + } + else { + _schedule(node); + } +} + +// Function: silent_async +template +void Executor::silent_async(F&& f, ArgsT&&... args) { + named_silent_async("", std::forward(f), std::forward(args)...); +} + +// Function: this_worker_id +inline int Executor::this_worker_id() const { + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); +} + +// Procedure: _spawn +inline void Executor::_spawn(size_t N) { + + std::mutex mutex; + std::condition_variable cond; + size_t n=0; + + for(size_t id=0; id void { + + // enables the mapping + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } + + //this_worker().worker = &w; + + Node* t = nullptr; + + // must use 1 as condition instead of !done + while(1) { + + // execute the tasks. 
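+        // Each iteration alternates between two phases: _exploit_task
+        // drains this worker's own queue, and _wait_for_task then tries to
+        // steal from peers, suspending on the notifier when nothing is found.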
+ _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + + }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n)); + } + + std::unique_lock lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +} + +// Function: _consume_task +inline void Executor::_consume_task(Worker& w, Node* p) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + while(p->_join_counter != 0) { + exploit: + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + //size_t num_pauses = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + if(t) { + _invoke(w, t); + goto exploit; + } + else if(p->_join_counter != 0){ + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + } + + //std::this_thread::yield(); + w._vtm = rdvtm(w._rdgen); + goto explore; + } + else { + break; + } + } + } +} + +// Function: _explore_task +inline void Executor::_explore_task(Worker& w, Node*& t) { + + //assert(_workers[w].wsq.empty()); + //assert(!t); + + size_t num_steals = 0; + size_t num_yields = 0; + size_t max_steals = ((_workers.size() + 1) << 1); + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + do { + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + break; + } + + if(num_steals++ > max_steals) { + std::this_thread::yield(); + if(num_yields++ > 100) { + break; + } + } + + w._vtm = rdvtm(w._rdgen); + } while(!_done); + +} + +// Procedure: _exploit_task +inline void Executor::_exploit_task(Worker& w, Node*& t) { + + if(t) { + + if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { + _notifier.notify(false); + } + + while(t) { + _invoke(w, t); + t = w._wsq.pop(); + } + + --_num_actives; + } +} + +// Function: _wait_for_task +inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { + + wait_for_task: + + //assert(!t); + + ++_num_thieves; + + explore_task: + + _explore_task(worker, t); + + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + + _notifier.prepare_wait(worker._waiter); + + //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { + if(!_wsq.empty()) { + + _notifier.cancel_wait(worker._waiter); + //t = (vtm == me) ? _wsq.steal() : _workers[vtm].wsq.steal(); + + t = _wsq.steal(); // must steal here + if(t) { + if(_num_thieves.fetch_sub(1) == 1) { + _notifier.notify(false); + } + return true; + } + else { + worker._vtm = worker._id; + goto explore_task; + } + } + + if(_done) { + _notifier.cancel_wait(worker._waiter); + _notifier.notify(true); + --_num_thieves; + return false; + } + + if(_num_thieves.fetch_sub(1) == 1) { + if(_num_actives) { + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + // check all queues again + for(auto& w : _workers) { + if(!w._wsq.empty()) { + worker._vtm = w._id; + _notifier.cancel_wait(worker._waiter); + goto wait_for_task; + } + } + } + + // Now I really need to relinguish my self to others + _notifier.commit_wait(worker._waiter); + + return true; +} + +// Function: make_observer +template +std::shared_ptr Executor::make_observer(ArgsT&&... 
args) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + // use a local variable to mimic the constructor + auto ptr = std::make_shared(std::forward(args)...); + + ptr->set_up(_workers.size()); + + _observers.emplace(std::static_pointer_cast(ptr)); + + return ptr; +} + +// Procedure: remove_observer +template +void Executor::remove_observer(std::shared_ptr ptr) { + + static_assert( + std::is_base_of_v, + "Observer must be derived from ObserverInterface" + ); + + _observers.erase(std::static_pointer_cast(ptr)); +} + +// Function: num_observers +inline size_t Executor::num_observers() const noexcept { + return _observers.size(); +} + +// Procedure: _schedule +inline void Executor::_schedule(Worker& worker, Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + // caller is a worker to this pool + if(worker._executor == this) { + worker._wsq.push(node); + return; + } + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule(Node* node) { + + node->_state.fetch_or(Node::READY, std::memory_order_release); + + { + std::lock_guard lock(_wsq_mutex); + _wsq.push(node); + } + + _notifier.notify(false); +} + +// Procedure: _schedule +inline void Executor::_schedule( + Worker& worker, const SmallVector& nodes +) { + + // We need to cacth the node count to avoid accessing the nodes + // vector while the parent topology is removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i_state.fetch_or(Node::READY, std::memory_order_release); + } + + if(worker._executor == this) { + for(size_t i=0; i lock(_wsq_mutex); + for(size_t k=0; k& nodes) { + + // parent topology may be removed! + const auto num_nodes = nodes.size(); + + if(num_nodes == 0) { + return; + } + + // make the node ready + for(size_t i=0; i_state.fetch_or(Node::READY, std::memory_order_release); + } + + { + std::lock_guard lock(_wsq_mutex); + for(size_t k=0; k conds; + + // synchronize all outstanding memory operations caused by reordering + do { + state = node->_state.load(std::memory_order_acquire); + } while(! 
(state & Node::READY)); + + // unwind stack for deferred node + if(state & Node::DEFERRED) { + node->_state.fetch_and(~Node::DEFERRED, std::memory_order_relaxed); + goto invoke_epilogue; + } + + //while(!(node->_state.load(std::memory_order_acquire) & Node::READY)); + + invoke_prologue: + + // no need to do other things if the topology is cancelled + if(node->_is_cancelled()) { + _cancel_invoke(worker, node); + return; + } + + // if acquiring semaphore(s) exists, acquire them first + if(node->_semaphores && !node->_semaphores->to_acquire.empty()) { + SmallVector nodes; + if(!node->_acquire_all(nodes)) { + _schedule(worker, nodes); + return; + } + node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release); + } + + // condition task + //int cond = -1; + //SmallVector conds = { -1 }; + + // switch is faster than nested if-else due to jump table + switch(node->_handle.index()) { + // static task + case Node::STATIC:{ + _invoke_static_task(worker, node); + } + break; + + // dynamic task + case Node::DYNAMIC: { + _invoke_dynamic_task(worker, node); + } + break; + + // condition task + case Node::CONDITION: { + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); + } + break; + + // module task + case Node::MODULE: { + bool deferred = false; + _invoke_module_task(worker, node, deferred); + if(deferred) { + return; + } + } + break; + + // async task + case Node::ASYNC: { + _invoke_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // silent async task + case Node::SILENT_ASYNC: { + _invoke_silent_async_task(worker, node); + _tear_down_async(node); + return ; + } + break; + + // cudaflow task + case Node::CUDAFLOW: { + _invoke_cudaflow_task(worker, node); + } + break; + + // syclflow task + case Node::SYCLFLOW: { + _invoke_syclflow_task(worker, node); + } + break; + + // runtime task + case Node::RUNTIME: { + _invoke_runtime_task(worker, node); + } + break; + + // monostate (placeholder) + default: + break; + } + + invoke_epilogue: + + // if releasing semaphores exist, release them + if(node->_semaphores && !node->_semaphores->to_release.empty()) { + _schedule(worker, node->_release_all()); + } + + // We MUST recover the dependency since the graph may have cycles. + // This must be done before scheduling the successors, otherwise this might cause + // race condition on the _dependents + if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) { + node->_join_counter = node->num_strong_dependents(); + } + else { + node->_join_counter = node->num_dependents(); + } + + // acquire the parent flow counter + auto& j = (node->_parent) ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + + Node* cache {nullptr}; + + // At this point, the node storage might be destructed (to be verified) + // case 1: non-condition task + switch(node->_handle.index()) { + + // condition and multi-condition tasks + case Node::CONDITION: + case Node::MULTI_CONDITION: { + for(auto cond : conds) { + if(cond >= 0 && static_cast(cond) < node->_successors.size()) { + auto s = node->_successors[cond]; + // zeroing the join counter for invariant + s->_join_counter.store(0, std::memory_order_relaxed); + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = s; + } + } + } + break; + + // non-condition task + default: { + for(size_t i=0; i_successors.size(); ++i) { + if(--(node->_successors[i]->_join_counter) == 0) { + j.fetch_add(1); + if(cache) { + _schedule(worker, cache); + } + cache = node->_successors[i]; + } + } + } + break; + } + + // tear_down the invoke + _tear_down_invoke(worker, node); + + // perform tail recursion elimination for the right-most child to reduce + // the number of expensive pop/push operations through the task queue + if(cache) { + node = cache; + //node->_state.fetch_or(Node::READY, std::memory_order_release); + goto invoke_prologue; + } +} + +// Procedure: _tear_down_async +inline void Executor::_tear_down_async(Node* node) { + if(node->_parent) { + node->_parent->_join_counter.fetch_sub(1); + } + else { + _decrement_topology_and_notify(); + } + node_pool.recycle(node); +} + +// Proecdure: _tear_down_invoke +inline void Executor::_tear_down_invoke(Worker& worker, Node* node) { + // we must check parent first before substracting the join counter, + // or it can introduce data race + if(auto parent = node->_parent; parent == nullptr) { + if(node->_topology->_join_counter.fetch_sub(1) == 1) { + _tear_down_topology(worker, node->_topology); + } + } + else { + // prefetch the deferred status, as subtracting the join counter can + // immediately cause the other worker to release the subflow + auto deferred = parent->_state.load(std::memory_order_relaxed) & Node::DEFERRED; + if(parent->_join_counter.fetch_sub(1) == 1 && deferred) { + _schedule(worker, parent); + } + } +} + +// Procedure: _cancel_invoke +inline void Executor::_cancel_invoke(Worker& worker, Node* node) { + + switch(node->_handle.index()) { + // async task needs to carry out the promise + case Node::ASYNC: + std::get_if(&(node->_handle))->work(true); + _tear_down_async(node); + break; + + // silent async doesn't need to carry out the promise + case Node::SILENT_ASYNC: + _tear_down_async(node); + break; + + // tear down topology if the node is the last leaf + default: { + _tear_down_invoke(worker, node); + } + break; + } +} + +// Procedure: _observer_prologue +inline void Executor::_observer_prologue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_entry(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _observer_epilogue +inline void Executor::_observer_epilogue(Worker& worker, Node* node) { + for(auto& observer : _observers) { + observer->on_exit(WorkerView(worker), TaskView(*node)); + } +} + +// Procedure: _invoke_static_task +inline void Executor::_invoke_static_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_dynamic_task +inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { + + _observer_prologue(w, node); + + auto handle = 
std::get_if(&node->_handle); + + handle->subgraph._clear(); + + Subflow sf(*this, w, node, handle->subgraph); + + handle->work(sf); + + if(sf._joinable) { + _invoke_dynamic_task_internal(w, node, handle->subgraph); + } + + _observer_epilogue(w, node); +} + +// Procedure: _invoke_dynamic_task_external +inline void Executor::_invoke_dynamic_task_external( + Worker& w, Node* p, Graph& g, bool detach +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { + + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + + if(detach) { + n->_parent = nullptr; + n->_state.fetch_or(Node::DETACHED, std::memory_order_relaxed); + } + else { + n->_parent = p; + } + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + // detach here + if(detach) { + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } + + p->_topology->_join_counter.fetch_add(src.size()); + _schedule(w, src); + } + // join here + else { + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); + } +} + +// Procedure: _invoke_dynamic_task_internal +inline void Executor::_invoke_dynamic_task_internal( + Worker& w, Node* p, Graph& g +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); + _consume_task(w, p); +} + +// Procedure: _invoke_module_task_internal +inline void Executor::_invoke_module_task_internal( + Worker& w, Node* p, Graph& g, bool& deferred +) { + + // graph is empty and has no async tasks + if(g.empty()) { + return; + } + + // set deferred + deferred = true; + p->_state.fetch_or(Node::DEFERRED, std::memory_order_relaxed); + + SmallVector src; + + for(auto n : g._nodes) { + n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_parent = p; + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + p->_join_counter.fetch_add(src.size()); + _schedule(w, src); +} + +// Procedure: _invoke_condition_task +inline void Executor::_invoke_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + conds = { std::get_if(&node->_handle)->work() }; + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + conds = std::get_if(&node->_handle)->work(); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_cudaflow_task +inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(*this, node); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_syclflow_task +inline void Executor::_invoke_syclflow_task(Worker& worker, Node* node) { + _observer_prologue(worker, node); + std::get_if(&node->_handle)->work(*this, node); + _observer_epilogue(worker, node); +} + +// Procedure: _invoke_module_task +inline void Executor::_invoke_module_task(Worker& w, Node* node, bool& 
deferred) {
+  _observer_prologue(w, node);
+  _invoke_module_task_internal(
+    w, node, std::get_if<Node::Module>(&node->_handle)->graph, deferred
+  );
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_async_task
+inline void Executor::_invoke_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::Async>(&node->_handle)->work(false);
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_silent_async_task
+inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  std::get_if<Node::SilentAsync>(&node->_handle)->work();
+  _observer_epilogue(w, node);
+}
+
+// Procedure: _invoke_runtime_task
+inline void Executor::_invoke_runtime_task(Worker& w, Node* node) {
+  _observer_prologue(w, node);
+  Runtime rt(*this, w, node);
+  std::get_if<Node::Runtime>(&node->_handle)->work(rt);
+  _observer_epilogue(w, node);
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow& f) {
+  return run_n(f, 1, [](){});
+}
+
+// Function: run
+inline tf::Future<void> Executor::run(Taskflow&& f) {
+  return run_n(std::move(f), 1, [](){});
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow& f, C&& c) {
+  return run_n(f, 1, std::forward<C>(c));
+}
+
+// Function: run
+template <typename C>
+tf::Future<void> Executor::run(Taskflow&& f, C&& c) {
+  return run_n(std::move(f), 1, std::forward<C>(c));
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat) {
+  return run_n(f, repeat, [](){});
+}
+
+// Function: run_n
+inline tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat) {
+  return run_n(std::move(f), repeat, [](){});
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow& f, size_t repeat, C&& c) {
+  return run_until(
+    f, [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_n
+template <typename C>
+tf::Future<void> Executor::run_n(Taskflow&& f, size_t repeat, C&& c) {
+  return run_until(
+    std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward<C>(c)
+  );
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& pred) {
+  return run_until(f, std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred) {
+  return run_until(std::move(f), std::forward<P>(pred), [](){});
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+  _increment_topology();
+
+  // Need to check emptiness under the lock, since a dynamic task may
+  // define detached blocks that modify the taskflow at the same time
+  bool empty;
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    empty = f.empty();
+  }
+
+  // No need to create a real topology; just return a dummy future
+  if(empty || p()) {
+    c();
+    std::promise<void> promise;
+    promise.set_value();
+    _decrement_topology_and_notify();
+    return tf::Future<void>(promise.get_future(), std::monostate{});
+  }
+
+  // create a topology for this run
+  auto t = std::make_shared<Topology>(f, std::forward<P>(p), std::forward<C>(c));
+
+  // need to create the future before the topology can be torn down
+  tf::Future<void> future(t->_promise.get_future(), t);
+
+  // modifying the topology needs to be protected under the lock
+  {
+    std::lock_guard<std::mutex> lock(f._mutex);
+    f._topologies.push(t);
+    if(f._topologies.size() == 1) {
+      _set_up_topology(_this_worker(), t.get());
+    }
+  }
+
+  return future;
+}
+
+// Function: run_until
+template <typename P, typename C>
+tf::Future<void> Executor::run_until(Taskflow&& f, P&& pred, C&& c) {
+
+  std::list<Taskflow>::iterator itr;
+
+  {
+    std::scoped_lock lock(_taskflows_mutex);
+    itr = _taskflows.emplace(_taskflows.end(), std::move(f));
+    itr->_satellite = itr;
+  }
+
+  return run_until(*itr, std::forward<P>(pred), std::forward<C>(c));
+}
+
+// Procedure: _increment_topology
+inline void Executor::_increment_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  ++_num_topologies;
+}
+
+// Procedure: _decrement_topology_and_notify
+inline void Executor::_decrement_topology_and_notify() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  if(--_num_topologies == 0) {
+    _topology_cv.notify_all();
+  }
+}
+
+// Procedure: _decrement_topology
+inline void Executor::_decrement_topology() {
+  std::lock_guard<std::mutex> lock(_topology_mutex);
+  --_num_topologies;
+}
+
+// Procedure: wait_for_all
+inline void Executor::wait_for_all() {
+  std::unique_lock<std::mutex> lock(_topology_mutex);
+  _topology_cv.wait(lock, [&](){ return _num_topologies == 0; });
+}
+
+// Function: _set_up_topology
+inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) {
+
+  // ---- under taskflow lock ----
+
+  tpg->_sources.clear();
+  tpg->_taskflow._graph._clear_detached();
+
+  // scan each node in the graph and build up the links
+  for(auto node : tpg->_taskflow._graph._nodes) {
+
+    node->_topology = tpg;
+    node->_state.store(0, std::memory_order_relaxed);
+
+    if(node->num_dependents() == 0) {
+      tpg->_sources.push_back(node);
+    }
+
+    node->_set_up_join_counter();
+  }
+
+  tpg->_join_counter = tpg->_sources.size();
+
+  if(worker) {
+    _schedule(*worker, tpg->_sources);
+  }
+  else {
+    _schedule(tpg->_sources);
+  }
+}
+
+// Function: _tear_down_topology
+inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) {
+
+  auto &f = tpg->_taskflow;
+
+  //assert(&tpg == &(f._topologies.front()));
+
+  // case 1: we still need to run the topology again
+  if(!tpg->_is_cancelled && !tpg->_pred()) {
+    //assert(tpg->_join_counter == 0);
+    std::lock_guard<std::mutex> lock(f._mutex);
+    tpg->_join_counter = tpg->_sources.size();
+    _schedule(worker, tpg->_sources);
+  }
+  // case 2: the final run of this topology
+  else {
+
+    // TODO: if the topology is cancelled, need to release all semaphores
+
+    if(tpg->_call != nullptr) {
+      tpg->_call();
+    }
+
+    // If there is another run (interleaved between locks)
+    if(std::unique_lock<std::mutex> lock(f._mutex); f._topologies.size()>1) {
+      //assert(tpg->_join_counter == 0);
+
+      // Set the promise
+      tpg->_promise.set_value();
+      f._topologies.pop();
+      tpg = f._topologies.front().get();
+
+      // decrement the topology count; since this is not the last run, we do not notify
+      _decrement_topology();
+
+      // setting up the topology needs to be done under the lock, or it can
+      // introduce a memory-order error with the pop above
+      _set_up_topology(&worker, tpg);
+    }
+    else {
+      //assert(f._topologies.size() == 1);
+
+      // Need to back up the promise first here because the taskflow might be
+      // destroyed soon after calling get
+      auto p {std::move(tpg->_promise)};
+
+      // Back up the lambda capture in case it holds the topology pointer,
+      // to avoid releasing it on pop_front ahead of _mutex.unlock &
+      // _promise.set_value. Released safely when leaving the scope.
+      auto c {std::move(tpg->_call)};
+
+      // Get the satellite if any
+      auto s {f._satellite};
+
+      // Now we remove the topology from this taskflow
+      f._topologies.pop();
+
+      //f._mutex.unlock();
+      lock.unlock();
+
+      // We set the promise in the end in case the taskflow leaves the scope.
+ // After set_value, the caller will return from wait + p.set_value(); + + _decrement_topology_and_notify(); + + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(s) { + std::scoped_lock lock(_taskflow_mutex); + _taskflows.erase(*s); + } + } + } +} + +// ############################################################################ +// Forward Declaration: Subflow +// ############################################################################ + +inline void Subflow::join() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow not joinable"); + } + + // only the parent worker can join the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, false); + _joinable = false; +} + +inline void Subflow::detach() { + + // assert(this_worker().worker == &_worker); + + if(!_joinable) { + TF_THROW("subflow already joined or detached"); + } + + // only the parent worker can detach the subflow + _executor._invoke_dynamic_task_external(_worker, _parent, _graph, true); + _joinable = false; +} + +// Function: named_async +template +auto Subflow::named_async(const std::string& name, F&& f, ArgsT&&... args) { + return _named_async( + *_executor._this_worker(), name, std::forward(f), std::forward(args)... + ); +} + +// Function: _named_async +template +auto Subflow::_named_async( + Worker& w, + const std::string& name, + F&& f, + ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + using T = std::invoke_result_t; + using R = std::conditional_t, void, std::optional>; + + std::promise p; + + auto tpg = std::make_shared(); + + Future fu(p.get_future(), tpg); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [p=make_moc(std::move(p)), f=std::forward(f), args...] + (bool cancel) mutable { + if constexpr(std::is_same_v) { + if(!cancel) { + f(args...); + } + p.object.set_value(); + } + else { + p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + } + }, + std::move(tpg) + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); + + return fu; +} + +// Function: async +template +auto Subflow::async(F&& f, ArgsT&&... args) { + return named_async("", std::forward(f), std::forward(args)...); +} + +// Function: _named_silent_async +template +void Subflow::_named_silent_async( + Worker& w, const std::string& name, F&& f, ArgsT&&... args +) { + + _parent->_join_counter.fetch_add(1); + + auto node = node_pool.animate( + std::in_place_type_t{}, + [f=std::forward(f), args...] () mutable { + f(args...); + } + ); + + node->_name = name; + node->_topology = _parent->_topology; + node->_parent = _parent; + + _executor._schedule(w, node); +} + +// Function: silent_async +template +void Subflow::named_silent_async(const std::string& name, F&& f, ArgsT&&... args) { + _named_silent_async( + *_executor._this_worker(), name, std::forward(f), std::forward(args)... + ); +} + +// Function: named_silent_async +template +void Subflow::silent_async(F&& f, ArgsT&&... 
args) { + named_silent_async("", std::forward(f), std::forward(args)...); +} + +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + auto node = task._node; + auto& j = node->_parent ? node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1); + _executor._schedule(_worker, node); +} + +// Procedure: run +template +void Runtime::run(C&& callable) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + callable(sf); + if(sf._joinable) { + _executor._invoke_dynamic_task_internal(_worker, _parent, graph); + } + } + else { + static_assert(dependent_false_v, "unsupported task callable to run"); + } +} + +} // end of namespace tf ----------------------------------------------------- + + + + + + + + diff --git a/lib/taskflow/core/executor.hpp b/lib/taskflow/core/executor.hpp index 28cae53..a5607e0 100644 --- a/lib/taskflow/core/executor.hpp +++ b/lib/taskflow/core/executor.hpp @@ -2,283 +2,1119 @@ #include "observer.hpp" #include "taskflow.hpp" +#include "async_task.hpp" -/** +/** @file executor.hpp @brief executor include file */ namespace tf { -struct PerThread { - - Worker* worker; - PerThread() : worker{ nullptr } {} -}; - -thread_local PerThread per_thread; - // ---------------------------------------------------------------------------- // Executor Definition // ---------------------------------------------------------------------------- - /** @class Executor -@brief execution interface for running a taskflow graph +@brief class to create an executor for running a taskflow graph -An executor object manages a set of worker threads to run taskflow(s) +An executor manages a set of worker threads to run one or multiple taskflows using an efficient work-stealing scheduling algorithm. +@code{.cpp} +// Declare an executor and a taskflow +tf::Executor executor; +tf::Taskflow taskflow; + +// Add three tasks into the taskflow +tf::Task A = taskflow.emplace([] () { std::cout << "This is TaskA\n"; }); +tf::Task B = taskflow.emplace([] () { std::cout << "This is TaskB\n"; }); +tf::Task C = taskflow.emplace([] () { std::cout << "This is TaskC\n"; }); + +// Build precedence between tasks +A.precede(B, C); + +tf::Future fu = executor.run(taskflow); +fu.wait(); // block until the execution completes + +executor.run(taskflow, [](){ std::cout << "end of 1 run"; }).wait(); +executor.run_n(taskflow, 4); +executor.wait_for_all(); // block until all associated executions finish +executor.run_n(taskflow, 4, [](){ std::cout << "end of 4 runs"; }).wait(); +executor.run_until(taskflow, [cnt=0] () mutable { return ++cnt == 10; }); +@endcode + +All the @c run methods are @em thread-safe. You can submit multiple +taskflows at the same time to an executor from different threads. 
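+
+For example, the following sketch (illustrative only; the taskflow contents
+and the number of submitting threads are assumptions, not part of this patch)
+submits two independent taskflows from two user threads:
+
+@code{.cpp}
+tf::Executor executor;
+tf::Taskflow taskflow1, taskflow2;
+taskflow1.emplace([](){ std::cout << "task in taskflow1\n"; });
+taskflow2.emplace([](){ std::cout << "task in taskflow2\n"; });
+
+// run() is thread-safe, so each thread may submit its own taskflow
+std::thread t1([&](){ executor.run(taskflow1).wait(); });
+std::thread t2([&](){ executor.run(taskflow2).wait(); });
+t1.join();
+t2.join();
+@endcode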
*/ class Executor { friend class FlowBuilder; friend class Subflow; - friend class cudaFlow; + friend class Runtime; + public: + /** + @brief constructs the executor with @c N worker threads - public: - /** - @brief constructs the executor with N worker threads - */ - explicit Executor(size_t N = std::thread::hardware_concurrency()); - - /** - @brief destructs the executor - */ - ~Executor(); + @param N number of workers (default std::thread::hardware_concurrency) + @param wix worker interface class to alter worker (thread) behaviors + + The constructor spawns @c N worker threads to run tasks in a + work-stealing loop. The number of workers must be greater than zero + or an exception will be thrown. + By default, the number of worker threads is equal to the maximum + hardware concurrency returned by std::thread::hardware_concurrency. + + Users can alter the worker behavior, such as changing thread affinity, + via deriving an instance from tf::WorkerInterface. + */ + explicit Executor( + size_t N = std::thread::hardware_concurrency(), + std::shared_ptr wix = nullptr + ); - /** - @brief runs the taskflow once - - @param taskflow a tf::Taskflow object + /** + @brief destructs the executor - @return a tf::Future that will holds the result of the execution - */ - tf::Future run(Taskflow& taskflow); + The destructor calls Executor::wait_for_all to wait for all submitted + taskflows to complete and then notifies all worker threads to stop + and join these threads. + */ + ~Executor(); - /** - @brief runs the taskflow once and invoke a callback upon completion + /** + @brief runs a taskflow once - @param taskflow a tf::Taskflow object - @param callable a callable object to be invoked after this run + @param taskflow a tf::Taskflow object - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run(Taskflow& taskflow, C&& callable); + @return a tf::Future that holds the result of the execution - /** - @brief runs the taskflow for N times - - @param taskflow a tf::Taskflow object - @param N number of runs + This member function executes the given taskflow once and returns a tf::Future + object that eventually holds the result of the execution. - @return a tf::Future that will holds the result of the execution - */ - tf::Future run_n(Taskflow& taskflow, size_t N); + @code{.cpp} + tf::Future future = executor.run(taskflow); + // do something else + future.wait(); + @endcode - /** - @brief runs the taskflow for N times and then invokes a callback + This member function is thread-safe. - @param taskflow a tf::Taskflow - @param N number of runs - @param callable a callable object to be invoked after this run + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
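+
+  A minimal sketch of the lifetime rule above (the scope is hypothetical,
+  not part of this patch): wait on the returned future before the taskflow
+  goes out of scope.
+
+  @code{.cpp}
+  tf::Executor executor;
+  {
+    tf::Taskflow taskflow;
+    taskflow.emplace([](){ std::cout << "task\n"; });
+    tf::Future<void> future = executor.run(taskflow);
+    future.wait(); // OK: the taskflow is still alive here
+  }                // without the wait, the taskflow could be destroyed
+                   // while the executor is still running it
+  @endcode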
+ */ + tf::Future run(Taskflow& taskflow); - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + /** + @brief runs a moved taskflow once - /** - @brief runs the taskflow multiple times until the predicate becomes true and - then invokes a callback + @param taskflow a moved tf::Taskflow object - @param taskflow a tf::Taskflow - @param pred a boolean predicate to return true for stop + @return a tf::Future that holds the result of the execution - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_until(Taskflow& taskflow, P&& pred); + This member function executes a moved taskflow once and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. - /** - @brief runs the taskflow multiple times until the predicate becomes true and - then invokes the callback + @code{.cpp} + tf::Future future = executor.run(std::move(taskflow)); + // do something else + future.wait(); + @endcode - @param taskflow a tf::Taskflow - @param pred a boolean predicate to return true for stop - @param callable a callable object to be invoked after this run + This member function is thread-safe. + */ + tf::Future run(Taskflow&& taskflow); - @return a tf::Future that will holds the result of the execution - */ - template - tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); - - /** - @brief wait for all pending graphs to complete - */ - void wait_for_all(); - - /** - @brief queries the number of worker threads (can be zero) - */ - size_t num_workers() const; - - /** - @brief queries the number of running topologies at the time of this call + /** + @brief runs a taskflow once and invoke a callback upon completion - When a taskflow is submitted to an executor, a topology is created to store - runtime metadata of the running taskflow. - */ - size_t num_topologies() const; + @param taskflow a tf::Taskflow object + @param callable a callable object to be invoked after this run - /** - @brief queries the id of the caller thread in this executor + @return a tf::Future that holds the result of the execution - Each worker has an unique id from 0 to N-1 exclusive to the associated executor. - If the caller thread does not belong to the executor, -1 is returned. - */ - int this_worker_id() const; + This member function executes the given taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. - /** - @brief runs a given function asynchronously + @code{.cpp} + tf::Future future = executor.run(taskflow, [](){ std::cout << "done"; }); + // do something else + future.wait(); + @endcode - @tparam F callable type - @tparam ArgsT parameter types + This member function is thread-safe. - @param f callable object to call - @param args parameters to pass to the callable - - @return a tf::Future that will holds the result of the execution + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run(Taskflow& taskflow, C&& callable); - This method is thread-safe. Multiple threads can launch asynchronous tasks - at the same time. - */ - template - auto async(F&& f, ArgsT&&... 
args); - - /** - @brief similar to tf::Executor::async but does not return a future object - */ - template - void silent_async(F&& f, ArgsT&&... args); - - /** - @brief constructs an observer to inspect the activities of worker threads + /** + @brief runs a moved taskflow once and invoke a callback upon completion - Each executor manage a list of observers in shared ownership with callers. - - @tparam Observer observer type derived from tf::ObserverInterface - @tparam ArgsT argument parameter pack + @param taskflow a moved tf::Taskflow object + @param callable a callable object to be invoked after this run - @param args arguments to forward to the constructor of the observer - - @return a shared pointer to the created observer - */ - template - std::shared_ptr make_observer(ArgsT&&... args); - - /** - @brief removes the associated observer - */ - template - void remove_observer(std::shared_ptr observer); + @return a tf::Future that holds the result of the execution - /** - @brief queries the number of observers - */ - size_t num_observers() const; + This member function executes a moved taskflow once and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. - private: + @code{.cpp} + tf::Future future = executor.run( + std::move(taskflow), [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode - + This member function is thread-safe. + */ + template + tf::Future run(Taskflow&& taskflow, C&& callable); - const size_t _VICTIM_BEG; - const size_t _VICTIM_END; - const size_t _MAX_STEALS; - const size_t _MAX_YIELDS; - - std::condition_variable _topology_cv; - std::mutex _topology_mutex; - std::mutex _wsq_mutex; + /** + @brief runs a taskflow for @c N times - size_t _num_topologies {0}; - - std::vector _workers; - std::vector _threads; + @param taskflow a tf::Taskflow object + @param N number of runs - Notifier _notifier; + @return a tf::Future that holds the result of the execution - TaskQueue _wsq; + This member function executes the given taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. 
- std::atomic _num_actives {0}; - std::atomic _num_thieves {0}; - std::atomic _done {0}; - - std::unordered_set> _observers; + @code{.cpp} + tf::Future future = executor.run_n(taskflow, 2); // run taskflow 2 times + // do something else + future.wait(); + @endcode - bool _wait_for_task(Worker&, Node*&); - - void _observer_prologue(Worker&, Node*); - void _observer_epilogue(Worker&, Node*); - void _spawn(size_t); - void _worker_loop(Worker&); - void _exploit_task(Worker&, Node*&); - void _explore_task(Worker&, Node*&); - void _schedule(Node*); - void _schedule(const std::vector&); - void _invoke(Worker&, Node*); - void _invoke_static_task(Worker&, Node*); - void _invoke_dynamic_task(Worker&, Node*); - void _invoke_dynamic_task_internal(Worker&, Node*, Graph&, bool); - void _invoke_dynamic_task_external(Node*, Graph&, bool); - void _invoke_condition_task(Worker&, Node*, int&); - void _invoke_module_task(Worker&, Node*); - void _invoke_async_task(Worker&, Node*); - void _invoke_silent_async_task(Worker&, Node*); - void _set_up_topology(Topology*); - void _tear_down_topology(Topology*); - void _tear_down_async(Node*); - void _tear_down_invoke(Node*, bool); - void _increment_topology(); - void _decrement_topology(); - void _decrement_topology_and_notify(); - void _invoke_cudaflow_task(Worker&, Node*); - - template , void>* = nullptr - > - void _invoke_cudaflow_task_entry(C&&, Node*); - - template , void>* = nullptr - > - void _invoke_cudaflow_task_entry(C&&, Node*); - - //template - //void _invoke_cudaflow_task_internal(cudaFlow&, P&&, bool); + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + tf::Future run_n(Taskflow& taskflow, size_t N); + + /** + @brief runs a moved taskflow for @c N times + + @param taskflow a moved tf::Taskflow object + @param N number of runs + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and returns a tf::Future + object that eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_n( + std::move(taskflow), 2 // run the moved taskflow 2 times + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + tf::Future run_n(Taskflow&& taskflow, size_t N); + + /** + @brief runs a taskflow for @c N times and then invokes a callback + + @param taskflow a tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run( + taskflow, 2, [](){ std::cout << "done"; } // runs taskflow 2 times and invoke + // the lambda to print "done" + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. 
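+
+  As a concrete sketch (task contents assumed for illustration), the
+  callback fires exactly once, after the last of the @c N runs completes:
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([](){ std::cout << "one iteration\n"; });
+  executor.run_n(taskflow, 3, [](){ std::cout << "all 3 runs done\n"; }).wait();
+  @endcode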
+ */ + template + tf::Future run_n(Taskflow& taskflow, size_t N, C&& callable); + + /** + @brief runs a moved taskflow for @c N times and then invokes a callback + + @param taskflow a moved tf::Taskflow + @param N number of runs + @param callable a callable object to be invoked after this run + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow @c N times and invokes the given + callable when the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_n( + // run the moved taskflow 2 times and invoke the lambda to print "done" + std::move(taskflow), 2, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_n(Taskflow&& taskflow, size_t N, C&& callable); + + /** + @brief runs a taskflow multiple times until the predicate becomes true + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred); + + /** + @brief runs a moved taskflow and keeps running it + until the predicate becomes true + + @param taskflow a moved tf::Taskflow object + @param pred a boolean predicate to return @c true for stop + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), [](){ return rand()%10 == 0 } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred); + + /** + @brief runs a taskflow multiple times until the predicate becomes true and + then invokes the callback + + @param taskflow a tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes the given taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + + @code{.cpp} + tf::Future future = executor.run_until( + taskflow, [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. 
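+
+  Since run_n is built on run_until with a decrementing counter (see the
+  definitions earlier in this patch), a stateful predicate achieves the same
+  effect. A sketch with an assumed counter value:
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([](){ std::cout << "one run\n"; });
+  executor.run_until(
+    taskflow,
+    [n=3]() mutable { return n-- == 0; }, // checked before every run
+    [](){ std::cout << "done\n"; }        // invoked once after the last run
+  ).wait();
+  @endcode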
+ + @attention + The executor does not own the given taskflow. It is your responsibility to + ensure the taskflow remains alive during its execution. + */ + template + tf::Future run_until(Taskflow& taskflow, P&& pred, C&& callable); + + /** + @brief runs a moved taskflow and keeps running + it until the predicate becomes true and then invokes the callback + + @param taskflow a moved tf::Taskflow + @param pred a boolean predicate to return @c true for stop + @param callable a callable object to be invoked after this run completes + + @return a tf::Future that holds the result of the execution + + This member function executes a moved taskflow multiple times until + the predicate returns @c true and then invokes the given callable when + the execution completes. + This member function returns a tf::Future object that + eventually holds the result of the execution. + The executor will take care of the lifetime of the moved taskflow. + + @code{.cpp} + tf::Future future = executor.run_until( + std::move(taskflow), + [](){ return rand()%10 == 0 }, [](){ std::cout << "done"; } + ); + // do something else + future.wait(); + @endcode + + This member function is thread-safe. + */ + template + tf::Future run_until(Taskflow&& taskflow, P&& pred, C&& callable); + + /** + @brief runs a target graph and waits until it completes using + an internal worker of this executor + + @tparam T target type which has `tf::Graph& T::graph()` defined + @param target the target task graph object + + The method runs a target graph which has `tf::Graph& T::graph()` defined + and waits until the execution completes. + Unlike the typical flow of calling `tf::Executor::run` series + plus waiting on the result, this method must be called by an internal + worker of this executor. The caller worker will participate in + the work-stealing loop of the scheduler, therby avoiding potential + deadlock caused by blocked waiting. + + @code{.cpp} + tf::Executor executor(2); + tf::Taskflow taskflow; + std::array others; + + std::atomic counter{0}; + + for(size_t n=0; n<1000; n++) { + for(size_t i=0; i<1000; i++) { + others[n].emplace([&](){ counter++; }); + } + taskflow.emplace([&executor, &tf=others[n]](){ + executor.corun(tf); + //executor.run(tf).wait(); <- blocking the worker without doing anything + // will introduce deadlock + }); + } + executor.run(taskflow).wait(); + @endcode + + The method is thread-safe as long as the target is not concurrently + ran by two or more threads. + + @attention + You must call tf::Executor::corun from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun(T& target); + + /** + @brief keeps running the work-stealing loop until the predicate becomes true + + @tparam P predicate type + @param predicate a boolean predicate to indicate when to stop the loop + + The method keeps the caller worker running in the work-stealing loop + until the stop predicate becomes true. + + @code{.cpp} + taskflow.emplace([&](){ + std::future fu = std::async([](){ std::sleep(100s); }); + executor.corun_until([](){ + return fu.wait_for(std::chrono::seconds(0)) == future_status::ready; + }); + }); + @endcode + + @attention + You must call tf::Executor::corun_until from a worker of the calling executor + or an exception will be thrown. + */ + template + void corun_until(P&& predicate); + + /** + @brief waits for all tasks to complete + + This member function waits until all submitted tasks + (e.g., taskflows, asynchronous tasks) to finish. 
+ + @code{.cpp} + executor.run(taskflow1); + executor.run_n(taskflow2, 10); + executor.run_n(taskflow3, 100); + executor.wait_for_all(); // wait until the above submitted taskflows finish + @endcode + */ + void wait_for_all(); + + /** + @brief queries the number of worker threads + + Each worker represents one unique thread spawned by an executor + upon its construction time. + + @code{.cpp} + tf::Executor executor(4); + std::cout << executor.num_workers(); // 4 + @endcode + */ + size_t num_workers() const noexcept; + + /** + @brief queries the number of running topologies at the time of this call + + When a taskflow is submitted to an executor, a topology is created to store + runtime metadata of the running taskflow. + When the execution of the submitted taskflow finishes, + its corresponding topology will be removed from the executor. + + @code{.cpp} + executor.run(taskflow); + std::cout << executor.num_topologies(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_topologies() const; + + /** + @brief queries the number of running taskflows with moved ownership + + @code{.cpp} + executor.run(std::move(taskflow)); + std::cout << executor.num_taskflows(); // 0 or 1 (taskflow still running) + @endcode + */ + size_t num_taskflows() const; + + /** + @brief queries the id of the caller thread in this executor + + Each worker has an unique id in the range of @c 0 to @c N-1 associated with + its parent executor. + If the caller thread does not belong to the executor, @c -1 is returned. + + @code{.cpp} + tf::Executor executor(4); // 4 workers in the executor + executor.this_worker_id(); // -1 (main thread is not a worker) + + taskflow.emplace([&](){ + std::cout << executor.this_worker_id(); // 0, 1, 2, or 3 + }); + executor.run(taskflow); + @endcode + */ + int this_worker_id() const; + + // -------------------------------------------------------------------------- + // Observer methods + // -------------------------------------------------------------------------- + + /** + @brief constructs an observer to inspect the activities of worker threads + + @tparam Observer observer type derived from tf::ObserverInterface + @tparam ArgsT argument parameter pack + + @param args arguments to forward to the constructor of the observer + + @return a shared pointer to the created observer + + Each executor manages a list of observers with shared ownership with callers. + For each of these observers, the two member functions, + tf::ObserverInterface::on_entry and tf::ObserverInterface::on_exit + will be called before and after the execution of a task. + + This member function is not thread-safe. + */ + template + std::shared_ptr make_observer(ArgsT&&... args); + + /** + @brief removes an observer from the executor + + This member function is not thread-safe. + */ + template + void remove_observer(std::shared_ptr observer); + + /** + @brief queries the number of observers + */ + size_t num_observers() const noexcept; + + // -------------------------------------------------------------------------- + // Async Task Methods + // -------------------------------------------------------------------------- + + /** + @brief runs a given function asynchronously + + @tparam F callable type + + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates an asynchronous task to run the given function + and return a @std_future object that eventually will hold the result + of the return value. 
+ + @code{.cpp} + std::future future = executor.async([](){ + std::cout << "create an asynchronous task and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(F&& func); + + /** + @brief runs a given function asynchronously and gives a name to this task + + @tparam F callable type + + @param name name of the asynchronous task + @param func callable object + + @return a @std_future that will hold the result of the execution + + The method creates and assigns a name to an asynchronous task + to run the given function, + returning @std_future object that eventually will hold the result + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::future future = executor.async("name", [](){ + std::cout << "create an asynchronous task with a name and returns 1\n"; + return 1; + }); + future.get(); + @endcode + + This member function is thread-safe. + */ + template + auto async(const std::string& name, F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + + @code{.cpp} + executor.silent_async([](){ + std::cout << "create an asynchronous task with no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(F&& func); + + /** + @brief similar to tf::Executor::async but does not return a future object + + @tparam F callable type + + @param name assigned name to the task + @param func callable object + + This member function is more efficient than tf::Executor::async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + executor.silent_async("name", [](){ + std::cout << "create an asynchronous task with a name and no return\n"; + }); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template + void silent_async(const std::string& name, F&& func); + + // -------------------------------------------------------------------------- + // Silent Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + executor.silent_dependent_async([](){ printf("C runs after A and B\n"); }, A, B); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. 
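+
+  A slightly larger sketch (task names assumed for illustration): a diamond
+  dependency in which @c D waits on @c B and @c C, which both wait on @c A.
+
+  @code{.cpp}
+  tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); });
+  tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B after A\n"); }, A);
+  tf::AsyncTask C = executor.silent_dependent_async([](){ printf("C after A\n"); }, A);
+  executor.silent_dependent_async([](){ printf("D after B and C\n"); }, B, C);
+  executor.wait_for_all();
+  @endcode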
+ */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, A, B + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, Tasks&&... tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a tf::AsyncTask handle + + This member function is more efficient than tf::Executor::dependent_async + and is encouraged to use when you do not want a @std_future to + acquire the result or synchronize the execution. + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Assigned task names will appear in the observers of the executor. 
+ + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + executor.silent_dependent_async( + "C", [](){ printf("C runs after A and B\n"); }, array.begin(), array.end() + ); + executor.wait_for_all(); + @endcode + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + tf::AsyncTask silent_dependent_async(const std::string& name, F&& func, I first, I last); + + // -------------------------------------------------------------------------- + // Dependent Async Methods + // -------------------------------------------------------------------------- + + /** + @brief runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async([](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async([](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + fuC.get(); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + auto dependent_async(F&& func, Tasks&&... tasks); + + /** + @brief names and runs the given function asynchronously + when the given dependents finish + + @tparam F callable type + @tparam Tasks task types convertible to tf::AsyncTask + + @param name assigned name to the task + @param func callable object + @param tasks asynchronous tasks on which this execution depends + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + tf::AsyncTask A = executor.silent_dependent_async("A", [](){ printf("A\n"); }); + tf::AsyncTask B = executor.silent_dependent_async("B", [](){ printf("B\n"); }); + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + A, B + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template ...>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, Tasks&&... 
tasks); + + /** + @brief runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + + @code{.cpp} + std::array array { + executor.silent_dependent_async([](){ printf("A\n"); }), + executor.silent_dependent_async([](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. + */ + template , AsyncTask>, void>* = nullptr + > + auto dependent_async(F&& func, I first, I last); + + /** + @brief names and runs the given function asynchronously + when the given range of dependents finish + + @tparam F callable type + @tparam I iterator type + + @param name assigned name to the task + @param func callable object + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + + @return a pair of a tf::AsyncTask handle and + a @std_future that holds the result of the execution + + The example below creates three named asynchronous tasks, @c A, @c B, and @c C, + in which task @c C runs after task @c A and task @c B. + Task @c C returns a pair of its tf::AsyncTask handle and a std::future + that eventually will hold the result of the execution. + Assigned task names will appear in the observers of the executor. + + @code{.cpp} + std::array array { + executor.silent_dependent_async("A", [](){ printf("A\n"); }), + executor.silent_dependent_async("B", [](){ printf("B\n"); }) + }; + auto [C, fuC] = executor.dependent_async( + "C", + [](){ + printf("C runs after A and B\n"); + return 1; + }, + array.begin(), array.end() + ); + assert(fuC.get()==1); // C finishes, which in turns means both A and B finish + @endcode + + You can mixed the use of tf::AsyncTask handles + returned by Executor::dependent_async and Executor::silent_dependent_async + when specifying task dependencies. + + This member function is thread-safe. 
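+
+  A sketch of collecting the dependents in a container first (the container
+  type is an illustrative choice); handles returned by silent_dependent_async
+  can be mixed in freely:
+
+  @code{.cpp}
+  std::vector<tf::AsyncTask> deps {
+    executor.silent_dependent_async([](){ printf("A\n"); }),
+    executor.silent_dependent_async([](){ printf("B\n"); })
+  };
+  auto [C, fuC] = executor.dependent_async(
+    [](){ printf("C after A and B\n"); return 1; },
+    deps.begin(), deps.end()
+  );
+  assert(fuC.get() == 1); // C finished, which implies A and B finished
+  @endcode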
+ */ + template , AsyncTask>, void>* = nullptr + > + auto dependent_async(const std::string& name, F&& func, I first, I last); + + private: - //template - //void _invoke_cudaflow_task_external(cudaFlow&, P&&, bool); + const size_t _MAX_STEALS; + + std::condition_variable _topology_cv; + std::mutex _taskflows_mutex; + std::mutex _topology_mutex; + std::mutex _wsq_mutex; + std::mutex _asyncs_mutex; + + size_t _num_topologies {0}; + + std::unordered_map _wids; + std::vector _threads; + std::vector _workers; + std::list _taskflows; + + std::unordered_set> _asyncs; + + Notifier _notifier; + + TaskQueue _wsq; + + std::atomic _done {0}; + + std::shared_ptr _worker_interface; + std::unordered_set> _observers; + + Worker* _this_worker(); + + bool _wait_for_task(Worker&, Node*&); + + void _observer_prologue(Worker&, Node*); + void _observer_epilogue(Worker&, Node*); + void _spawn(size_t); + void _exploit_task(Worker&, Node*&); + void _explore_task(Worker&, Node*&); + void _schedule(Worker&, Node*); + void _schedule(Node*); + void _schedule(Worker&, const SmallVector&); + void _schedule(const SmallVector&); + void _set_up_topology(Worker*, Topology*); + void _tear_down_topology(Worker&, Topology*); + void _tear_down_async(Node*); + void _tear_down_dependent_async(Worker&, Node*); + void _tear_down_invoke(Worker&, Node*); + void _increment_topology(); + void _decrement_topology(); + void _decrement_topology_and_notify(); + void _invoke(Worker&, Node*); + void _invoke_static_task(Worker&, Node*); + void _invoke_dynamic_task(Worker&, Node*); + void _consume_graph(Worker&, Node*, Graph&); + void _detach_dynamic_task(Worker&, Node*, Graph&); + void _invoke_condition_task(Worker&, Node*, SmallVector&); + void _invoke_multi_condition_task(Worker&, Node*, SmallVector&); + void _invoke_module_task(Worker&, Node*); + void _invoke_async_task(Worker&, Node*); + void _invoke_dependent_async_task(Worker&, Node*); + void _process_async_dependent(Node*, tf::AsyncTask&, size_t&); + void _schedule_async_task(Node*); + + template + void _corun_until(Worker&, P&&); + + template + auto _make_promised_async(std::promise&&, F&&); }; // Constructor -inline Executor::Executor(size_t N) : - _VICTIM_BEG {0}, - _VICTIM_END {N - 1}, - _MAX_STEALS {(N + 1) << 1}, - _MAX_YIELDS {100}, +inline Executor::Executor(size_t N, std::shared_ptr wix) : + _MAX_STEALS {((N+1) << 1)}, + _threads {N}, _workers {N}, - _notifier {N} { - + _notifier {N}, + _worker_interface {std::move(wix)} { + if(N == 0) { TF_THROW("no cpu workers to execute taskflows"); } - + _spawn(N); // instantite the default observer if requested @@ -289,25 +1125,22 @@ inline Executor::Executor(size_t N) : // Destructor inline Executor::~Executor() { - + // wait for all topologies to complete wait_for_all(); - + // shut down the scheduler _done = true; _notifier.notify(true); - + for(auto& t : _threads){ t.join(); - } - - // flush the default observer - //_flush_tfprof(); + } } // Function: num_workers -inline size_t Executor::num_workers() const { +inline size_t Executor::num_workers() const noexcept { return _workers.size(); } @@ -315,140 +1148,169 @@ inline size_t Executor::num_workers() const { inline size_t Executor::num_topologies() const { return _num_topologies; } - -// Function: async -template -auto Executor::async(F&& f, ArgsT&&... 
args) { - - _increment_topology(); - - using T = std::invoke_result_t; - using R = std::conditional_t, void, std::optional>; - std::promise p; - - auto tpg = std::make_shared(); - - Future fu(p.get_future(), tpg); - - auto node = node_pool.animate( - std::in_place_type_t{}, - [p=make_moc(std::move(p)), f=std::forward(f), args...] - (bool cancel) mutable { - if constexpr(std::is_same_v) { - if(!cancel) { - f(args...); - } - p.object.set_value(); - } - else { - p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); - } - }, - std::move(tpg) - ); - - _schedule(node); - - return fu; +// Function: num_taskflows +inline size_t Executor::num_taskflows() const { + return _taskflows.size(); } -// Function: silent_async -template -void Executor::silent_async(F&& f, ArgsT&&... args) { - - _increment_topology(); - - Node* node = node_pool.animate( - std::in_place_type_t{}, - [f=std::forward(f), args...] () mutable { - f(args...); - } - ); - - _schedule(node); +// Function: _this_worker +inline Worker* Executor::_this_worker() { + auto itr = _wids.find(std::this_thread::get_id()); + return itr == _wids.end() ? nullptr : &_workers[itr->second]; } // Function: this_worker_id inline int Executor::this_worker_id() const { - auto worker = per_thread.worker; - return worker ? static_cast(worker->_id) : -1; + auto i = _wids.find(std::this_thread::get_id()); + return i == _wids.end() ? -1 : static_cast(_workers[i->second]._id); } // Procedure: _spawn inline void Executor::_spawn(size_t N) { + + std::mutex mutex; + std::condition_variable cond; + size_t n=0; + for(size_t id=0; id void { - per_thread.worker = &w; + _threads[id] = std::thread([this] ( + Worker& w, std::mutex& mutex, std::condition_variable& cond, size_t& n + ) -> void { + + // assign the thread + w._thread = &_threads[w._id]; + + // enables the mapping + { + std::scoped_lock lock(mutex); + _wids[std::this_thread::get_id()] = w._id; + if(n++; n == num_workers()) { + cond.notify_one(); + } + } Node* t = nullptr; + + // before entering the scheduler (work-stealing loop), + // call the user-specified prologue function + if(_worker_interface) { + _worker_interface->scheduler_prologue(w); + } + + // must use 1 as condition instead of !done because + // the previous worker may stop while the following workers + // are still preparing for entering the scheduling loop + std::exception_ptr ptr{nullptr}; + try { + while(1) { + + // execute the tasks. + _exploit_task(w, t); + + // wait for tasks + if(_wait_for_task(w, t) == false) { + break; + } + } + } + catch(...) { + ptr = std::current_exception(); + } + + // call the user-specified epilogue function + if(_worker_interface) { + _worker_interface->scheduler_epilogue(w, ptr); + } - // must use 1 as condition instead of !done - while(1) { - - // execute the tasks. 
- _exploit_task(w, t); + }, std::ref(_workers[id]), std::ref(mutex), std::ref(cond), std::ref(n)); + + // POSIX-like system can use the following to affine threads to cores + //cpu_set_t cpuset; + //CPU_ZERO(&cpuset); + //CPU_SET(id, &cpuset); + //pthread_setaffinity_np( + // _threads[id].native_handle(), sizeof(cpu_set_t), &cpuset + //); + } + + std::unique_lock lock(mutex); + cond.wait(lock, [&](){ return n==N; }); +} - // wait for tasks - if(_wait_for_task(w, t) == false) { - break; +// Function: _corun_until +template +void Executor::_corun_until(Worker& w, P&& stop_predicate) { + + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + exploit: + + while(!stop_predicate()) { + + //exploit: + + if(auto t = w._wsq.pop(); t) { + _invoke(w, t); + } + else { + size_t num_steals = 0; + + explore: + + t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); + + if(t) { + _invoke(w, t); + goto exploit; + } + else if(!stop_predicate()) { + if(num_steals++ > _MAX_STEALS) { + std::this_thread::yield(); } + w._vtm = rdvtm(w._rdgen); + goto explore; } - - }, std::ref(_workers[id])); + else { + break; + } + } } } // Function: _explore_task inline void Executor::_explore_task(Worker& w, Node*& t) { - + //assert(_workers[w].wsq.empty()); - assert(!t); + //assert(!t); size_t num_steals = 0; size_t num_yields = 0; - std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); - - //while(!_done) { - // - // size_t vtm = rdvtm(w._rdgen); - // - // t = (vtm == w._id) ? _wsq[d].steal() : _workers[vtm].wsq[d].steal(); - - // if(t) { - // break; - // } - - // if(num_steal++ > _MAX_STEALS) { - // std::this_thread::yield(); - // if(num_yields++ > _MAX_YIELDS) { - // break; - // } - // } - //} - + std::uniform_int_distribution rdvtm(0, _workers.size()-1); + + // Here, we write do-while to make the worker steal at once + // from the assigned victim. do { t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); if(t) { break; } - + if(num_steals++ > _MAX_STEALS) { std::this_thread::yield(); - if(num_yields++ > _MAX_YIELDS) { + if(num_yields++ > 100) { break; } } - + w._vtm = rdvtm(w._rdgen); } while(!_done); @@ -456,92 +1318,58 @@ inline void Executor::_explore_task(Worker& w, Node*& t) { // Procedure: _exploit_task inline void Executor::_exploit_task(Worker& w, Node*& t) { - - if(t) { - - if(_num_actives.fetch_add(1) == 0 && _num_thieves == 0) { - _notifier.notify(false); - } - - while(t) { - _invoke(w, t); - t = w._wsq.pop(); - } - - --_num_actives; + while(t) { + _invoke(w, t); + t = w._wsq.pop(); } } // Function: _wait_for_task inline bool Executor::_wait_for_task(Worker& worker, Node*& t) { - wait_for_task: - - assert(!t); - - ++_num_thieves; - explore_task: _explore_task(worker, t); - + + // The last thief who successfully stole a task will wake up + // another thief worker to avoid starvation. if(t) { - if(_num_thieves.fetch_sub(1) == 1) { - _notifier.notify(false); - } + _notifier.notify(false); return true; } + // ---- 2PC guard ---- _notifier.prepare_wait(worker._waiter); - - //if(auto vtm = _find_vtm(me); vtm != _workers.size()) { - if(!_wsq.empty()) { + if(!_wsq.empty()) { _notifier.cancel_wait(worker._waiter); - //t = (vtm == me) ? 
 _wsq.steal() : _workers[vtm].wsq.steal();
-
- t = _wsq.steal(); // must steal here
- if(t) {
- if(_num_thieves.fetch_sub(1) == 1) {
- _notifier.notify(false);
- }
- return true;
- }
- else {
- worker._vtm = worker._id;
- goto explore_task;
- }
+ worker._vtm = worker._id;
+ goto explore_task;
 }
-
+
 if(_done) {
 _notifier.cancel_wait(worker._waiter);
 _notifier.notify(true);
- --_num_thieves;
 return false;
 }
-
- if(_num_thieves.fetch_sub(1) == 1) {
- if(_num_actives) {
+
+ // We need to use index-based scanning to avoid a data race
+ // with _spawn which may initialize a worker at the same time.
+ for(size_t vtm=0; vtm<_workers.size(); vtm++) {
+ if(!_workers[vtm]._wsq.empty()) {
 _notifier.cancel_wait(worker._waiter);
- goto wait_for_task;
- }
- // check all queues again
- for(auto& w : _workers) {
- if(!w._wsq.empty()) {
- worker._vtm = w._id;
- _notifier.cancel_wait(worker._waiter);
- goto wait_for_task;
- }
+ worker._vtm = vtm;
+ goto explore_task;
 }
 }
-
+ // Now I really need to relinquish myself to others
 _notifier.commit_wait(worker._waiter);
- return true;
+ goto explore_task;
}

-// Function: make_observer
+// Function: make_observer
template
std::shared_ptr Executor::make_observer(ArgsT&&... args) {
@@ -549,10 +1377,10 @@ std::shared_ptr Executor::make_observer(ArgsT&&... args) {
 std::is_base_of_v,
 "Observer must be derived from ObserverInterface"
 );
-
- // use a local variable to mimic the constructor
+
+ // use a local variable to mimic the constructor
 auto ptr = std::make_shared(std::forward(args)...);
-
+
 ptr->set_up(_workers.size());

 _observers.emplace(std::static_pointer_cast(ptr));
@@ -563,7 +1391,7 @@ std::shared_ptr Executor::make_observer(ArgsT&&... args) {

// Procedure: remove_observer
template
void Executor::remove_observer(std::shared_ptr ptr) {
-
+
 static_assert(
 std::is_base_of_v,
 "Observer must be derived from ObserverInterface"
@@ -573,114 +1401,170 @@ void Executor::remove_observer(std::shared_ptr ptr) {
}

// Function: num_observers
-inline size_t Executor::num_observers() const {
+inline size_t Executor::num_observers() const noexcept {
 return _observers.size();
}

// Procedure: _schedule
-// The main procedure to schedule a give task node.
-// Each task node has two types of tasks - regular and subflow.
-inline void Executor::_schedule(Node* node) {
+inline void Executor::_schedule(Worker& worker, Node* node) {

- //assert(_workers.size() != 0);
-
- // caller is a worker to this pool
- auto worker = per_thread.worker;
-
- if(worker != nullptr && worker->_executor == this) {
- worker->_wsq.push(node);
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = node->_priority;
+
+ node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+ // caller is a worker to this pool - starting at v3.5 we do not use
+ // any complicated notification mechanism as the experimental result
+ // has shown no significant advantage.
+ if(worker._executor == this) {
+ worker._wsq.push(node, p);
+ _notifier.notify(false);
 return;
 }

- // other threads
 {
 std::lock_guard lock(_wsq_mutex);
- _wsq.push(node);
+ _wsq.push(node, p);
 }

 _notifier.notify(false);
}

// Procedure: _schedule
-// The main procedure to schedule a set of task nodes.
-// Each task node has two types of tasks - regular and subflow.
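The 2PC guard in `_wait_for_task` above follows the classic two-phase wait protocol: announce the intent to sleep, re-check all queues, then either cancel or commit the wait, so a notification that arrives in between is never lost. The following is a minimal sketch of that prepare/cancel/commit idea for a single waiter, using a mutex and condition variable rather than the lock-free tf::Notifier (the class and member names here are illustrative, not Taskflow's API):

@code{.cpp}
#include <condition_variable>
#include <mutex>

// Illustrative two-phase waiter, not Taskflow code.
class TwoPhaseWaiter {
 public:
  void prepare_wait() {                    // phase 1: announce intent
    std::lock_guard<std::mutex> lock(_mutex);
    _observed = _epoch;                    // snapshot before re-checking queues
  }
  void cancel_wait() {}                    // found work; nothing to undo here
  void commit_wait() {                     // phase 2: sleep unless notified
    std::unique_lock<std::mutex> lock(_mutex);
    _cv.wait(lock, [this] { return _epoch != _observed; });
  }
  void notify() {
    { std::lock_guard<std::mutex> lock(_mutex); ++_epoch; }
    _cv.notify_one();
  }
 private:
  std::mutex _mutex;
  std::condition_variable _cv;
  unsigned long _epoch = 0;                // bumped by every notification
  unsigned long _observed = 0;
};
@endcode

A notification arriving between prepare_wait() and commit_wait() bumps _epoch past the snapshot, so the wait predicate is already true and the waiter does not go to sleep.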
-inline void Executor::_schedule(const std::vector& nodes) {
-
- //assert(_workers.size() != 0);
+inline void Executor::_schedule(Node* node) {

+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = node->_priority;
+
+ node->_state.fetch_or(Node::READY, std::memory_order_release);
+
+ {
+ std::lock_guard lock(_wsq_mutex);
+ _wsq.push(node, p);
+ }
+
+ _notifier.notify(false);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(Worker& worker, const SmallVector& nodes) {

+ // We need to capture the node count to avoid accessing the nodes
 // vector while the parent topology is removed!
 const auto num_nodes = nodes.size();
+
 if(num_nodes == 0) {
 return;
 }

- // worker thread
- auto worker = per_thread.worker;
-
- if(worker != nullptr && worker->_executor == this) {
+ // caller is a worker to this pool - starting at v3.5 we do not use
+ // any complicated notification mechanism as the experimental result
+ // has shown no significant advantage.
+ if(worker._executor == this) {
 for(size_t i=0; i<num_nodes; i++) {
- worker->_wsq.push(nodes[i]);
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ auto p = nodes[i]->_priority;
+ nodes[i]->_state.fetch_or(Node::READY, std::memory_order_release);
+ worker._wsq.push(nodes[i], p);
+ _notifier.notify(false);
 }
 return;
 }
-
- // other threads
+
 {
 std::lock_guard lock(_wsq_mutex);
 for(size_t k=0; k<num_nodes; k++) {
- _wsq.push(nodes[k]);
+ auto p = nodes[k]->_priority;
+ nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+ _wsq.push(nodes[k], p);
 }
 }
-
+
+ _notifier.notify_n(num_nodes);
+}
+
+// Procedure: _schedule
+inline void Executor::_schedule(const SmallVector& nodes) {
+
+ // parent topology may be removed!
+ const auto num_nodes = nodes.size();
+
+ if(num_nodes == 0) {
+ return;
+ }
+
+ // We need to fetch p before the release such that the read
+ // operation is synchronized properly with other threads to
+ // avoid a data race.
+ {
+ std::lock_guard lock(_wsq_mutex);
+ for(size_t k=0; k<num_nodes; k++) {
+ auto p = nodes[k]->_priority;
+ nodes[k]->_state.fetch_or(Node::READY, std::memory_order_release);
+ _wsq.push(nodes[k], p);
+ }
+ }
+
 _notifier.notify_n(num_nodes);
}

// Procedure: _invoke
inline void Executor::_invoke(Worker& worker, Node* node) {
-
+
+ // synchronize all outstanding memory operations caused by reordering
+ while(!(node->_state.load(std::memory_order_acquire) & Node::READY));
+
+ begin_invoke:
+
 // no need to do other things if the topology is cancelled
- //if(node->_topology && node->_topology->_is_cancelled) {
 if(node->_is_cancelled()) {
- _tear_down_invoke(node, true);
+ _tear_down_invoke(worker, node);
 return;
 }

 // if acquiring semaphore(s) exists, acquire them first
 if(node->_semaphores && !node->_semaphores->to_acquire.empty()) {
- std::vector nodes;
+ SmallVector nodes;
 if(!node->_acquire_all(nodes)) {
- _schedule(nodes);
+ _schedule(worker, nodes);
 return;
 }
- node->_set_state(Node::ACQUIRED);
+ node->_state.fetch_or(Node::ACQUIRED, std::memory_order_release);
 }

- // Here we need to fetch the num_successors first to avoid the invalid memory
- // access caused by topology clear.
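The comment repeated in all three `_schedule` overloads describes a publish pattern: the scheduler reads `_priority` and finishes its plain writes first, then sets `Node::READY` with a release store; the acquire spin at the top of `_invoke` pairs with it, making those writes visible before the node runs. A reduced, self-contained sketch of that release/acquire pairing (the types and names below are illustrative, not Taskflow's):

@code{.cpp}
#include <atomic>
#include <cassert>
#include <thread>

struct Slot {
  int priority = 0;                  // plain field published below
  std::atomic<unsigned> state{0};
  static constexpr unsigned READY = 1;
};

int main() {
  Slot s;
  std::thread producer([&s] {
    s.priority = 42;                 // plain write happens-before the release
    s.state.fetch_or(Slot::READY, std::memory_order_release);
  });
  // the acquire load pairs with the release store: once READY is
  // observed, priority == 42 is guaranteed to be visible
  while(!(s.state.load(std::memory_order_acquire) & Slot::READY));
  assert(s.priority == 42);
  producer.join();
}
@endcode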
- const auto num_successors = node->num_successors(); - // condition task - int cond = -1; - + //int cond = -1; + SmallVector conds; + // switch is faster than nested if-else due to jump table switch(node->_handle.index()) { // static task case Node::STATIC:{ _invoke_static_task(worker, node); - } + } break; - + // dynamic task case Node::DYNAMIC: { _invoke_dynamic_task(worker, node); } break; - + // condition task case Node::CONDITION: { - _invoke_condition_task(worker, node, cond); + _invoke_condition_task(worker, node, conds); + } + break; + + // multi-condition task + case Node::MULTI_CONDITION: { + _invoke_multi_condition_task(worker, node, conds); } break; @@ -693,114 +1577,129 @@ inline void Executor::_invoke(Worker& worker, Node* node) { // async task case Node::ASYNC: { _invoke_async_task(worker, node); - _tear_down_invoke(node, false); - return ; - } - break; - - // silent async task - case Node::SILENT_ASYNC: { - _invoke_silent_async_task(worker, node); - _tear_down_invoke(node, false); + _tear_down_async(node); return ; } break; - // cudaflow task - case Node::CUDAFLOW: { - _invoke_cudaflow_task(worker, node); + // dependent async task + case Node::DEPENDENT_ASYNC: { + _invoke_dependent_async_task(worker, node); + _tear_down_dependent_async(worker, node); + if(worker._cache) { + node = worker._cache; + goto begin_invoke; + } + return; } - break; + break; - // monostate + // monostate (placeholder) default: break; } // if releasing semaphores exist, release them if(node->_semaphores && !node->_semaphores->to_release.empty()) { - _schedule(node->_release_all()); + _schedule(worker, node->_release_all()); } - - // We MUST recover the dependency since the graph may have cycles. - // This must be done before scheduling the successors, otherwise this might cause - // race condition on the _dependents - if(node->_has_state(Node::BRANCHED)) { - node->_join_counter = node->num_strong_dependents(); - } - else { - node->_join_counter = node->num_dependents(); - } - - // acquire the parent flow counter - auto& j = (node->_parent) ? node->_parent->_join_counter : - node->_topology->_join_counter; - // At this point, the node storage might be destructed (to be verified) - // case 1: non-condition task - if(node->_handle.index() != Node::CONDITION) { - for(size_t i=0; i_successors[i]->_join_counter) == 0) { - j.fetch_add(1); - _schedule(node->_successors[i]); - } - } + // Reset the join counter to support the cyclic control flow. + // + We must do this before scheduling the successors to avoid race + // condition on _dependents. + // + We must use fetch_add instead of direct assigning + // because the user-space call on "invoke" may explicitly schedule + // this task again (e.g., pipeline) which can access the join_counter. 
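The point about `fetch_add` versus direct assignment is subtle enough to deserve a toy illustration before the reset code that follows: if another thread (for example, a pipeline stage re-scheduling this task) concurrently decrements the counter, a plain store can overwrite that decrement, while a read-modify-write preserves it. A hypothetical sketch, not Taskflow code:

@code{.cpp}
#include <atomic>

std::atomic<int> join_counter{0};

// reset for the next round of a cyclic graph
void reset_with_fetch_add() {
  join_counter.fetch_add(2, std::memory_order_relaxed);  // folds in concurrent updates
}

void reset_with_store() {
  // lost-update hazard: may erase a concurrent fetch_sub from another thread
  join_counter.store(2, std::memory_order_relaxed);
}

// another thread marking one dependency as satisfied
void signal_one_dependency() {
  join_counter.fetch_sub(1, std::memory_order_acq_rel);
}
@endcode

If `signal_one_dependency()` interleaves with a plain store, the counter can end at 2 instead of 1 and the decrement is lost; `fetch_add` always preserves it.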
+ if((node->_state.load(std::memory_order_relaxed) & Node::CONDITIONED)) {
+ node->_join_counter.fetch_add(node->num_strong_dependents(), std::memory_order_relaxed);
 }
- // case 2: condition task
 else {
- if(cond >= 0 && static_cast(cond) < num_successors) {
- auto s = node->_successors[cond];
- s->_join_counter.store(0); // seems redundant but just for invariant
- j.fetch_add(1);
- _schedule(s);
- }
+ node->_join_counter.fetch_add(node->num_dependents(), std::memory_order_relaxed);
 }
-
- // tear_down the invoke
- _tear_down_invoke(node, false);
-}

-// Procedure: _tear_down_async
-inline void Executor::_tear_down_async(Node* node) {
- if(node->_parent) {
- node->_parent->_join_counter.fetch_sub(1);
- }
- else {
- _decrement_topology_and_notify();
- }
- node_pool.recycle(node);
-}
+ // acquire the parent flow counter
+ auto& j = (node->_parent) ? node->_parent->_join_counter :
+ node->_topology->_join_counter;

-// Procedure: _tear_down_invoke
-inline void Executor::_tear_down_invoke(Node* node, bool cancel) {
+ // Here, we want to cache the latest successor with the highest priority
+ worker._cache = nullptr;
+ auto max_p = static_cast<unsigned>(TaskPriority::MAX);

+ // Invoke the task based on the corresponding type
 switch(node->_handle.index()) {

- // async task needs to carry out the promise
- case Node::ASYNC:
- if(cancel) {
- std::get(node->_handle).work(true);
- }
- _tear_down_async(node);
- break;

- // silent async doesn't need to carry out the promise
- case Node::SILENT_ASYNC:
- _tear_down_async(node);
+ // condition and multi-condition tasks
+ case Node::CONDITION:
+ case Node::MULTI_CONDITION: {
+ for(auto cond : conds) {
+ if(cond >= 0 && static_cast(cond) < node->_successors.size()) {
+ auto s = node->_successors[cond];
+ // zeroing the join counter for invariant
+ s->_join_counter.store(0, std::memory_order_relaxed);
+ j.fetch_add(1, std::memory_order_relaxed);
+ if(s->_priority <= max_p) {
+ if(worker._cache) {
+ _schedule(worker, worker._cache);
+ }
+ worker._cache = s;
+ max_p = s->_priority;
+ }
+ else {
+ _schedule(worker, s);
+ }
+ }
+ }
+ }
 break;

- // tear down topology if the node is the last leaf
+ // non-condition task
 default: {
- if(node->_parent == nullptr) {
- if(node->_topology->_join_counter.fetch_sub(1) == 1) {
- _tear_down_topology(node->_topology);
+ for(size_t i=0; i<node->_successors.size(); ++i) {
+ //if(auto s = node->_successors[i]; --(s->_join_counter) == 0) {
+ if(auto s = node->_successors[i];
+ s->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+ j.fetch_add(1, std::memory_order_relaxed);
+ if(s->_priority <= max_p) {
+ if(worker._cache) {
+ _schedule(worker, worker._cache);
+ }
+ worker._cache = s;
+ max_p = s->_priority;
+ }
+ else {
+ _schedule(worker, s);
+ }
 }
 }
- else { // joined subflow
- node->_parent->_join_counter.fetch_sub(1);
- }
 }
 break;
 }

+ // tear down the invoke
+ _tear_down_invoke(worker, node);
+
+ // perform tail recursion elimination for the right-most child to reduce
+ // the number of expensive pop/push operations through the task queue
+ if(worker._cache) {
+ node = worker._cache;
+ //node->_state.fetch_or(Node::READY, std::memory_order_release);
+ goto begin_invoke;
+ }
+}
+
+// Procedure: _tear_down_invoke
+inline void Executor::_tear_down_invoke(Worker& worker, Node* node) {
+ // we must check parent first before subtracting the join counter,
+ // or it can introduce a data race
+ if(node->_parent == nullptr) {
+ if(node->_topology->_join_counter.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+ _tear_down_topology(worker,
node->_topology); + } + } + // joined subflow + else { + node->_parent->_join_counter.fetch_sub(1, std::memory_order_release); + } } // Procedure: _observer_prologue @@ -820,7 +1719,17 @@ inline void Executor::_observer_epilogue(Worker& worker, Node* node) { // Procedure: _invoke_static_task inline void Executor::_invoke_static_task(Worker& worker, Node* node) { _observer_prologue(worker, node); - std::get(node->_handle).work(); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + std::get_if<1>(&work)->operator()(rt); + break; + } _observer_epilogue(worker, node); } @@ -829,145 +1738,138 @@ inline void Executor::_invoke_dynamic_task(Worker& w, Node* node) { _observer_prologue(w, node); - auto& handle = std::get(node->_handle); + auto handle = std::get_if(&node->_handle); - handle.subgraph.clear(); + handle->subgraph._clear(); - Subflow sf(*this, node, handle.subgraph); + Subflow sf(*this, w, node, handle->subgraph); - handle.work(sf); + handle->work(sf); if(sf._joinable) { - _invoke_dynamic_task_internal(w, node, handle.subgraph, false); + _consume_graph(w, node, handle->subgraph); } - + _observer_epilogue(w, node); } -// Procedure: _invoke_dynamic_task_external -inline void Executor::_invoke_dynamic_task_external(Node*p, Graph& g, bool detach) { +// Procedure: _detach_dynamic_task +inline void Executor::_detach_dynamic_task( + Worker& w, Node* p, Graph& g +) { + + // graph is empty and has no async tasks + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { + return; + } + + SmallVector src; + + for(auto n : g._nodes) { - auto worker = per_thread.worker; + n->_state.store(Node::DETACHED, std::memory_order_relaxed); + n->_set_up_join_counter(); + n->_topology = p->_topology; + n->_parent = nullptr; + + if(n->num_dependents() == 0) { + src.push_back(n); + } + } + + { + std::lock_guard lock(p->_topology->_taskflow._mutex); + p->_topology->_taskflow._graph._merge(std::move(g)); + } - assert(worker && worker->_executor == this); - - _invoke_dynamic_task_internal(*worker, p, g, detach); + p->_topology->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); + _schedule(w, src); } -// Procedure: _invoke_dynamic_task_internal -inline void Executor::_invoke_dynamic_task_internal( - Worker& w, Node* p, Graph& g, bool detach -) { +// Procedure: _consume_graph +inline void Executor::_consume_graph(Worker& w, Node* p, Graph& g) { // graph is empty and has no async tasks - if(g.empty() && p->_join_counter == 0) { + if(g.empty() && p->_join_counter.load(std::memory_order_acquire) == 0) { return; } - std::vector src; + SmallVector src; for(auto n : g._nodes) { - - n->_topology = p->_topology; + n->_state.store(0, std::memory_order_relaxed); n->_set_up_join_counter(); - - if(detach) { - n->_parent = nullptr; - n->_set_state(Node::DETACHED); - } - else { - n->_parent = p; - } - + n->_topology = p->_topology; + n->_parent = p; if(n->num_dependents() == 0) { src.push_back(n); } } + p->_join_counter.fetch_add(src.size(), std::memory_order_relaxed); - // detach here - if(detach) { - - { - std::lock_guard lock(p->_topology->_taskflow._mtx); - p->_topology->_taskflow._graph.merge(std::move(g)); - } - - p->_topology->_join_counter.fetch_add(src.size()); - _schedule(src); - } - // join here - else { - p->_join_counter.fetch_add(src.size()); - _schedule(src); - Node* t = nullptr; - - std::uniform_int_distribution rdvtm(_VICTIM_BEG, _VICTIM_END); - - 
while(p->_join_counter != 0) { - - t = w._wsq.pop(); - - exploit: - - if(t) { - _invoke(w, t); - } - else { - explore: - t = (w._id == w._vtm) ? _wsq.steal() : _workers[w._vtm]._wsq.steal(); - if(t) { - goto exploit; - } - else if(p->_join_counter != 0){ - std::this_thread::yield(); - w._vtm = rdvtm(w._rdgen); - goto explore; - } - else { - break; - } - } - } - } + _schedule(w, src); + _corun_until(w, [p] () -> bool { return p->_join_counter.load(std::memory_order_acquire) == 0; }); } // Procedure: _invoke_condition_task inline void Executor::_invoke_condition_task( - Worker& worker, Node* node, int& cond + Worker& worker, Node* node, SmallVector& conds ) { _observer_prologue(worker, node); - cond = std::get(node->_handle).work(); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = { std::get_if<0>(&work)->operator()() }; + break; + + case 1: + Runtime rt(*this, worker, node); + conds = { std::get_if<1>(&work)->operator()(rt) }; + break; + } _observer_epilogue(worker, node); } -// Procedure: _invoke_cudaflow_task -inline void Executor::_invoke_cudaflow_task(Worker& worker, Node* node) { - _observer_prologue(worker, node); - std::get(node->_handle).work(*this, node); +// Procedure: _invoke_multi_condition_task +inline void Executor::_invoke_multi_condition_task( + Worker& worker, Node* node, SmallVector& conds +) { + _observer_prologue(worker, node); + auto& work = std::get_if(&node->_handle)->work; + switch(work.index()) { + case 0: + conds = std::get_if<0>(&work)->operator()(); + break; + + case 1: + Runtime rt(*this, worker, node); + conds = std::get_if<1>(&work)->operator()(rt); + break; + } _observer_epilogue(worker, node); } - // Procedure: _invoke_module_task inline void Executor::_invoke_module_task(Worker& w, Node* node) { _observer_prologue(w, node); - auto module = std::get(node->_handle).module; - _invoke_dynamic_task_internal(w, node, module->_graph, false); - _observer_epilogue(w, node); + _consume_graph( + w, node, std::get_if(&node->_handle)->graph + ); + _observer_epilogue(w, node); } // Procedure: _invoke_async_task inline void Executor::_invoke_async_task(Worker& w, Node* node) { _observer_prologue(w, node); - std::get(node->_handle).work(false); - _observer_epilogue(w, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(w, node); } -// Procedure: _invoke_silent_async_task -inline void Executor::_invoke_silent_async_task(Worker& w, Node* node) { +// Procedure: _invoke_dependent_async_task +inline void Executor::_invoke_dependent_async_task(Worker& w, Node* node) { _observer_prologue(w, node); - std::get(node->_handle).work(); - _observer_epilogue(w, node); + std::get_if(&node->_handle)->work(); + _observer_epilogue(w, node); } // Function: run @@ -975,17 +1877,33 @@ inline tf::Future Executor::run(Taskflow& f) { return run_n(f, 1, [](){}); } +// Function: run +inline tf::Future Executor::run(Taskflow&& f) { + return run_n(std::move(f), 1, [](){}); +} + // Function: run template tf::Future Executor::run(Taskflow& f, C&& c) { return run_n(f, 1, std::forward(c)); } +// Function: run +template +tf::Future Executor::run(Taskflow&& f, C&& c) { + return run_n(std::move(f), 1, std::forward(c)); +} + // Function: run_n inline tf::Future Executor::run_n(Taskflow& f, size_t repeat) { return run_n(f, repeat, [](){}); } +// Function: run_n +inline tf::Future Executor::run_n(Taskflow&& f, size_t repeat) { + return run_n(std::move(f), repeat, [](){}); +} + // Function: run_n template tf::Future Executor::run_n(Taskflow& f, size_t 
repeat, C&& c) { @@ -994,28 +1912,149 @@ tf::Future Executor::run_n(Taskflow& f, size_t repeat, C&& c) { ); } -// Function: run_until +// Function: run_n +template +tf::Future Executor::run_n(Taskflow&& f, size_t repeat, C&& c) { + return run_until( + std::move(f), [repeat]() mutable { return repeat-- == 0; }, std::forward(c) + ); +} + +// Function: run_until template tf::Future Executor::run_until(Taskflow& f, P&& pred) { return run_until(f, std::forward
<P>
(pred), [](){}); } -// Function: _set_up_topology -inline void Executor::_set_up_topology(Topology* tpg) { +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred) { + return run_until(std::move(f), std::forward
<P>
(pred), [](){});
+}

- if(tpg->_is_cancelled) {
- _tear_down_topology(tpg);
- return;
+// Function: run_until
+template
+tf::Future Executor::run_until(Taskflow& f, P&& p, C&& c) {
+
+ _increment_topology();
+
+ // Need to check emptiness under the lock since a dynamic task may
+ // define detached blocks that modify the taskflow at the same time
+ bool empty;
+ {
+ std::lock_guard lock(f._mutex);
+ empty = f.empty();
 }

- tpg->_sources.clear();
- tpg->_taskflow._graph.clear_detached();
+ // No need to create a real topology but returns a dummy future
+ if(empty || p()) {
+ c();
+ std::promise promise;
+ promise.set_value();
+ _decrement_topology_and_notify();
+ return tf::Future(promise.get_future(), std::monostate{});
+ }
+
+ // create a topology for this run
+ auto t = std::make_shared(f, std::forward
<P>
(p), std::forward(c)); + + // need to create future before the topology got torn down quickly + tf::Future future(t->_promise.get_future(), t); + + // modifying topology needs to be protected under the lock + { + std::lock_guard lock(f._mutex); + f._topologies.push(t); + if(f._topologies.size() == 1) { + _set_up_topology(_this_worker(), t.get()); + } + } + + return future; +} + +// Function: run_until +template +tf::Future Executor::run_until(Taskflow&& f, P&& pred, C&& c) { + + std::list::iterator itr; + + { + std::scoped_lock lock(_taskflows_mutex); + itr = _taskflows.emplace(_taskflows.end(), std::move(f)); + itr->_satellite = itr; + } + + return run_until(*itr, std::forward
<P>
(pred), std::forward(c)); +} + +// Function: corun +template +void Executor::corun(T& target) { + + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun must be called by a worker of the executor"); + } + + Node parent; // dummy parent + _consume_graph(*w, &parent, target.graph()); +} + +// Function: corun_until +template +void Executor::corun_until(P&& predicate) { + auto w = _this_worker(); + + if(w == nullptr) { + TF_THROW("corun_until must be called by a worker of the executor"); + } + + _corun_until(*w, std::forward
<P>
(predicate)); +} + +// Procedure: _increment_topology +inline void Executor::_increment_topology() { + std::lock_guard lock(_topology_mutex); + ++_num_topologies; +} + +// Procedure: _decrement_topology_and_notify +inline void Executor::_decrement_topology_and_notify() { + std::lock_guard lock(_topology_mutex); + if(--_num_topologies == 0) { + _topology_cv.notify_all(); + } +} + +// Procedure: _decrement_topology +inline void Executor::_decrement_topology() { + std::lock_guard lock(_topology_mutex); + --_num_topologies; +} + +// Procedure: wait_for_all +inline void Executor::wait_for_all() { + std::unique_lock lock(_topology_mutex); + _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); +} + +// Function: _set_up_topology +inline void Executor::_set_up_topology(Worker* worker, Topology* tpg) { + + // ---- under taskflow lock ---- + + tpg->_sources.clear(); + tpg->_taskflow._graph._clear_detached(); + // scan each node in the graph and build up the links for(auto node : tpg->_taskflow._graph._nodes) { - + node->_topology = tpg; - node->_clear_state(); + node->_parent = nullptr; + node->_state.store(0, std::memory_order_relaxed); if(node->num_dependents() == 0) { tpg->_sources.push_back(node); @@ -1024,12 +2063,18 @@ inline void Executor::_set_up_topology(Topology* tpg) { node->_set_up_join_counter(); } - tpg->_join_counter = tpg->_sources.size(); - _schedule(tpg->_sources); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + + if(worker) { + _schedule(*worker, tpg->_sources); + } + else { + _schedule(tpg->_sources); + } } // Function: _tear_down_topology -inline void Executor::_tear_down_topology(Topology* tpg) { +inline void Executor::_tear_down_topology(Worker& worker, Topology* tpg) { auto &f = tpg->_taskflow; @@ -1037,130 +2082,71 @@ inline void Executor::_tear_down_topology(Topology* tpg) { // case 1: we still need to run the topology again if(!tpg->_is_cancelled && !tpg->_pred()) { - assert(tpg->_join_counter == 0); - tpg->_join_counter = tpg->_sources.size(); - _schedule(tpg->_sources); + //assert(tpg->_join_counter == 0); + std::lock_guard lock(f._mutex); + tpg->_join_counter.store(tpg->_sources.size(), std::memory_order_relaxed); + _schedule(worker, tpg->_sources); } // case 2: the final run of this topology else { - // TODO: if the topology is cancelled, need to release all constraints - + // TODO: if the topology is cancelled, need to release all semaphores if(tpg->_call != nullptr) { tpg->_call(); } - f._mtx.lock(); - // If there is another run (interleave between lock) - if(f._topologies.size() > 1) { - - assert(tpg->_join_counter == 0); + if(std::unique_lock lock(f._mutex); f._topologies.size()>1) { + //assert(tpg->_join_counter == 0); // Set the promise tpg->_promise.set_value(); f._topologies.pop(); tpg = f._topologies.front().get(); - f._mtx.unlock(); - // decrement the topology but since this is not the last we don't notify _decrement_topology(); - _set_up_topology(tpg); + // set up topology needs to be under the lock or it can + // introduce memory order error with pop + _set_up_topology(&worker, tpg); } else { - assert(f._topologies.size() == 1); + //assert(f._topologies.size() == 1); - // Need to back up the promise first here becuz taskflow might be + // Need to back up the promise first here becuz taskflow might be // destroy soon after calling get auto p {std::move(tpg->_promise)}; - // Back up lambda capture in case it has the topology pointer, - // to avoid it releasing on pop_front ahead of _mtx.unlock & + // Back up lambda 
capture in case it has the topology pointer, + // to avoid it releasing on pop_front ahead of _mutex.unlock & // _promise.set_value. Released safely when leaving scope. - auto c { std::move( tpg->_call ) }; + auto c {std::move(tpg->_call)}; + // Get the satellite if any + auto s {f._satellite}; + + // Now we remove the topology from this taskflow f._topologies.pop(); - f._mtx.unlock(); + //f._mutex.unlock(); + lock.unlock(); - // We set the promise in the end in case taskflow leaves before taskflow + // We set the promise in the end in case taskflow leaves the scope. + // After set_value, the caller will return from wait p.set_value(); _decrement_topology_and_notify(); - } - } -} - -// Function: run_until -template -tf::Future Executor::run_until(Taskflow& f, P&& pred, C&& c) { - - _increment_topology(); - - // Special case of predicate - if(f.empty() || pred()) { - std::promise promise; - promise.set_value(); - _decrement_topology_and_notify(); - return tf::Future(promise.get_future(), std::monostate{}); - } - - // Multi-threaded execution. - bool run_now {false}; - - // create a topology for this run - auto tpg = std::make_shared( - f, std::forward
<P>
(pred), std::forward(c) - ); - - // need to create future before the topology got torn down quickly - tf::Future future(tpg->_promise.get_future(), tpg); - { - std::lock_guard lock(f._mtx); - - f._topologies.push(tpg); - - if(f._topologies.size() == 1) { - run_now = true; + // remove the taskflow if it is managed by the executor + // TODO: in the future, we may need to synchronize on wait + // (which means the following code should the moved before set_value) + if(s) { + std::scoped_lock lock(_taskflows_mutex); + _taskflows.erase(*s); + } } } - - // Notice here calling schedule may cause the topology to be removed sonner - // before the function leaves. - if(run_now) { - _set_up_topology(tpg.get()); - } - - return future; -} - -// Procedure: _increment_topology -inline void Executor::_increment_topology() { - std::lock_guard lock(_topology_mutex); - ++_num_topologies; -} - -// Procedure: _decrement_topology_and_notify -inline void Executor::_decrement_topology_and_notify() { - std::lock_guard lock(_topology_mutex); - if(--_num_topologies == 0) { - _topology_cv.notify_all(); - } -} - -// Procedure: _decrement_topology -inline void Executor::_decrement_topology() { - std::lock_guard lock(_topology_mutex); - --_num_topologies; -} - -// Procedure: wait_for_all -inline void Executor::wait_for_all() { - std::unique_lock lock(_topology_mutex); - _topology_cv.wait(lock, [&](){ return _num_topologies == 0; }); } // ############################################################################ @@ -1169,86 +2155,159 @@ inline void Executor::wait_for_all() { inline void Subflow::join() { + // assert(this_worker().worker == &_worker); + if(!_joinable) { TF_THROW("subflow not joinable"); } - _executor._invoke_dynamic_task_external(_parent, _graph, false); + // only the parent worker can join the subflow + _executor._consume_graph(_worker, _parent, _graph); _joinable = false; } inline void Subflow::detach() { + // assert(this_worker().worker == &_worker); + if(!_joinable) { TF_THROW("subflow already joined or detached"); } - _executor._invoke_dynamic_task_external(_parent, _graph, true); + // only the parent worker can detach the subflow + _executor._detach_dynamic_task(_worker, _parent, _graph); _joinable = false; } -// Function: async -template -auto Subflow::async(F&& f, ArgsT&&... args) { +// ############################################################################ +// Forward Declaration: Runtime +// ############################################################################ + +// Procedure: schedule +inline void Runtime::schedule(Task task) { + + auto node = task._node; + // need to keep the invariant: when scheduling a task, the task must have + // zero dependency (join counter is 0) + // or we can encounter bug when inserting a nested flow (e.g., module task) + node->_join_counter.store(0, std::memory_order_relaxed); + + auto& j = node->_parent ? 
node->_parent->_join_counter : + node->_topology->_join_counter; + j.fetch_add(1, std::memory_order_relaxed); + _executor._schedule(_worker, node); +} - _parent->_join_counter.fetch_add(1); +// Procedure: corun +template +void Runtime::corun(T&& target) { + + // dynamic task (subflow) + if constexpr(is_dynamic_task_v) { + Graph graph; + Subflow sf(_executor, _worker, _parent, graph); + target(sf); + if(sf._joinable) { + _executor._consume_graph(_worker, _parent, graph); + } + } + // a composable graph object with `tf::Graph& T::graph()` defined + else { + _executor._consume_graph(_worker, _parent, target.graph()); + } +} - //using T = typename function_traits::return_type; - using T = std::invoke_result_t; - using R = std::conditional_t, void, std::optional>; +// Procedure: corun_until +template +void Runtime::corun_until(P&& predicate) { + _executor._corun_until(_worker, std::forward
<P>
(predicate)); +} - std::promise p; +// Function: _silent_async +template +void Runtime::_silent_async(Worker& w, const std::string& name, F&& f) { - auto tpg = std::make_shared(); + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); - Future fu(p.get_future(), tpg); + auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, + std::in_place_type_t{}, std::forward(f) + ); + + _executor._schedule(w, node); +} + +// Function: silent_async +template +void Runtime::silent_async(F&& f) { + _silent_async(*_executor._this_worker(), "", std::forward(f)); +} + +// Function: silent_async +template +void Runtime::silent_async(const std::string& name, F&& f) { + _silent_async(*_executor._this_worker(), name, std::forward(f)); +} + +// Function: silent_async_unchecked +template +void Runtime::silent_async_unchecked(const std::string& name, F&& f) { + _silent_async(_worker, name, std::forward(f)); +} + +// Function: _async +template +auto Runtime::_async(Worker& w, const std::string& name, F&& f) { + + _parent->_join_counter.fetch_add(1, std::memory_order_relaxed); + + using R = std::invoke_result_t>; + + std::promise p; + auto fu{p.get_future()}; auto node = node_pool.animate( + name, 0, _parent->_topology, _parent, 0, std::in_place_type_t{}, - [p=make_moc(std::move(p)), f=std::forward(f), args...] - (bool cancel) mutable { + [p=make_moc(std::move(p)), f=std::forward(f)] () mutable { if constexpr(std::is_same_v) { - if(!cancel) { - f(args...); - } + f(); p.object.set_value(); } else { - p.object.set_value(cancel ? std::nullopt : std::make_optional(f(args...))); + p.object.set_value(f()); } - }, - std::move(tpg) + } ); - node->_topology = _parent->_topology; - node->_parent = _parent; - - _executor._schedule(node); + _executor._schedule(w, node); return fu; } -// Function: silent_async -template -void Subflow::silent_async(F&& f, ArgsT&&... args) { +// Function: async +template +auto Runtime::async(F&& f) { + return _async(*_executor._this_worker(), "", std::forward(f)); +} - _parent->_join_counter.fetch_add(1); +// Function: async +template +auto Runtime::async(const std::string& name, F&& f) { + return _async(*_executor._this_worker(), name, std::forward(f)); +} - auto node = node_pool.animate( - std::in_place_type_t{}, - [f=std::forward(f), args...] () mutable { - f(args...); - } - ); +// Function: join +inline void Runtime::join() { + corun_until([this] () -> bool { + return _parent->_join_counter.load(std::memory_order_acquire) == 0; + }); +} - node->_topology = _parent->_topology; - node->_parent = _parent; +} // end of namespace tf ----------------------------------------------------- - _executor._schedule(node); -} -} // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/core/flow_builder.hpp b/lib/taskflow/core/flow_builder.hpp index a022138..3e90d8e 100644 --- a/lib/taskflow/core/flow_builder.hpp +++ b/lib/taskflow/core/flow_builder.hpp @@ -1,18 +1,22 @@ #pragma once #include "task.hpp" +#include "../algorithm/partitioner.hpp" -/** +/** @file flow_builder.hpp @brief flow builder include file */ namespace tf { -/** +/** @class FlowBuilder -@brief building methods of a task dependency graph +@brief class to build a task dependency graph + +The class provides essential methods to construct a task dependency graph +from which tf::Taskflow and tf::Subflow are derived. 
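
For example, since tf::Taskflow derives from tf::FlowBuilder, these methods can be
called directly on a taskflow object; the short sketch below builds a diamond
dependency graph out of four static tasks:

@code{.cpp}
tf::Taskflow taskflow;
auto [A, B, C, D] = taskflow.emplace(
  [] () { std::cout << "A"; },
  [] () { std::cout << "B"; },
  [] () { std::cout << "C"; },
  [] () { std::cout << "D"; }
);
A.precede(B, C);  // A runs before B and C
D.succeed(B, C);  // D runs after B and C
@endcode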
*/ class FlowBuilder { @@ -20,773 +24,1069 @@ class FlowBuilder { friend class Executor; public: - - /** - @brief creates a static task - - @tparam C callable type constructible from std::function - @param callable callable to construct a static task + /** + @brief constructs a flow builder with a graph + */ + FlowBuilder(Graph& graph); - @return a tf::Task handle + /** + @brief creates a static task - The following example creates a static task. + @tparam C callable type constructible from std::function - @code{.cpp} - tf::Task static_task = taskflow.emplace([](){}); - @endcode - - Please refer to @ref StaticTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a dynamic task - - @tparam C callable type constructible from std::function + @param callable callable to construct a static task - @param callable callable to construct a dynamic task + @return a tf::Task handle - @return a tf::Task handle - - The following example creates a dynamic task (tf::Subflow) - that spawns two static tasks. + The following example creates a static task. - @code{.cpp} - tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ - tf::Task static_task1 = sf.emplace([](){}); - tf::Task static_task2 = sf.emplace([](){}); - }); - @endcode - - Please refer to @ref DynamicTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a condition task - - @tparam C callable type constructible from std::function + @code{.cpp} + tf::Task static_task = taskflow.emplace([](){}); + @endcode - @param callable callable to construct a condition task + Please refer to @ref StaticTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - @return a tf::Task handle - - The following example creates an if-else block using one condition task - and three static tasks. - - @code{.cpp} - tf::Taskflow taskflow; - - auto [init, cond, yes, no] = taskflow.emplace( - [] () { }, - [] () { return 0; }, - [] () { std::cout << "yes\n"; }, - [] () { std::cout << "no\n"; } - ); - - // executes yes if cond returns 0, or no if cond returns 1 - cond.precede(yes, no); - cond.succeed(init); - @endcode + /** + @brief creates a dynamic task - Please refer to @ref ConditionalTasking for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); + @tparam C callable type constructible from std::function - /** - @brief creates multiple tasks from a list of callable objects - - @tparam C callable types + @param callable callable to construct a dynamic task - @param callables one or multiple callable objects constructible from each task category + @return a tf::Task handle - @return a tf::Task handle + The following example creates a dynamic task (tf::Subflow) + that spawns two static tasks. - The method returns a tuple of tasks each corresponding to the given - callable target. You can use structured binding to get the return tasks - one by one. - The following example creates four static tasks and assign them to - @c A, @c B, @c C, and @c D using structured binding. 
+ @code{.cpp} + tf::Task dynamic_task = taskflow.emplace([](tf::Subflow& sf){ + tf::Task static_task1 = sf.emplace([](){}); + tf::Task static_task2 = sf.emplace([](){}); + }); + @endcode - @code{.cpp} - auto [A, B, C, D] = taskflow.emplace( - [] () { std::cout << "A"; }, - [] () { std::cout << "B"; }, - [] () { std::cout << "C"; }, - [] () { std::cout << "D"; } - ); - @endcode - */ - template 1), void>* = nullptr> - auto emplace(C&&... callables); + Please refer to @ref DynamicTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - /** - @brief creates a module task from a taskflow + /** + @brief creates a condition task - @param taskflow a taskflow object for the module + @tparam C callable type constructible from std::function - @return a tf::Task handle + @param callable callable to construct a condition task - Please refer to @ref ComposableTasking for details. - */ - Task composed_of(Taskflow& taskflow); + @return a tf::Task handle - /** - @brief creates a placeholder task + The following example creates an if-else block using one condition task + and three static tasks. - @return a tf::Task handle + @code{.cpp} + tf::Taskflow taskflow; - A placeholder task maps to a node in the taskflow graph, but - it does not have any callable work assigned yet. - A placeholder task is different from an empty task handle that - does not point to any node in a graph. + auto [init, cond, yes, no] = taskflow.emplace( + [] () { }, + [] () { return 0; }, + [] () { std::cout << "yes\n"; }, + [] () { std::cout << "no\n"; } + ); - @code{.cpp} - // create a placeholder task with no callable target assigned - tf::Task placeholder = taskflow.placeholder(); - assert(placeholder.empty() == false && placeholder.has_work() == false); - - // create an empty task handle - tf::Task task; - assert(task.empty() == true); - - // assign the task handle to the placeholder task - task = placeholder; - assert(task.empty() == false && task.has_work() == false); - @endcode - */ - Task placeholder(); + // executes yes if cond returns 0, or no if cond returns 1 + cond.precede(yes, no); + cond.succeed(init); + @endcode - /** - @brief creates a %cudaFlow task on the caller's GPU device context + Please refer to @ref ConditionalTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - @tparam C callable type constructible from @c std::function + /** + @brief creates a multi-condition task - @return a tf::Task handle + @tparam C callable type constructible from + std::function()> - This method is equivalent to calling tf::FlowBuilder::emplace_on(callable, d) - where @c d is the caller's device context. - The following example creates a %cudaFlow of two kernel tasks, @c task1 and - @c task2, where @c task1 runs before @c task2. - - @code{.cpp} - taskflow.emplace([&](tf::cudaFlow& cf){ - // create two kernel tasks - tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); - tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); + @param callable callable to construct a multi-condition task - // kernel1 runs before kernel2 - task1.precede(task2); - }); - @endcode + @return a tf::Task handle - Please refer to @ref GPUTaskingcudaFlow and @ref GPUTaskingcudaFlowCapturer - for details. - */ - template , void>* = nullptr - > - Task emplace(C&& callable); - - /** - @brief creates a %cudaFlow task on the given device + The following example creates a multi-condition task that selectively + jumps to two successor tasks. 
- @tparam C callable type constructible from std::function - @tparam D device type, either @c int or @c std::ref (stateful) + @code{.cpp} + tf::Taskflow taskflow; - @return a tf::Task handle - - The following example creates a %cudaFlow of two kernel tasks, @c task1 and - @c task2 on GPU @c 2, where @c task1 runs before @c task2 - - @code{.cpp} - taskflow.emplace_on([&](tf::cudaFlow& cf){ - // create two kernel tasks - tf::cudaTask task1 = cf.kernel(grid1, block1, shm1, kernel1, args1); - tf::cudaTask task2 = cf.kernel(grid2, block2, shm2, kernel2, args2); - - // kernel1 runs before kernel2 - task1.precede(task2); - }, 2); - @endcode - */ - template , void>* = nullptr - > - Task emplace_on(C&& callable, D&& device); + auto [init, cond, branch1, branch2, branch3] = taskflow.emplace( + [] () { }, + [] () { return tf::SmallVector{0, 2}; }, + [] () { std::cout << "branch1\n"; }, + [] () { std::cout << "branch2\n"; }, + [] () { std::cout << "branch3\n"; } + ); - /** - @brief adds adjacent dependency links to a linear list of tasks + // executes branch1 and branch3 when cond returns 0 and 2 + cond.precede(branch1, branch2, branch3); + cond.succeed(init); + @endcode - @param tasks a vector of tasks - */ - void linearize(std::vector& tasks); + Please refer to @ref ConditionalTasking for details. + */ + template , void>* = nullptr + > + Task emplace(C&& callable); - /** - @brief adds adjacent dependency links to a linear list of tasks + /** + @brief creates multiple tasks from a list of callable objects - @param tasks an initializer list of tasks - */ - void linearize(std::initializer_list tasks); + @tparam C callable types - // ------------------------------------------------------------------------ - // parallel iterations - // ------------------------------------------------------------------------ - - /** - @brief constructs a STL-styled parallel-for task - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type + @param callables one or multiple callable objects constructible from each task category - @param first iterator to the beginning (inclusive) - @param last iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator + @return a tf::Task handle - @return a tf::Task handle + The method returns a tuple of tasks each corresponding to the given + callable target. You can use structured binding to get the return tasks + one by one. + The following example creates four static tasks and assign them to + @c A, @c B, @c C, and @c D using structured binding. - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [first, last). By default, we employ the guided partition algorithm with chunk size equal to one. - This method is equivalent to the parallel execution of the following loop: - - @code{.cpp} - for(auto itr=first; itr!=last; itr++) { - callable(*itr); + @code{.cpp} + auto [A, B, C, D] = taskflow.emplace( + [] () { std::cout << "A"; }, + [] () { std::cout << "B"; }, + [] () { std::cout << "C"; }, + [] () { std::cout << "D"; } + ); + @endcode + */ + template 1), void>* = nullptr> + auto emplace(C&&... callables); + + /** + @brief removes a task from a taskflow + + @param task task to remove + + Removes a task and its input and output dependencies from the graph + associated with the flow builder. + If the task does not belong to the graph, nothing will happen. 
+ + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + A.precede(B, C, D); + + // erase A from the taskflow and its dependencies to B, C, and D + taskflow.erase(A); + @endcode + */ + void erase(Task task); + + /** + @brief creates a module task for the target object + + @tparam T target object type + @param object a custom object that defines the method @c T::graph() + + @return a tf::Task handle + + The example below demonstrates a taskflow composition using + the @c composed_of method. + + @code{.cpp} + tf::Taskflow t1, t2; + t1.emplace([](){ std::cout << "t1"; }); + + // t2 is partially composed of t1 + tf::Task comp = t2.composed_of(t1); + tf::Task init = t2.emplace([](){ std::cout << "t2"; }); + init.precede(comp); + @endcode + + The taskflow object @c t2 is composed of another taskflow object @c t1, + preceded by another static task @c init. + When taskflow @c t2 is submitted to an executor, + @c init will run first and then @c comp which spwans its definition + in taskflow @c t1. + + The target @c object being composed must define the method + T::graph() that returns a reference to a graph object of + type tf::Graph such that it can interact with the executor. + For example: + + @code{.cpp} + // custom struct + struct MyObj { + tf::Graph graph; + MyObj() { + tf::FlowBuilder builder(graph); + tf::Task task = builder.emplace([](){ + std::cout << "a task\n"; // static task + }); } - @endcode - - Arguments templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of - the dereferenced iterator type. + Graph& graph() { return graph; } + }; + + MyObj obj; + tf::Task comp = taskflow.composed_of(obj); + @endcode + + Please refer to @ref ComposableTasking for details. + */ + template + Task composed_of(T& object); + + /** + @brief creates a placeholder task + + @return a tf::Task handle + + A placeholder task maps to a node in the taskflow graph, but + it does not have any callable work assigned yet. + A placeholder task is different from an empty task handle that + does not point to any node in a graph. + + @code{.cpp} + // create a placeholder task with no callable target assigned + tf::Task placeholder = taskflow.placeholder(); + assert(placeholder.empty() == false && placeholder.has_work() == false); + + // create an empty task handle + tf::Task task; + assert(task.empty() == true); + + // assign the task handle to the placeholder task + task = placeholder; + assert(task.empty() == false && task.has_work() == false); + @endcode + */ + Task placeholder(); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks a vector of tasks + + This member function creates linear dependencies over a vector of tasks. + + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + std::vector tasks {A, B, C, D} + taskflow.linearize(tasks); // A->B->C->D + @endcode + + */ + void linearize(std::vector& tasks); + + /** + @brief adds adjacent dependency links to a linear list of tasks + + @param tasks an initializer list of tasks + + This member function creates linear dependencies over a list of tasks. 
+ + @code{.cpp} + tf::Task A = taskflow.emplace([](){ std::cout << "A"; }); + tf::Task B = taskflow.emplace([](){ std::cout << "B"; }); + tf::Task C = taskflow.emplace([](){ std::cout << "C"; }); + tf::Task D = taskflow.emplace([](){ std::cout << "D"; }); + taskflow.linearize({A, B, C, D}); // A->B->C->D + @endcode + */ + void linearize(std::initializer_list tasks); + + // ------------------------------------------------------------------------ + // parallel iterations + // ------------------------------------------------------------------------ + + /** + @brief constructs an STL-styled parallel-for task + + @tparam B beginning iterator type + @tparam E ending iterator type + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first iterator to the beginning (inclusive) + @param last iterator to the end (exclusive) + @param callable callable object to apply to the dereferenced iterator + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each object + obtained by dereferencing every iterator in the range [first, last). + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + for(auto itr=first; itr!=last; itr++) { + callable(*itr); + } + @endcode - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each(B&& first, E&& last, C&& callable); - - /** - @brief constructs a STL-styled parallel-for task using the guided partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_guided(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); - - /** - @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. 
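In contrast to the removed per-algorithm variants below, the 3.6 `for_each` above selects the partitioning strategy through its last argument. A hedged usage sketch (tf::StaticPartitioner ships with 3.6; the chunk size of 64 is arbitrary):

@code{.cpp}
std::vector<int> data(1024, 1);
tf::Taskflow taskflow;
tf::Executor executor;

// double every element, scheduling chunks of 64 iterations per task
taskflow.for_each(
  data.begin(), data.end(),
  [] (int& v) { v *= 2; },
  tf::StaticPartitioner(64)
);
executor.run(taskflow).wait();
@endcode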
- */ - template - Task for_each_dynamic(B&& beg, E&& end, C&& callable, H&& chunk_size = 1); - - /** - @brief constructs a STL-styled parallel-for task using the dynamic partition algorithm - - @tparam B beginning iterator type - @tparam E ending iterator type - @tparam C callable type - @tparam H chunk size type - - @param beg iterator to the beginning (inclusive) - @param end iterator to the end (exclusive) - @param callable a callable object to apply to the dereferenced iterator - @param chunk_size chunk size - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each object obtained by dereferencing every iterator in the range [beg, end). The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the dereferenced iterator type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_static( - B&& beg, E&& end, C&& callable, H&& chunk_size = 0 - ); - - /** - @brief constructs an index-based parallel-for task - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - - @param first index of the beginning (inclusive) - @param last index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [first, last) with the step size. By default, we employ the guided partition algorithm with chunk size equal to one. - - This method is equivalent to the parallel execution of the following loop: - - @code{.cpp} - // case 1: step size is positive - for(auto i=first; ilast; i+=step) { - callable(i); - } - @endcode + Please refer to @ref ParallelIterations for details. + */ + template + Task for_each(B first, E last, C callable, P&& part = P()); + + /** + @brief constructs an STL-styled index-based parallel-for task + + @tparam B beginning index type (must be integral) + @tparam E ending index type (must be integral) + @tparam S step type (must be integral) + @tparam C callable type + @tparam P partitioner type (default tf::GuidedPartitioner) + + @param first index of the beginning (inclusive) + @param last index of the end (exclusive) + @param step step size + @param callable callable object to apply to each valid index + @param part partitioning algorithm to schedule parallel iterations + + @return a tf::Task handle + + The task spawns asynchronous tasks that applies the callable object to each index + in the range [first, last) with the step size. + This method is equivalent to the parallel execution of the following loop: + + @code{.cpp} + // case 1: step size is positive + for(auto i=first; i - Task for_each_index(B&& first, E&& last, S&& step, C&& callable); - - /** - @brief constructs an index-based parallel-for task using the guided partition algorithm. 
- - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 1) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_index_guided( - B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 - ); - - /** - @brief constructs an index-based parallel-for task using the dynamic partition algorithm. - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 1) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. - */ - template - Task for_each_index_dynamic( - B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 1 - ); - - /** - @brief constructs an index-based parallel-for task using the static partition algorithm. - - @tparam B beginning index type (must be integral) - @tparam E ending index type (must be integral) - @tparam S step type (must be integral) - @tparam C callable type - @tparam H chunk size type - - @param beg index of the beginning (inclusive) - @param end index of the end (exclusive) - @param step step size - @param callable a callable object to apply to each valid index - @param chunk_size chunk size (default 0) - - @return a tf::Task handle - - The task spawns a subflow that applies the callable object to each index in the range [beg, end) with the step size. The runtime partitions the range into chunks of the given size, where each chunk is processed by a worker. When the given chunk size is zero, the runtime distributes the work evenly across workers. - - Arguments are templated to enable stateful passing using std::reference_wrapper. - The callable needs to take a single argument of the integral index type. - - Please refer to @ref ParallelIterations for details. 
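The index-based loop follows the same pattern; a minimal sketch of the 3.6 partitioner-based tf::FlowBuilder::for_each_index (the default tf::GuidedPartitioner is spelled out here for clarity):

@code{.cpp}
std::vector<double> buf(100);
tf::Taskflow taskflow;

// buf[i] = i/2.0 for i = 0, 2, 4, ..., 98
taskflow.for_each_index(0, 100, 2,
  [&] (int i) { buf[i] = i / 2.0; },
  tf::GuidedPartitioner()
);
@endcode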
- */
- template <typename B, typename E, typename S, typename C, typename H = size_t>
- Task for_each_index_static(
-   B&& beg, E&& end, S&& step, C&& callable, H&& chunk_size = 0
- );
+ // case 2: step size is negative
+ for(auto i=first; i>last; i+=step) {
+   callable(i);
+ }
+ @endcode
- // ------------------------------------------------------------------------
- // reduction
- // ------------------------------------------------------------------------
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take a single argument of the integral index type.
- /**
- @brief constructs an STL-styled parallel-reduce task
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm.
-
- This method is equivalent to the parallel execution of the following loop:
-
- @code{.cpp}
- for(auto itr=first; itr!=last; itr++) {
-   init = bop(init, *itr);
- }
- @endcode
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ Please refer to @ref ParallelIterations for details.
+ */
+ template <typename B, typename E, typename S, typename C, typename P = GuidedPartitioner>
+ Task for_each_index(
+   B first, E last, S step, C callable, P&& part = P()
+ );
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename O>
- Task reduce(B&& first, E&& last, T& init, O&& bop);
+ // ------------------------------------------------------------------------
+ // transform
+ // ------------------------------------------------------------------------
- /**
- @brief constructs an STL-styled parallel-reduce task using the guided partition algorithm
+ /**
+ @brief constructs a parallel-transform task
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ @tparam B beginning input iterator type
+ @tparam E ending input iterator type
+ @tparam O output iterator type
+ @tparam C callable type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
-
- @return a tf::Task handle
+ @param first1 iterator to the beginning of the first range
+ @param last1 iterator to the end of the first range
+ @param d_first iterator to the beginning of the output range
+ @param c a unary callable to apply to dereferenced input elements
+ @param part partitioning algorithm to schedule parallel iterations
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
+ @return a tf::Task handle
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ The task spawns asynchronous tasks that apply the callable object to an
+ input range and stores the result in another output range.
+ This method is equivalent to the parallel execution of the following loop:
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_guided(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1
- );
-
- /**
- @brief constructs an STL-styled parallel-reduce task using the dynamic partition algorithm
+ @code{.cpp}
+ while (first1 != last1) {
+   *d_first++ = c(*first1++);
+ }
+ @endcode
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take a single argument of the dereferenced
+ iterator type.
+
+ Please refer to @ref ParallelTransforms for details.
+ */
+ template <
+   typename B, typename E, typename O, typename C, typename P = GuidedPartitioner
+ >
+ Task transform(B first1, E last1, O d_first, C c, P&& part = P());
+
+ /**
+ @brief constructs a parallel-transform task
+
+ @tparam B1 beginning input iterator type for the first input range
+ @tparam E1 ending input iterator type for the first input range
+ @tparam B2 beginning input iterator type for the second input range
+ @tparam O output iterator type
+ @tparam C callable type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first1 iterator to the beginning of the first input range
+ @param last1 iterator to the end of the first input range
+ @param first2 iterator to the beginning of the second input range
+ @param d_first iterator to the beginning of the output range
+ @param c a binary operator to apply to dereferenced input elements
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks that apply the callable object to two
+ input ranges and stores the result in another output range.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ while (first1 != last1) {
+   *d_first++ = c(*first1++, *first2++);
+ }
+ @endcode
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ The callable needs to take two arguments of dereferenced elements
+ from the two input ranges.
+
+ Please refer to @ref ParallelTransforms for details.
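+ (For illustration, a minimal sketch exercising the binary transform
+ overload, assuming a tf::Executor and tf::Taskflow as in the surrounding
+ examples.)
+
+ @code{.cpp}
+ tf::Executor executor;
+ tf::Taskflow taskflow;
+ std::vector<int> in1{1, 2, 3}, in2{4, 5, 6}, out(3);
+
+ // binary transform: out[i] = in1[i] + in2[i]
+ taskflow.transform(
+   in1.begin(), in1.end(), in2.begin(), out.begin(),
+   [](int a, int b){ return a + b; }
+ );
+ executor.run(taskflow).wait();  // out == {5, 7, 9}
+ @endcode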
+ */
+ template <
+   typename B1, typename E1, typename B2, typename O, typename C, typename P = GuidedPartitioner,
+   std::enable_if_t<!is_partitioner_v<std::decay_t<C>>, void>* = nullptr
+ >
+ Task transform(B1 first1, E1 last1, B2 first2, O d_first, C c, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // reduction
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs an STL-styled parallel-reduce task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T result type
+ @tparam O binary reducer type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param init initial value of the reduction and the storage for the reduced result
+ @param bop binary operator that will be applied
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks to perform parallel reduction over @c init
+ and the elements in the range [first, last).
+ The reduced result is stored in @c init.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ for(auto itr=first; itr!=last; itr++) {
+   init = bop(init, *itr);
+ }
+ @endcode
- @return a tf::Task handle
- The task spawns a subflow to perform parallel reduction over @c init and the elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ Please refer to @ref ParallelReduction for details.
+ */
+ template <typename B, typename E, typename T, typename O, typename P = GuidedPartitioner>
+ Task reduce(B first, E last, T& init, O bop, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // transform and reduction
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs an STL-styled parallel transform-reduce task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T result type
+ @tparam BOP binary reducer type
+ @tparam UOP unary transformation type
+ @tparam P partitioner type (default tf::GuidedPartitioner)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param init initial value of the reduction and the storage for the reduced result
+ @param bop binary operator that will be applied in unspecified order to the results of @c uop
+ @param uop unary operator that will be applied to transform each element in the range to the result type
+ @param part partitioning algorithm to schedule parallel iterations
+
+ @return a tf::Task handle
+
+ The task spawns asynchronous tasks to perform parallel reduction over @c init and
+ the transformed elements in the range [first, last).
+ The reduced result is stored in @c init.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ for(auto itr=first; itr!=last; itr++) {
+   init = bop(init, uop(*itr));
+ }
+ @endcode
- Please refer to @ref ParallelReduction for details.
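+ (For illustration, a minimal transform-reduce sketch computing a sum of
+ squares, assuming a taskflow and executor as in the examples above.)
+
+ @code{.cpp}
+ std::vector<int> v{1, 2, 3, 4, 5};
+ int sum_of_squares = 0;
+ taskflow.transform_reduce(
+   v.begin(), v.end(), sum_of_squares,
+   std::plus<int>{},            // bop: combines partial results
+   [](int x){ return x * x; }   // uop: transforms each element first
+ );
+ executor.run(taskflow).wait(); // sum_of_squares == 55
+ @endcode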
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_dynamic(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 1
- );
-
- /**
- @brief constructs an STL-styled parallel-reduce task using the static partition algorithm
+ Iterators are templated to enable stateful range using std::reference_wrapper.
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam O binary reducer type
- @tparam H chunk size type
+ Please refer to @ref ParallelReduction for details.
+ */
+ template <
+   typename B, typename E, typename T, typename BOP, typename UOP, typename P = GuidedPartitioner
+ >
+ Task transform_reduce(B first, E last, T& init, BOP bop, UOP uop, P&& part = P());
+
+ // ------------------------------------------------------------------------
+ // scan
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief creates an STL-styled parallel inclusive-scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements using the given binary operator
+ for summation.
+
+ This function generates an @em inclusive scan, meaning that the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+
+ // input is {1, 3, 6, 10, 15}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename BOP>
+ Task inclusive_scan(B first, E last, D d_first, BOP bop);
+
+ /**
+ @brief creates an STL-styled parallel inclusive-scan task with an initial value
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param init initial value
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements (and the initial value)
+ using the given binary operator for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{}, -1
+ );
+ executor.run(taskflow).wait();
+
+ // input is {0, 2, 5, 9, 14}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
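+ (For illustration: the scans above run in place, but @c d_first may also
+ point to a separate output range, leaving the input unchanged. A sketch,
+ assuming a taskflow and executor as above.)
+
+ @code{.cpp}
+ std::vector<int> in{1, 2, 3, 4, 5}, out(5);
+ taskflow.inclusive_scan(
+   in.begin(), in.end(), out.begin(), std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+ // in is unchanged; out == {1, 3, 6, 10, 15}
+ @endcode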
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied
- @param chunk_size chunk size
+ */
+ template <typename B, typename E, typename D, typename BOP, typename T>
+ Task inclusive_scan(B first, E last, D d_first, BOP bop, T init);
+
+ /**
+ @brief creates an STL-styled parallel exclusive-scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam T initial value type
+ @tparam BOP summation operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param init initial value
+ @param bop function to perform summation
+
+ Performs the cumulative sum (aka prefix sum, aka scan) of the input range
+ and writes the result to the output range.
+ Each element of the output range contains the
+ running total of all earlier elements (and the initial value)
+ using the given binary operator for summation.
+
+ This function generates an @em exclusive scan, meaning the N-th element
+ of the output range is the sum of the first N-1 input elements,
+ so the N-th input element is not included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.exclusive_scan(
+   input.begin(), input.end(), input.begin(), -1, std::plus<int>{}
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, 0, 2, 5, 9}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename T, typename BOP>
+ Task exclusive_scan(B first, E last, D d_first, T init, BOP bop);
+
+ // ------------------------------------------------------------------------
+ // transform scan
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief creates an STL-styled parallel transform-inclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{},
+   [] (int item) { return -item; }
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, -3, -6, -10, -15}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
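+ (For illustration, a side-by-side sketch of the two scan flavors over the
+ same input, assuming a taskflow and executor as above; the two tasks are
+ independent because they write to disjoint output ranges.)
+
+ @code{.cpp}
+ std::vector<int> in{1, 2, 3, 4, 5}, inc(5), exc(5);
+ taskflow.inclusive_scan(in.begin(), in.end(), inc.begin(), std::plus<int>{});
+ taskflow.exclusive_scan(in.begin(), in.end(), exc.begin(), 0, std::plus<int>{});
+ executor.run(taskflow).wait();
+ // inc == {1, 3, 6, 10, 15}  (N-th output includes the N-th input)
+ // exc == {0, 1, 3, 6, 10}   (N-th output excludes the N-th input)
+ @endcode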
+ */
+ template <typename B, typename E, typename D, typename BOP, typename UOP>
+ Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop);
+
+ /**
+ @brief creates an STL-styled parallel transform-inclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+ @param init initial value
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements (including an initial value)
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em inclusive scan, meaning the N-th element
+ of the output range is the sum of the first N input elements,
+ so the N-th input element is included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   input.begin(), input.end(), input.begin(), std::plus<int>{},
+   [] (int item) { return -item; },
+   -1
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-2, -4, -7, -11, -16}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
+ */
+ template <typename B, typename E, typename D, typename BOP, typename UOP, typename T>
+ Task transform_inclusive_scan(B first, E last, D d_first, BOP bop, UOP uop, T init);
+
+ /**
+ @brief creates an STL-styled parallel transform-exclusive scan task
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam D destination iterator type
+ @tparam BOP summation operator type
+ @tparam UOP transform operator type
+ @tparam T initial value type
+
+ @param first start of input range
+ @param last end of input range
+ @param d_first start of output range (may be the same as input range)
+ @param bop function to perform summation
+ @param uop function to transform elements of the input range
+ @param init initial value
+
+ Writes the cumulative sum (aka prefix sum, aka scan) of the input range
+ to the output range. Each element of the output range contains the
+ running total of all earlier elements (including an initial value)
+ using @c uop to transform the input elements
+ and using @c bop for summation.
+
+ This function generates an @em exclusive scan, meaning the N-th element
+ of the output range is the sum of the first N-1 input elements,
+ so the N-th input element is not included.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 2, 3, 4, 5};
+ taskflow.transform_exclusive_scan(
+   input.begin(), input.end(), input.begin(), -1, std::plus<int>{},
+   [](int item) { return -item; }
+ );
+ executor.run(taskflow).wait();
+
+ // input is {-1, -2, -4, -7, -11}
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelScan for details.
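+ (For illustration, a transform-scan is equivalent to transforming first
+ and scanning second, but in one pass; a sketch assuming a taskflow and
+ executor as above.)
+
+ @code{.cpp}
+ std::vector<int> data{1, 2, 3, 4, 5};
+ taskflow.transform_inclusive_scan(
+   data.begin(), data.end(), data.begin(),
+   std::plus<int>{}, [](int x){ return 2 * x; }
+ );
+ executor.run(taskflow).wait();
+ // transformed input {2, 4, 6, 8, 10} scanned to {2, 6, 12, 20, 30}
+ @endcode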
+ */
+ template <typename B, typename E, typename D, typename T, typename BOP, typename UOP>
+ Task transform_exclusive_scan(B first, E last, D d_first, T init, BOP bop, UOP uop);
+
+ // ------------------------------------------------------------------------
+ // find
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs a task to perform STL-styled find-if algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam UOP unary predicate type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param predicate unary predicate which returns @c true for the required element
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Returns an iterator to the first element in the range [first, last)
+ that satisfies the given criteria (or last if there is no such iterator).
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ auto find_if(InputIt first, InputIt last, UnaryPredicate predicate) {
+   for (; first != last; ++first) {
+     if (predicate(*first)){
+       return first;
+     }
+   }
+   return last;
+ }
+ @endcode
+
+ For example, the code below finds the element that satisfies the given
+ criteria (value plus one is equal to 23) from an input range of 10 elements:
+
+ @code{.cpp}
+ std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8, 9, 11};
+ std::vector<int>::iterator result;
+ taskflow.find_if(
+   input.begin(), input.end(), result, [](int i){ return i+1 == 23; }
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 22);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+ Task find_if(B first, E last, T& result, UOP predicate, P&& part = P());
+
+ /**
+ @brief constructs a task to perform STL-styled find-if-not algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam UOP unary predicate type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param predicate unary predicate which returns @c false for the required element
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Returns an iterator to the first element in the range [first, last)
+ that satisfies the given criteria (or last if there is no such iterator).
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ auto find_if_not(InputIt first, InputIt last, UnaryPredicate predicate) {
+   for (; first != last; ++first) {
+     if (!predicate(*first)){
+       return first;
+     }
+   }
+   return last;
+ }
+ @endcode
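+ (For illustration, a complete find-if sketch passing an explicit
+ partitioner; the partitioner argument otherwise defaults to
+ tf::GuidedPartitioner, assuming the partitioner types from partitioner.hpp.)
+
+ @code{.cpp}
+ std::vector<int> input = {1, 6, 9, 10, 22, 5, 7, 8};
+ std::vector<int>::iterator result;
+ taskflow.find_if(
+   input.begin(), input.end(), result,
+   [](int i){ return i > 20; },
+   tf::StaticPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(result != input.end() && *result == 22);
+ @endcode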
- */
- template <typename B, typename E, typename T, typename O, typename H = size_t>
- Task reduce_static(
-   B&& first, E&& last, T& init, O&& bop, H&& chunk_size = 0
- );
-
- // ------------------------------------------------------------------------
- // transform and reduction
- // ------------------------------------------------------------------------
-
- /**
- @brief constructs an STL-styled parallel transform-reduce task
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of the given chunk size, where each chunk is processed by a worker. By default, we employ the guided partition algorithm.
-
- This method is equivalent to the parallel execution of the following loop:
-
- @code{.cpp}
- for(auto itr=first; itr!=last; itr++) {
-   init = bop(init, uop(*itr));
+ For example, the code below finds the element that satisfies the given
+ criteria (value is not equal to 1) from an input range of 10 elements:
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 22, 1, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.find_if_not(
+   input.begin(), input.end(), result, [](int i){ return i == 1; }
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 22);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename UOP, typename P = GuidedPartitioner>
+ Task find_if_not(B first, E last, T& result, UOP predicate, P&& part = P());
+
+ /**
+ @brief constructs a task to perform STL-styled min-element algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam C comparator type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param comp comparison function object
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Finds the smallest element in the range [first, last)
+ using the given comparison function object.
+ The iterator to that smallest element is stored in @c result.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ if (first == last) {
+   return last;
+ }
+ auto smallest = first;
+ ++first;
+ for (; first != last; ++first) {
+   if (comp(*first, *smallest)) {
+     smallest = first;
+   }
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP>
- Task transform_reduce(B&& first, E&& last, T& init, BOP&& bop, UOP&& uop);
-
- /**
- @brief constructs an STL-styled parallel transform-reduce task using the guided partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_guided(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1
- );
+ }
+ return smallest;
+ @endcode
- /**
- @brief constructs an STL-styled parallel transform-reduce task using the static partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_static(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 0
- );
+ For example, the code below finds the smallest element from an input
+ range of 10 elements.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 1, -1, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.min_element(
+   input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(*result == -1);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
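+ (For illustration, min_element and max_element can be composed in one
+ taskflow, each writing its own result iterator; the partitioner is passed
+ explicitly here since the declarations in this header take it without a
+ default argument.)
+
+ @code{.cpp}
+ std::vector<int> data{3, 1, 4, 1, 5, 9, 2, 6};
+ std::vector<int>::iterator mn, mx;
+ taskflow.min_element(
+   data.begin(), data.end(), mn, std::less<int>{}, tf::GuidedPartitioner()
+ );
+ taskflow.max_element(
+   data.begin(), data.end(), mx, std::less<int>{}, tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ // *mn == 1, *mx == 9
+ @endcode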
+ */
+ template <typename B, typename E, typename T, typename C, typename P>
+ Task min_element(B first, E last, T& result, C comp, P&& part);
+
+ /**
+ @brief constructs a task to perform STL-styled max-element algorithm
+
+ @tparam B beginning iterator type
+ @tparam E ending iterator type
+ @tparam T resulting iterator type
+ @tparam C comparator type
+ @tparam P partitioner type
+
+ @param first start of the input range
+ @param last end of the input range
+ @param result resulting iterator to the found element in the input range
+ @param comp comparison function object
+ @param part partitioning algorithm (default tf::GuidedPartitioner)
+
+ Finds the largest element in the range [first, last)
+ using the given comparison function object.
+ The iterator to that largest element is stored in @c result.
+ This method is equivalent to the parallel execution of the following loop:
+
+ @code{.cpp}
+ if (first == last){
+   return last;
+ }
+ auto largest = first;
+ ++first;
+ for (; first != last; ++first) {
+   if (comp(*largest, *first)) {
+     largest = first;
+   }
+ }
+ return largest;
+ @endcode
+
+ For example, the code below finds the largest element from an input
+ range of 10 elements.
+
+ @code{.cpp}
+ std::vector<int> input = {1, 1, 1, 1, 1, 2, 1, 1, 1, 1};
+ std::vector<int>::iterator result;
+ taskflow.max_element(
+   input.begin(), input.end(), result, std::less<int>(), tf::GuidedPartitioner()
+ );
+ executor.run(taskflow).wait();
+ assert(*result == 2);
+ @endcode
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+ */
+ template <typename B, typename E, typename T, typename C, typename P>
+ Task max_element(B first, E last, T& result, C comp, P&& part);
+
+ // ------------------------------------------------------------------------
+ // sort
+ // ------------------------------------------------------------------------
+
+ /**
+ @brief constructs a dynamic task to perform STL-styled parallel sort
+
+ @tparam B beginning iterator type (random-accessible)
+ @tparam E ending iterator type (random-accessible)
+ @tparam C comparator type
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+ @param cmp comparison operator
+
+ The task spawns asynchronous tasks to sort elements in the range
+ [first, last) in parallel.
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelSort for details.
+ */
+ template <typename B, typename E, typename C>
+ Task sort(B first, E last, C cmp);
+
+ /**
+ @brief constructs a dynamic task to perform STL-styled parallel sort using
+ the @c std::less<T> comparator, where @c T is the element type
+
+ @tparam B beginning iterator type (random-accessible)
+ @tparam E ending iterator type (random-accessible)
+
+ @param first iterator to the beginning (inclusive)
+ @param last iterator to the end (exclusive)
+
+ The task spawns asynchronous tasks to sort elements in the range
+ [first, last) in parallel using the @c std::less<T> comparator,
+ where @c T is the dereferenced iterator type.
+
+ Iterators are templated to enable stateful range using std::reference_wrapper.
+
+ Please refer to @ref ParallelSort for details.
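+ (For illustration, a minimal parallel-sort sketch with a custom
+ comparator, assuming a taskflow and executor as above.)
+
+ @code{.cpp}
+ std::vector<int> data{5, 3, 1, 4, 2};
+ taskflow.sort(
+   data.begin(), data.end(), [](int a, int b){ return a > b; }  // descending
+ );
+ executor.run(taskflow).wait();  // data == {5, 4, 3, 2, 1}
+ @endcode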
+ */
+ template <typename B, typename E>
+ Task sort(B first, E last);

- /**
- @brief constructs an STL-styled parallel transform-reduce task using the dynamic partition algorithm
-
- @tparam B beginning iterator type
- @tparam E ending iterator type
- @tparam T result type
- @tparam BOP binary reducer type
- @tparam UOP unary transformation type
- @tparam H chunk size type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param init initial value of the reduction and the storage for the reduced result
- @param bop binary operator that will be applied in unspecified order to the results of @c uop
- @param uop unary operator that will be applied to transform each element in the range to the result type
- @param chunk_size chunk size
-
- @return a tf::Task handle
-
- The task spawns a subflow to perform parallel reduction over @c init and the transformed elements in the range [first, last). The reduced result is stored in @c init. The runtime partitions the range into chunks of size @c chunk_size, where each chunk is processed by a worker.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelReduction for details.
- */
- template <typename B, typename E, typename T, typename BOP, typename UOP, typename H = size_t>
- Task transform_reduce_dynamic(
-   B&& first, E&& last, T& init, BOP&& bop, UOP&& uop, H&& chunk_size = 1
- );
-
- // ------------------------------------------------------------------------
- // sort
- // ------------------------------------------------------------------------
-
- /**
- @brief constructs a dynamic task to perform STL-styled parallel sort
-
- @tparam B beginning iterator type (random-accessible)
- @tparam E ending iterator type (random-accessible)
- @tparam C comparator type
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
- @param cmp comparison function object
-
- The task spawns a subflow to sort elements in the range
- [first, last) in parallel.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelSort for details.
- */
- template <typename B, typename E, typename C>
- Task sort(B&& first, E&& last, C&& cmp);
-
- /**
- @brief constructs a dynamic task to perform STL-styled parallel sort using
- the @c std::less<T> comparator, where @c T is the element type
-
- @tparam B beginning iterator type (random-accessible)
- @tparam E ending iterator type (random-accessible)
-
- @param first iterator to the beginning (inclusive)
- @param last iterator to the end (exclusive)
-
- The task spawns a subflow to sort elements in the range
- [first, last) in parallel using the @c std::less<T> comparator,
- where @c T is the dereferenced iterator type.
-
- Arguments are templated to enable stateful passing using std::reference_wrapper.
-
- Please refer to @ref ParallelSort for details.
- */
- template <typename B, typename E>
- Task sort(B&& first, E&& last);

 protected:

- /**
- @brief constructs a flow builder with a graph
- */
- FlowBuilder(Graph& graph);
-
- /**
- @brief associated graph object
- */
- Graph& _graph;
-
+
+ /**
+ @brief associated graph object
+ */
+ Graph& _graph;
+
 private:

- template <typename L>
- void _linearize(L&);
+ template <typename L>
+ void _linearize(L&);
 };

 // Constructor
@@ -797,7 +1097,7 @@ inline FlowBuilder::FlowBuilder(Graph& graph) :
 // Function: emplace
 template <typename C, std::enable_if_t<is_static_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Static>{}, std::forward<C>(c)
   ));
 }

@@ -805,7 +1105,7 @@ Task FlowBuilder::emplace(C&& c) {
 // Function: emplace
 template <typename C, std::enable_if_t<is_dynamic_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Dynamic>{}, std::forward<C>(c)
   ));
 }

@@ -813,28 +1113,63 @@ Task FlowBuilder::emplace(C&& c) {
 // Function: emplace
 template <typename C, std::enable_if_t<is_condition_task_v<C>, void>*>
 Task FlowBuilder::emplace(C&& c) {
-  return Task(_graph.emplace_back(
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
     std::in_place_type_t<Node::Condition>{}, std::forward<C>(c)
   ));
 }

+// Function: emplace
+template <typename C, std::enable_if_t<is_multi_condition_task_v<C>, void>*>
+Task FlowBuilder::emplace(C&& c) {
+  return Task(_graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::MultiCondition>{}, std::forward<C>(c)
+  ));
+}
+
 // Function: emplace
 template <typename... C, std::enable_if_t<(sizeof...(C)>1), void>*>
 auto FlowBuilder::emplace(C&&... cs) {
   return std::make_tuple(emplace(std::forward<C>(cs))...);
 }

-// Function: composed_of
-inline Task FlowBuilder::composed_of(Taskflow& taskflow) {
-  auto node = _graph.emplace_back(
-    std::in_place_type_t<Node::Module>{}, &taskflow
+// Function: erase
+inline void FlowBuilder::erase(Task task) {
+
+  if (!task._node) {
+    return;
+  }
+
+  task.for_each_dependent([&] (Task dependent) {
+    auto& S = dependent._node->_successors;
+    if(auto I = std::find(S.begin(), S.end(), task._node); I != S.end()) {
+      S.erase(I);
+    }
+  });
+
+  task.for_each_successor([&] (Task dependent) {
+    auto& D = dependent._node->_dependents;
+    if(auto I = std::find(D.begin(), D.end(), task._node); I != D.end()) {
+      D.erase(I);
+    }
+  });
+
+  _graph._erase(task._node);
+}
+
+// Function: composed_of
+template <typename T>
+Task FlowBuilder::composed_of(T& object) {
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Module>{}, object
   );
   return Task(node);
 }

 // Function: placeholder
 inline Task FlowBuilder::placeholder() {
-  auto node = _graph.emplace_back();
+  auto node = _graph._emplace_back("", 0, nullptr, nullptr, 0,
+    std::in_place_type_t<Node::Placeholder>{}
+  );
   return Task(node);
 }

@@ -858,7 +1193,7 @@ void FlowBuilder::_linearize(L& keys) {
 // Procedure: linearize
 inline void FlowBuilder::linearize(std::vector<Task>& keys) {
-  _linearize(keys);
+  _linearize(keys);
 }

 // Procedure: linearize
@@ -868,20 +1203,22 @@ inline void FlowBuilder::linearize(std::initializer_list<Task> keys) {

// ----------------------------------------------------------------------------

-/**
+/**
@class Subflow

@brief class to construct a subflow graph from the execution of a dynamic task

-By default, a subflow automatically @em joins its parent node.
-You may explicitly join or detach a subflow by calling tf::Subflow::join
+tf::Subflow is a derived class from tf::Runtime with a specialized mechanism
+to manage the execution of a child graph.
+By default, a subflow automatically @em joins its parent node.
+You may explicitly join or detach a subflow by calling tf::Subflow::join
or tf::Subflow::detach, respectively.
The following example creates a taskflow graph that spawns a subflow from the execution of task @c B, and the subflow contains three tasks, @c B1, @c B2, and @c B3, where @c B3 runs after @c B1 and @c B2. @code{.cpp} -// create three regular tasks +// create three static tasks tf::Task A = taskflow.emplace([](){}).name("A"); tf::Task C = taskflow.emplace([](){}).name("C"); tf::Task D = taskflow.emplace([](){}).name("D"); @@ -894,26 +1231,37 @@ tf::Task B = taskflow.emplace([] (tf::Subflow& subflow) { B1.precede(B3); B2.precede(B3); }).name("B"); - -A.precede(B); // B runs after A -A.precede(C); // C runs after A -B.precede(D); // D runs after B -C.precede(D); // D runs after C + +A.precede(B); // B runs after A +A.precede(C); // C runs after A +B.precede(D); // D runs after B +C.precede(D); // D runs after C @endcode -*/ -class Subflow : public FlowBuilder { +*/ +class Subflow : public FlowBuilder, + public Runtime { friend class Executor; friend class FlowBuilder; + friend class Runtime; public: - + /** @brief enables the subflow to join its parent task Performs an immediate action to join the subflow. Once the subflow is joined, it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.join(); // join the subflow of one task + }); + @endcode + + Only the worker that spawns this subflow can join it. */ void join(); @@ -922,80 +1270,83 @@ class Subflow : public FlowBuilder { Performs an immediate action to detach the subflow. Once the subflow is detached, it is considered finished and you may not modify the subflow anymore. + + @code{.cpp} + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + sf.detach(); + }); + @endcode + + Only the worker that spawns this subflow can detach it. */ void detach(); - - /** - @brief queries if the subflow is joinable - When a subflow is joined or detached, it becomes not joinable. - */ - bool joinable() const; + /** + @brief resets the subflow to a joinable state - /** - @brief runs a given function asynchronously + @param clear_graph specifies whether to clear the associated graph (default @c true) - @tparam F callable type - @tparam ArgsT parameter types + Clears the underlying task graph depending on the + given variable @c clear_graph (default @c true) and then + updates the subflow to a joinable state. + */ + void reset(bool clear_graph = true); - @param f callable object to call - @param args parameters to pass to the callable - - @return a tf::Future that will holds the result of the execution + /** + @brief queries if the subflow is joinable - This method is thread-safe and can be called by multiple tasks in the - subflow at the same time. - The difference to tf::Executor::async is that the created asynchronous task - pertains to the subflow. - When the subflow joins, all asynchronous tasks created from the subflow - are guaranteed to finish before the join. - For example: + This member function queries if the subflow is joinable. + When a subflow is joined or detached, it becomes not joinable. @code{.cpp} - std::atomic counter(0); - taskflow.empalce([&](tf::Subflow& sf){ - for(int i=0; i<100; i++) { - sf.async([&](){ counter++; }); - } + taskflow.emplace([](tf::Subflow& sf){ + sf.emplace([](){}); + std::cout << sf.joinable() << '\n'; // true sf.join(); - assert(counter == 100); + std::cout << sf.joinable() << '\n'; // false }); @endcode - - You cannot create asynchronous tasks from a detached subflow. - Doing this results in undefined behavior. 
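+ (For illustration, a sketch of the join guarantee: after tf::Subflow::join
+ returns, every task spawned in the subflow has finished. Assumes an
+ executor and taskflow as above.)
+
+ @code{.cpp}
+ std::atomic<int> counter{0};
+ taskflow.emplace([&](tf::Subflow& sf){
+   sf.emplace([&](){ counter++; });
+   sf.emplace([&](){ counter++; });
+   sf.join();             // returns only after both subflow tasks ran
+   assert(counter == 2);  // guaranteed by the join
+ });
+ executor.run(taskflow).wait();
+ @endcode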
*/ - template - auto async(F&& f, ArgsT&&... args); - - /** - @brief similar to tf::Subflow::async but did not return a future object - */ - template - void silent_async(F&& f, ArgsT&&... args); + bool joinable() const noexcept; private: - - Subflow(Executor&, Node*, Graph&); - - Executor& _executor; - Node* _parent; bool _joinable {true}; + + Subflow(Executor&, Worker&, Node*, Graph&); }; // Constructor -inline Subflow::Subflow(Executor& executor, Node* parent, Graph& graph) : +inline Subflow::Subflow( + Executor& executor, Worker& worker, Node* parent, Graph& graph +) : FlowBuilder {graph}, - _executor {executor}, - _parent {parent} { + Runtime {executor, worker, parent} { + // assert(_parent != nullptr); } // Function: joined -inline bool Subflow::joinable() const { +inline bool Subflow::joinable() const noexcept { return _joinable; } +// Procedure: reset +inline void Subflow::reset(bool clear_graph) { + if(clear_graph) { + _graph._clear(); + } + _joinable = true; +} + } // end of namespace tf. --------------------------------------------------- + + + + + + + + diff --git a/lib/taskflow/core/graph.hpp b/lib/taskflow/core/graph.hpp index 06c6dd3..475422d 100644 --- a/lib/taskflow/core/graph.hpp +++ b/lib/taskflow/core/graph.hpp @@ -1,74 +1,495 @@ #pragma once +#include "../utility/traits.hpp" #include "../utility/iterator.hpp" #include "../utility/object_pool.hpp" -#include "../utility/traits.hpp" -#include "../utility/singleton.hpp" #include "../utility/os.hpp" #include "../utility/math.hpp" +#include "../utility/small_vector.hpp" #include "../utility/serializer.hpp" #include "error.hpp" #include "declarations.hpp" #include "semaphore.hpp" #include "environment.hpp" #include "topology.hpp" +#include "tsq.hpp" + +/** +@file graph.hpp +@brief graph include file +*/ namespace tf { // ---------------------------------------------------------------------------- -// Class: CustomGraphBase +// Class: Graph // ---------------------------------------------------------------------------- -class CustomGraphBase { - public: - - virtual void dump(std::ostream&, const void*, const std::string&) const = 0; - virtual ~CustomGraphBase() = default; -}; +/** +@class Graph -// ---------------------------------------------------------------------------- -// Class: Graph -// ---------------------------------------------------------------------------- +@brief class to create a graph object + +A graph is the ultimate storage for a task dependency graph and is the main +gateway to interact with an executor. +A graph manages a set of nodes in a global object pool that animates and +recycles node objects efficiently without going through repetitive and +expensive memory allocations and deallocations. +This class is mainly used for creating an opaque graph object in a custom +class to interact with the executor through taskflow composition. + +A graph object is move-only. 
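+(For illustration, the common way to exercise this is taskflow composition:
+any object exposing a `tf::Graph& graph()` member, such as tf::Taskflow
+itself, can be composed into another taskflow.)
+
+@code{.cpp}
+tf::Taskflow f1, f2;
+f1.emplace([](){ std::cout << "f1\n"; });
+tf::Task module_task = f2.composed_of(f1);  // f1 provides graph()
+f2.emplace([](){ std::cout << "after f1\n"; }).succeed(module_task);
+executor.run(f2).wait();
+@endcode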
+*/
 class Graph {

 friend class Node;
+ friend class FlowBuilder;
+ friend class Subflow;
 friend class Taskflow;
 friend class Executor;

 public:

+ /**
+ @brief constructs a graph object
+ */
 Graph() = default;
+
+ /**
+ @brief disabled copy constructor
+ */
 Graph(const Graph&) = delete;
+
+ /**
+ @brief constructs a graph using move semantics
+ */
 Graph(Graph&&);

+ /**
+ @brief destructs the graph object
+ */
 ~Graph();

+ /**
+ @brief disabled copy assignment operator
+ */
 Graph& operator = (const Graph&) = delete;
+
+ /**
+ @brief assigns a graph using move semantics
+ */
 Graph& operator = (Graph&&);
-
- void clear();
- void clear_detached();
- void merge(Graph&&);

+ /**
+ @brief queries if the graph is empty
+ */
 bool empty() const;

+ /**
+ @brief queries the number of nodes in the graph
+ */
 size_t size() const;
-
- template <typename ...Args>
- Node* emplace_back(Args&& ...);
- Node* emplace_back();

+ /**
+ @brief clears the graph
+ */
+ void clear();

 private:

 std::vector<Node*> _nodes;
+
+ void _clear();
+ void _clear_detached();
+ void _merge(Graph&&);
+ void _erase(Node*);
+
+ /**
+ @private
+ */
+ template <typename ...ArgsT>
+ Node* _emplace_back(ArgsT&&...);
 };

// ----------------------------------------------------------------------------

+/**
+@class Runtime
+
+@brief class to include a runtime object in a task
+
+A runtime object allows users to interact with the
+scheduling runtime inside a task, such as scheduling an active task,
+spawning a subflow, and so on.
+
+@code{.cpp}
+tf::Task A, B, C, D;
+std::tie(A, B, C, D) = taskflow.emplace(
+  [] () { return 0; },
+  [&C] (tf::Runtime& rt) {  // C must be captured by reference
+    std::cout << "B\n";
+    rt.schedule(C);
+  },
+  [] () { std::cout << "C\n"; },
+  [] () { std::cout << "D\n"; }
+);
+A.precede(B, C, D);
+executor.run(taskflow).wait();
+@endcode
+
+A runtime object is associated with the worker and the executor
+that runs the task.
+
+*/
+class Runtime {
+
+  friend class Executor;
+  friend class FlowBuilder;
+
+  public:
+
+  /**
+  @brief obtains the running executor
+
+  The running executor of a runtime task is the executor that runs
+  the parent taskflow of that runtime task.
+
+  @code{.cpp}
+  tf::Executor executor;
+  tf::Taskflow taskflow;
+  taskflow.emplace([&](tf::Runtime& rt){
+    assert(&(rt.executor()) == &executor);
+  });
+  executor.run(taskflow).wait();
+  @endcode
+  */
+  Executor& executor();
+
+  /**
+  @brief schedules an active task immediately to the worker's queue
+
+  @param task the given active task to schedule immediately
+
+  This member function immediately schedules an active task to the
+  task queue of the associated worker in the runtime task.
+  An active task is a task in a running taskflow.
+  The task may or may not be running, and scheduling that task
+  will immediately put the task into the task queue of the worker
+  that is running the runtime task.
+  Consider the following example:
+
+  @code{.cpp}
+  tf::Task A, B, C, D;
+  std::tie(A, B, C, D) = taskflow.emplace(
+    [] () { return 0; },
+    [&C] (tf::Runtime& rt) {  // C must be captured by reference
+      std::cout << "B\n";
+      rt.schedule(C);
+    },
+    [] () { std::cout << "C\n"; },
+    [] () { std::cout << "D\n"; }
+  );
+  A.precede(B, C, D);
+  executor.run(taskflow).wait();
+  @endcode
+
+  The executor will first run the condition task @c A which returns @c 0
+  to inform the scheduler to go to the runtime task @c B.
+  During the execution of @c B, it directly schedules task @c C without
+  going through the normal taskflow graph scheduling process.
+ At this moment, task @c C is active because its parent taskflow is running.
+ When the taskflow finishes, we will see both @c B and @c C in the output.
+ */
+ void schedule(Task task);
+
+ /**
+ @brief runs the given callable asynchronously
+
+ @tparam F callable type
+ @param f callable object
+
+ The method creates an asynchronous task to launch the given function.
+ The difference to tf::Executor::async is that the created asynchronous task
+ pertains to the runtime.
+ When the runtime joins, all asynchronous tasks created from the runtime
+ are guaranteed to finish before the join returns.
+ For example:
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   auto fu1 = rt.async([&](){ counter++; });
+   auto fu2 = rt.async([&](){ counter++; });
+   fu1.get();
+   fu2.get();
+   assert(counter == 2);
+
+   // spawn 100 asynchronous tasks from the worker of the runtime
+   for(int i=0; i<100; i++) {
+     rt.async([&](){ counter++; });
+   }
+
+   // explicitly join the 100 asynchronous tasks
+   rt.join();
+   assert(counter == 102);
+ });
+ @endcode
+
+ This method is thread-safe and can be called by multiple workers
+ that hold the reference to the runtime.
+ For example, the code below spawns 100 tasks from the worker of
+ a runtime, and each of the 100 tasks spawns another task
+ that will be run by another worker.
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   // worker of the runtime spawns 100 tasks each spawning another task
+   // that will be run by another worker
+   for(int i=0; i<100; i++) {
+     rt.async([&](){
+       counter++;
+       rt.async([&](){ counter++; });
+     });
+   }
+
+   // explicitly join the 200 asynchronous tasks
+   rt.join();
+   assert(counter == 200);
+ });
+ @endcode
+ */
+ template <typename F>
+ auto async(F&& f);
+
+ /**
+ @brief similar to tf::Runtime::async but assigns the task a name
+
+ @tparam F callable type
+
+ @param name assigned name to the task
+ @param f callable
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   auto future = rt.async("my task", [](){});
+   future.get();
+ });
+ @endcode
+ */
+ template <typename F>
+ auto async(const std::string& name, F&& f);
+
+ /**
+ @brief runs the given function asynchronously without returning any future object
+
+ @tparam F callable type
+ @param f callable
+
+ This member function is more efficient than tf::Runtime::async
+ and is the preferred choice when there is no data to return.
+
+ @code{.cpp}
+ std::atomic<int> counter(0);
+ taskflow.emplace([&](tf::Runtime& rt){
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 100);
+ });
+ @endcode
+
+ This member function is thread-safe.
+ */
+ template <typename F>
+ void silent_async(F&& f);
+
+ /**
+ @brief similar to tf::Runtime::silent_async but assigns the task a name
+
+ @tparam F callable type
+ @param name assigned name to the task
+ @param f callable
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   rt.silent_async("my task", [](){});
+   rt.join();
+ });
+ @endcode
+ */
+ template <typename F>
+ void silent_async(const std::string& name, F&& f);
+
+ /**
+ @brief similar to tf::Runtime::silent_async but the caller must be the worker of the runtime
+
+ @tparam F callable type
+
+ @param name assigned name to the task
+ @param f callable
+
+ The method bypasses the caller-worker check of the executor
+ and thus can only be called by the worker of this runtime.
+
+ @code{.cpp}
+ taskflow.emplace([&](tf::Runtime& rt){
+   // running by the worker of this runtime
+   rt.silent_async_unchecked("my task", [](){});
+   rt.join();
+ });
+ @endcode
+ */
+ template <typename F>
+ void silent_async_unchecked(const std::string& name, F&& f);
+
+ /**
+ @brief co-runs the given target and waits until it completes
+
+ A target can be one of the following forms:
+ + a dynamic task to spawn a subflow or
+ + a composable graph object with `tf::Graph& T::graph()` defined
+
+ @code{.cpp}
+ // co-run a subflow and wait until all tasks complete
+ taskflow.emplace([](tf::Runtime& rt){
+   rt.corun([](tf::Subflow& sf){
+     tf::Task A = sf.emplace([](){});
+     tf::Task B = sf.emplace([](){});
+   });
+ });
+
+ // co-run a taskflow and wait until all tasks complete
+ tf::Taskflow taskflow1, taskflow2;
+ taskflow1.emplace([](){ std::cout << "running taskflow1\n"; });
+ taskflow2.emplace([&](tf::Runtime& rt){
+   std::cout << "running taskflow2\n";
+   rt.corun(taskflow1);
+ });
+ executor.run(taskflow2).wait();
+ @endcode
+
+ Although tf::Runtime::corun blocks until the operation completes,
+ the caller thread (worker) is not blocked (e.g., sleeping or holding any lock).
+ Instead, the caller thread joins the work-stealing loop of the executor
+ and returns when all tasks in the target complete.
+ */
+ template <typename T>
+ void corun(T&& target);
+
+ /**
+ @brief keeps running the work-stealing loop until the predicate becomes true
+
+ @tparam P predicate type
+ @param predicate a boolean predicate to indicate when to stop the loop
+
+ The method keeps the caller worker running in the work-stealing loop
+ until the stop predicate becomes true.
+ */
+ template <typename P>
+ void corun_until(P&& predicate);
+
+ /**
+ @brief joins all asynchronous tasks spawned by this runtime
+
+ Immediately joins all asynchronous tasks (tf::Runtime::async,
+ tf::Runtime::silent_async).
+ Unlike tf::Subflow::join, you can join multiple times from
+ a tf::Runtime object.
+
+ @code{.cpp}
+ std::atomic<int> counter{0};
+ taskflow.emplace([&](tf::Runtime& rt){
+   // spawn 100 async tasks and join
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 100);
+
+   // spawn another 100 async tasks and join
+   for(int i=0; i<100; i++) {
+     rt.silent_async([&](){ counter++; });
+   }
+   rt.join();
+   assert(counter == 200);
+ });
+ @endcode
+
+ @attention
+ Only the worker of this tf::Runtime can issue join.
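+ (For illustration, a corun_until sketch: the worker keeps stealing and
+ running tasks until the flag flips; assumes another task eventually sets
+ it, or the loop would not terminate.)
+
+ @code{.cpp}
+ std::atomic<bool> done{false};
+ taskflow.emplace([&](tf::Runtime& rt){
+   rt.corun_until([&](){ return done.load(std::memory_order_acquire); });
+ });
+ taskflow.emplace([&](){ done.store(true, std::memory_order_release); });
+ executor.run(taskflow).wait();
+ @endcode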
+ */
+ inline void join();
+
+ /**
+ @brief acquires a reference to the underlying worker
+ */
+ inline Worker& worker();
+
+ protected:
+
+ /**
+ @private
+ */
+ explicit Runtime(Executor&, Worker&, Node*);
+
+ /**
+ @private
+ */
+ Executor& _executor;
+
+ /**
+ @private
+ */
+ Worker& _worker;
+
+ /**
+ @private
+ */
+ Node* _parent;
+
+ /**
+ @private
+ */
+ template <typename F>
+ auto _async(Worker& w, const std::string& name, F&& f);
+
+ /**
+ @private
+ */
+ template <typename F>
+ void _silent_async(Worker& w, const std::string& name, F&& f);
+};
+
+// constructor
+inline Runtime::Runtime(Executor& e, Worker& w, Node* p) :
+  _executor{e},
+  _worker {w},
+  _parent {p}{
+}
+
+// Function: executor
+inline Executor& Runtime::executor() {
+  return _executor;
+}
+
+// Function: worker
+inline Worker& Runtime::worker() {
+  return _worker;
+}
+
+// ----------------------------------------------------------------------------
+// Node
+// ----------------------------------------------------------------------------
+
+/**
+@private
+*/
+class Node {

 friend class Graph;
 friend class Task;
 friend class TaskView;
 friend class Taskflow;
 friend class Executor;
 friend class FlowBuilder;
 friend class Subflow;
+ friend class Runtime;
+
+ enum class AsyncState : int {
+   UNFINISHED = 0,
+   LOCKED = 1,
+   FINISHED = 2
+ };

 TF_ENABLE_POOLABLE_ON_THIS;

 // state bit flag
- constexpr static int BRANCHED = 0x1;
- constexpr static int DETACHED = 0x2;
- constexpr static int ACQUIRED = 0x4;
+ constexpr static int CONDITIONED = 1;
+ constexpr static int DETACHED = 2;
+ constexpr static int ACQUIRED = 4;
+ constexpr static int READY = 8;
+
+ using Placeholder = std::monostate;

 // static work handle
 struct Static {

-   template <typename C>
+   template <typename C>
    Static(C&&);

-   std::function<void()> work;
+   std::variant<
+     std::function<void()>, std::function<void(Runtime&)>
+   > work;
 };

 // dynamic work handle
 struct Dynamic {

-   template <typename C>
+   template <typename C>
    Dynamic(C&&);

    std::function<void(Subflow&)> work;
    Graph subgraph;
 };

 // condition work handle
 struct Condition {

-   template <typename C>
+   template <typename C>
    Condition(C&&);
+
+   std::variant<
+     std::function<int()>, std::function<int(Runtime&)>
+   > work;
+ };

-   std::function<int()> work;
+ // multi-condition work handle
+ struct MultiCondition {
+
+   template <typename C>
+   MultiCondition(C&&);
+
+   std::variant<
+     std::function<SmallVector<int>()>, std::function<SmallVector<int>(Runtime&)>
+   > work;
 };

 // module work handle
 struct Module {

    template <typename T>
-   Module(T&&);
+   Module(T&);

-   Taskflow* module {nullptr};
+   Graph& graph;
 };

 // Async work
 struct Async {

    template <typename T>
-   Async(T&&, std::shared_ptr<AsyncTopology>);
-
-   std::function<void(bool)> work;
+   Async(T&&);

-   std::shared_ptr<AsyncTopology> topology;
+   std::function<void()> work;
 };

- // Silent async work
- struct SilentAsync {
+ // silent dependent async
+ struct DependentAsync {

    template <typename C>
-   SilentAsync(C&&);
-
+   DependentAsync(C&&);
+
    std::function<void()> work;
- };
-
- // cudaFlow work handle
- struct cudaFlow {
-
-   template <typename C, typename G>
-   cudaFlow(C&& c, G&& g);
-
-   std::function<void(Executor&, Node*)> work;
-
-   std::unique_ptr<CustomGraphBase> graph;
+   std::atomic<AsyncState> state {AsyncState::UNFINISHED};
 };

 using handle_t = std::variant<
-   std::monostate, // placeholder
-   Static,         // static tasking
-   Dynamic,        // dynamic tasking
-   Condition,      // conditional tasking
-   Module,         // composable tasking
-   Async,          // async tasking
-   SilentAsync,    // async tasking (no future)
-   cudaFlow        // cudaFlow
+   Placeholder,    // placeholder
+   Static,         // static tasking
+   Dynamic,        // dynamic tasking
+   Condition,      // conditional tasking
+   MultiCondition, // multi-conditional tasking
+   Module,         // composable tasking
+   Async,          // async tasking
+   DependentAsync  // dependent async tasking (no future)
 >;

- struct Semaphores {
to_acquire; - std::vector to_release; + + struct Semaphores { + SmallVector to_acquire; + SmallVector to_release; }; public: - + // variant index - constexpr static auto PLACEHOLDER = get_index_v; - constexpr static auto STATIC = get_index_v; - constexpr static auto DYNAMIC = get_index_v; - constexpr static auto CONDITION = get_index_v; - constexpr static auto MODULE = get_index_v; - constexpr static auto ASYNC = get_index_v; - constexpr static auto SILENT_ASYNC = get_index_v; - constexpr static auto CUDAFLOW = get_index_v; + constexpr static auto PLACEHOLDER = get_index_v; + constexpr static auto STATIC = get_index_v; + constexpr static auto DYNAMIC = get_index_v; + constexpr static auto CONDITION = get_index_v; + constexpr static auto MULTI_CONDITION = get_index_v; + constexpr static auto MODULE = get_index_v; + constexpr static auto ASYNC = get_index_v; + constexpr static auto DEPENDENT_ASYNC = get_index_v; - template - Node(Args&&... args); + Node() = default; - ~Node(); + template + Node(const std::string&, unsigned, Topology*, Node*, size_t, Args&&... args); - size_t num_successors() const; - size_t num_dependents() const; - size_t num_strong_dependents() const; - size_t num_weak_dependents() const; + ~Node(); - const std::string& name() const; + size_t num_successors() const; + size_t num_dependents() const; + size_t num_strong_dependents() const; + size_t num_weak_dependents() const; - private: + const std::string& name() const; - std::string _name; + private: - handle_t _handle; + std::string _name; + + unsigned _priority {0}; + + Topology* _topology {nullptr}; + Node* _parent {nullptr}; - std::vector _successors; - std::vector _dependents; + void* _data {nullptr}; - //std::optional _semaphores; - std::unique_ptr _semaphores; + SmallVector _successors; + SmallVector _dependents; - Topology* _topology {nullptr}; - - Node* _parent {nullptr}; + std::atomic _state {0}; + std::atomic _join_counter {0}; - int _state {0}; + std::unique_ptr _semaphores; + + handle_t _handle; - std::atomic _join_counter {0}; - - void _precede(Node*); - void _set_state(int); - void _unset_state(int); - void _clear_state(); - void _set_up_join_counter(); + void _precede(Node*); + void _set_up_join_counter(); - bool _has_state(int) const; - bool _is_cancelled() const; - bool _acquire_all(std::vector&); + bool _is_cancelled() const; + bool _is_conditioner() const; + bool _acquire_all(SmallVector&); - std::vector _release_all(); + SmallVector _release_all(); }; // ---------------------------------------------------------------------------- // Node Object Pool // ---------------------------------------------------------------------------- + +/** +@private +*/ inline ObjectPool node_pool; // ---------------------------------------------------------------------------- // Definition for Node::Static // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Static::Static(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- // Definition for Node::Dynamic // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Dynamic::Dynamic(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- // Definition for Node::Condition // ---------------------------------------------------------------------------- - + // Constructor -template +template Node::Condition::Condition(C&& 
c) : work {std::forward(c)} { -} +} // ---------------------------------------------------------------------------- -// Definition for Node::cudaFlow +// Definition for Node::MultiCondition // ---------------------------------------------------------------------------- -template -Node::cudaFlow::cudaFlow(C&& c, G&& g) : - work {std::forward(c)}, - graph {std::forward(g)} { +// Constructor +template +Node::MultiCondition::MultiCondition(C&& c) : work {std::forward(c)} { } - + // ---------------------------------------------------------------------------- // Definition for Node::Module // ---------------------------------------------------------------------------- - + // Constructor template -Node::Module::Module(T&& tf) : module {tf} { +inline Node::Module::Module(T& obj) : graph{ obj.graph() } { } // ---------------------------------------------------------------------------- // Definition for Node::Async // ---------------------------------------------------------------------------- - + // Constructor template -Node::Async::Async(C&& c, std::shared_ptrtpg) : - work {std::forward(c)}, - topology {std::move(tpg)} { +Node::Async::Async(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- -// Definition for Node::SilentAsync +// Definition for Node::DependentAsync // ---------------------------------------------------------------------------- // Constructor template -Node::SilentAsync::SilentAsync(C&& c) : - work {std::forward(c)} { +Node::DependentAsync::DependentAsync(C&& c) : work {std::forward(c)} { } // ---------------------------------------------------------------------------- @@ -303,18 +738,37 @@ Node::SilentAsync::SilentAsync(C&& c) : // Constructor template -Node::Node(Args&&... args): _handle{std::forward(args)...} { -} +Node::Node( + const std::string& name, + unsigned priority, + Topology* topology, + Node* parent, + size_t join_counter, + Args&&... args +) : + _name {name}, + _priority {priority}, + _topology {topology}, + _parent {parent}, + _join_counter {join_counter}, + _handle {std::forward(args)...} { +} + +//Node::Node(Args&&... 
args): _handle{std::forward(args)...} { +//} // Destructor inline Node::~Node() { // this is to avoid stack overflow if(_handle.index() == DYNAMIC) { - - auto& subgraph = std::get(_handle).subgraph; - + // using std::get_if instead of std::get makes this compatible + // with older macOS versions + // the result of std::get_if is guaranteed to be non-null + // due to the index check above + auto& subgraph = std::get_if(&_handle)->subgraph; std::vector nodes; + nodes.reserve(subgraph.size()); std::move( subgraph._nodes.begin(), subgraph._nodes.end(), std::back_inserter(nodes) @@ -326,8 +780,7 @@ inline Node::~Node() { while(i < nodes.size()) { if(nodes[i]->_handle.index() == DYNAMIC) { - - auto& sbg = std::get(nodes[i]->_handle).subgraph; + auto& sbg = std::get_if(&(nodes[i]->_handle))->subgraph; std::move( sbg._nodes.begin(), sbg._nodes.end(), std::back_inserter(nodes) ); @@ -336,7 +789,7 @@ inline Node::~Node() { ++i; } - + //auto& np = Graph::_node_pool(); for(i=0; i_handle.index() == Node::CONDITION) { + //if(_dependents[i]->_handle.index() == Node::CONDITION) { + if(_dependents[i]->_is_conditioner()) { n++; } } @@ -375,7 +829,8 @@ inline size_t Node::num_weak_dependents() const { inline size_t Node::num_strong_dependents() const { size_t n = 0; for(size_t i=0; i<_dependents.size(); i++) { - if(_dependents[i]->_handle.index() != Node::CONDITION) { + //if(_dependents[i]->_handle.index() != Node::CONDITION) { + if(!_dependents[i]->_is_conditioner()) { n++; } } @@ -387,58 +842,35 @@ inline const std::string& Node::name() const { return _name; } -// Procedure: _set_state -inline void Node::_set_state(int flag) { - _state |= flag; -} - -// Procedure: _unset_state -inline void Node::_unset_state(int flag) { - _state &= ~flag; -} - -// Procedure: _clear_state -inline void Node::_clear_state() { - _state = 0; -} - -// Function: _has_state -inline bool Node::_has_state(int flag) const { - return _state & flag; +// Function: _is_conditioner +inline bool Node::_is_conditioner() const { + return _handle.index() == Node::CONDITION || + _handle.index() == Node::MULTI_CONDITION; } // Function: _is_cancelled inline bool Node::_is_cancelled() const { - if(_handle.index() == Node::ASYNC) { - auto& h = std::get(_handle); - if(h.topology && h.topology->_is_cancelled) { - return true; - } - } - // async tasks spawned from subflow does not have topology - return _topology && _topology->_is_cancelled; + return _topology && _topology->_is_cancelled.load(std::memory_order_relaxed); } // Procedure: _set_up_join_counter inline void Node::_set_up_join_counter() { - size_t c = 0; - for(auto p : _dependents) { - if(p->_handle.index() == Node::CONDITION) { - _set_state(Node::BRANCHED); + //if(p->_handle.index() == Node::CONDITION) { + if(p->_is_conditioner()) { + _state.fetch_or(Node::CONDITIONED, std::memory_order_relaxed); } else { c++; } } - - _join_counter.store(c, std::memory_order_relaxed); + _join_counter.store(c, std::memory_order_release); } // Function: _acquire_all -inline bool Node::_acquire_all(std::vector& nodes) { +inline bool Node::_acquire_all(SmallVector& nodes) { auto& to_acquire = _semaphores->to_acquire; @@ -446,7 +878,7 @@ inline bool Node::_acquire_all(std::vector& nodes) { if(!to_acquire[i]->_try_acquire_or_wait(this)) { for(size_t j = 1; j <= i; ++j) { auto r = to_acquire[i-j]->_release(); - nodes.insert(end(nodes), begin(r), end(r)); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); } return false; } @@ -455,67 +887,73 @@ inline bool Node::_acquire_all(std::vector& nodes) { } // 
Function: _release_all -inline std::vector Node::_release_all() { +inline SmallVector Node::_release_all() { auto& to_release = _semaphores->to_release; - std::vector nodes; + SmallVector nodes; for(const auto& sem : to_release) { auto r = sem->_release(); - nodes.insert(end(nodes), begin(r), end(r)); + nodes.insert(std::end(nodes), std::begin(r), std::end(r)); } + return nodes; } +// ---------------------------------------------------------------------------- +// Node Deleter +// ---------------------------------------------------------------------------- + +/** +@private +*/ +struct NodeDeleter { + void operator ()(Node* ptr) { + node_pool.recycle(ptr); + } +}; + // ---------------------------------------------------------------------------- // Graph definition // ---------------------------------------------------------------------------- - -//// Function: _node_pool -//inline ObjectPool& Graph::_node_pool() { -// static ObjectPool pool; -// return pool; -//} // Destructor inline Graph::~Graph() { - //auto& np = _node_pool(); - for(auto node : _nodes) { - //np.recycle(node); - node_pool.recycle(node); - } + _clear(); } // Move constructor -inline Graph::Graph(Graph&& other) : +inline Graph::Graph(Graph&& other) : _nodes {std::move(other._nodes)} { } // Move assignment inline Graph& Graph::operator = (Graph&& other) { + _clear(); _nodes = std::move(other._nodes); return *this; } // Procedure: clear inline void Graph::clear() { - //auto& np = _node_pool(); + _clear(); +} + +// Procedure: clear +inline void Graph::_clear() { for(auto node : _nodes) { - //node->~Node(); - //np.deallocate(node); node_pool.recycle(node); } _nodes.clear(); } // Procedure: clear_detached -inline void Graph::clear_detached() { +inline void Graph::_clear_detached() { auto mid = std::partition(_nodes.begin(), _nodes.end(), [] (Node* node) { - return !(node->_has_state(Node::DETACHED)); + return !(node->_state.load(std::memory_order_relaxed) & Node::DETACHED); }); - - //auto& np = _node_pool(); + for(auto itr = mid; itr != _nodes.end(); ++itr) { node_pool.recycle(*itr); } @@ -523,50 +961,38 @@ inline void Graph::clear_detached() { } // Procedure: merge -inline void Graph::merge(Graph&& g) { +inline void Graph::_merge(Graph&& g) { for(auto n : g._nodes) { _nodes.push_back(n); } g._nodes.clear(); } +// Function: erase +inline void Graph::_erase(Node* node) { + if(auto I = std::find(_nodes.begin(), _nodes.end(), node); I != _nodes.end()) { + _nodes.erase(I); + node_pool.recycle(node); + } +} + // Function: size -// query the size inline size_t Graph::size() const { return _nodes.size(); } // Function: empty -// query the emptiness inline bool Graph::empty() const { return _nodes.empty(); } - -// Function: emplace_back -// create a node from a give argument; constructor is called if necessary + +/** +@private +*/ template -Node* Graph::emplace_back(ArgsT&&... args) { - //auto node = _node_pool().allocate(); - //new (node) Node(std::forward(args)...); - //_nodes.push_back(node); +Node* Graph::_emplace_back(ArgsT&&... args) { _nodes.push_back(node_pool.animate(std::forward(args)...)); return _nodes.back(); } -// Function: emplace_back -// create a node from a give argument; constructor is called if necessary -inline Node* Graph::emplace_back() { - //auto node = _node_pool().allocate(); - //new (node) Node(); - //_nodes.push_back(node); - _nodes.push_back(node_pool.animate()); - return _nodes.back(); -} - - } // end of namespace tf. 
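
As an illustrative sketch (assuming the private node_pool and the
default-constructible Node shown above), the NodeDeleter functor lets
internal code tie pooled nodes to RAII handles:

@code{.cpp}
// hypothetical internal usage: the node is recycled back to node_pool
// automatically when the guard goes out of scope
std::unique_ptr<tf::Node, tf::NodeDeleter> guard(tf::node_pool.animate());
@endcode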
--------------------------------------------------- - - - - - diff --git a/lib/taskflow/core/notifier.hpp b/lib/taskflow/core/notifier.hpp index a82f8a5..39bcf64 100644 --- a/lib/taskflow/core/notifier.hpp +++ b/lib/taskflow/core/notifier.hpp @@ -67,7 +67,7 @@ class Notifier { friend class Executor; public: - + struct Waiter { std::atomic next; std::mutex mu; @@ -199,7 +199,7 @@ class Notifier { } } } - + // notify n workers void notify_n(size_t n) { if(n >= _waiters.size()) { diff --git a/lib/taskflow/core/observer.hpp b/lib/taskflow/core/observer.hpp index 4ca0166..3c1873e 100644 --- a/lib/taskflow/core/observer.hpp +++ b/lib/taskflow/core/observer.hpp @@ -114,9 +114,9 @@ struct ProfileData { /** @class: ObserverInterface -@brief The interface class for creating an executor observer. +@brief class to derive an executor observer -The tf::ObserverInterface class let users define custom methods to monitor +The tf::ObserverInterface class allows users to define custom methods to monitor the behaviors of an executor. This is particularly useful when you want to inspect the performance of an executor and visualize when each thread participates in the execution of a task. @@ -168,8 +168,6 @@ executor.run(taskflow).wait(); */ class ObserverInterface { - friend class Executor; - public: /** @@ -185,17 +183,17 @@ class ObserverInterface { /** @brief method to call before a worker thread executes a closure - @param w an immutable view of this worker thread + @param wv an immutable view of this worker thread @param task_view a constant wrapper object to the task */ - virtual void on_entry(WorkerView w, TaskView task_view) = 0; + virtual void on_entry(WorkerView wv, TaskView task_view) = 0; /** @brief method to call after a worker thread executed a closure - @param w an immutable view of this worker thread + @param wv an immutable view of this worker thread @param task_view a constant wrapper object to the task */ - virtual void on_exit(WorkerView w, TaskView task_view) = 0; + virtual void on_exit(WorkerView wv, TaskView task_view) = 0; }; // ---------------------------------------------------------------------------- @@ -205,7 +203,7 @@ class ObserverInterface { /** @class: ChromeObserver -@brief observer interface based on Chrome tracing format +@brief class to create an observer based on Chrome tracing format A tf::ChromeObserver inherits tf::ObserverInterface and defines methods to dump the observed thread activities into a format that can be visualized through @@ -338,6 +336,8 @@ inline void ChromeObserver::clear() { // Procedure: dump inline void ChromeObserver::dump(std::ostream& os) const { + using namespace std::chrono; + size_t first; for(first = 0; first<_timeline.segments.size(); ++first) { @@ -356,8 +356,7 @@ inline void ChromeObserver::dump(std::ostream& os) const { for(size_t i=0; i<_timeline.segments[w].size(); i++) { - os << '{' - << "\"cat\":\"ChromeObserver\","; + os << '{'<< "\"cat\":\"ChromeObserver\","; // name field os << "\"name\":\""; @@ -373,10 +372,10 @@ inline void ChromeObserver::dump(std::ostream& os) const { os << "\"ph\":\"X\"," << "\"pid\":1," << "\"tid\":" << w << ',' - << "\"ts\":" << std::chrono::duration_cast( + << "\"ts\":" << duration_cast( _timeline.segments[w][i].beg - _timeline.origin ).count() << ',' - << "\"dur\":" << std::chrono::duration_cast( + << "\"dur\":" << duration_cast( _timeline.segments[w][i].end - _timeline.segments[w][i].beg ).count(); @@ -415,7 +414,7 @@ inline size_t ChromeObserver::num_tasks() const { /** @class TFProfObserver -@brief 
observer interface based on the built-in taskflow profiler format +@brief class to create an observer based on the built-in taskflow profiler format A tf::TFProfObserver inherits tf::ObserverInterface and defines methods to dump the observed thread activities into a format that can be visualized through @@ -438,17 +437,48 @@ executor.run(taskflow).wait(); observer->dump(std::cout); @endcode -We recommend using our @TFProf python script to observe thread activities -instead of the raw function call. -The script will turn on environment variables needed for observing all executors -in a taskflow program and dump the result to a valid, clean JSON file -compatible with the format of @TFProf. */ class TFProfObserver : public ObserverInterface { friend class Executor; friend class TFProfManager; + + /** @private overall task summary */ + struct TaskSummary { + size_t count {0}; + size_t total_span {0}; + size_t min_span; + size_t max_span; + + float avg_span() const { return total_span * 1.0f / count; } + }; + + /** @private worker summary at a level */ + struct WorkerSummary { + + size_t id; + size_t level; + size_t count {0}; + size_t total_span {0}; + size_t min_span{0}; + size_t max_span{0}; + + std::array tsum; + + float avg_span() const { return total_span * 1.0f / count; } + //return count < 2 ? 0.0f : total_delay * 1.0f / (count-1); + }; + /** @private */ + struct Summary { + std::array tsum; + std::vector wsum; + + void dump_tsum(std::ostream&) const; + void dump_wsum(std::ostream&) const; + void dump(std::ostream&) const; + }; + public: /** @@ -462,6 +492,16 @@ class TFProfObserver : public ObserverInterface { */ std::string dump() const; + /** + @brief shows the summary report through an output stream + */ + void summary(std::ostream& ostream) const; + + /** + @brief returns the summary report in a string + */ + std::string summary() const; + /** @brief clears the timeline data */ @@ -471,6 +511,11 @@ class TFProfObserver : public ObserverInterface { @brief queries the number of tasks observed */ size_t num_tasks() const; + + /** + @brief queries the number of observed workers + */ + size_t num_workers() const; private: @@ -483,6 +528,155 @@ class TFProfObserver : public ObserverInterface { inline void on_exit(WorkerView, TaskView) override final; }; + +// dump the task summary +inline void TFProfObserver::Summary::dump_tsum(std::ostream& os) const { + + // task summary + size_t type_w{10}, count_w{5}, time_w{9}, avg_w{8}, min_w{8}, max_w{8}; + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + count_w = std::max(count_w, std::to_string(i.count).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + time_w = std::max(time_w, std::to_string(i.total_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + avg_w = std::max(time_w, std::to_string(i.avg_span()).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + min_w = std::max(min_w, std::to_string(i.min_span).size()); + }); + + std::for_each(tsum.begin(), tsum.end(), [&](const auto& i){ + if(i.count == 0) return; + max_w = std::max(max_w, std::to_string(i.max_span).size()); + }); + + os << std::setw(type_w) << "-Task-" + << std::setw(count_w+2) << "Count" + << std::setw(time_w+2) << "Time (us)" + << std::setw(avg_w+2) << "Avg (us)" + << std::setw(min_w+2) << "Min (us)" + << std::setw(max_w+2) << "Max (us)" + << '\n'; + + for(size_t 
i=0; i(); @@ -530,6 +724,8 @@ inline void TFProfObserver::clear() { // Procedure: dump inline void TFProfObserver::dump(std::ostream& os) const { + using namespace std::chrono; + size_t first; for(first = 0; first<_timeline.segments.size(); ++first) { @@ -571,12 +767,10 @@ inline void TFProfObserver::dump(std::ostream& os) const { // span os << "{\"span\":[" - << std::chrono::duration_cast( - s.beg - _timeline.origin - ).count() << "," - << std::chrono::duration_cast( - s.end - _timeline.origin - ).count() << "],"; + << duration_cast(s.beg - _timeline.origin).count() + << "," + << duration_cast(s.end - _timeline.origin).count() + << "],"; // name os << "\"name\":\""; @@ -588,7 +782,7 @@ inline void TFProfObserver::dump(std::ostream& os) const { } os << "\","; - // category "type": "Condition Task", + // e.g., category "type": "Condition Task" os << "\"type\":\"" << to_string(s.type) << "\""; os << "}"; @@ -607,15 +801,124 @@ inline std::string TFProfObserver::dump() const { return oss.str(); } +// Procedure: summary +inline void TFProfObserver::summary(std::ostream& os) const { + + using namespace std::chrono; + + Summary summary; + std::optional view_beg, view_end; + + // find the first non-empty worker + size_t first; + for(first = 0; first<_timeline.segments.size(); ++first) { + if(_timeline.segments[first].size() > 0) { + break; + } + } + + // not timeline data to dump + if(first == _timeline.segments.size()) { + goto end_of_summary; + } + + for(size_t w=first; w<_timeline.segments.size(); w++) { + for(size_t l=0; l<_timeline.segments[w].size(); l++) { + + if(_timeline.segments[w][l].empty()) { + continue; + } + + // worker w at level l + WorkerSummary ws; + ws.id = w; + ws.level = l; + ws.count = _timeline.segments[w][l].size(); + + // scan all tasks at level l + for(size_t i=0; i<_timeline.segments[w][l].size(); ++i) { + + // update the entire span + auto& s = _timeline.segments[w][l][i]; + view_beg = view_beg ? std::min(*view_beg, s.beg) : s.beg; + view_end = view_end ? std::max(*view_end, s.end) : s.end; + + // update the task summary + size_t t = duration_cast(s.end - s.beg).count(); + + auto& x = summary.tsum[static_cast(s.type)]; + x.count += 1; + x.total_span += t; + x.min_span = (x.count == 1) ? t : std::min(t, x.min_span); + x.max_span = (x.count == 1) ? t : std::max(t, x.max_span); + + // update the worker summary + ws.total_span += t; + ws.min_span = (i == 0) ? t : std::min(t, ws.min_span); + ws.max_span = (i == 0) ? t : std::max(t, ws.max_span); + + auto&y = ws.tsum[static_cast(s.type)]; + y.count += 1; + y.total_span += t; + y.min_span = (y.count == 1) ? t : std::min(t, y.min_span); + y.max_span = (y.count == 1) ? t : std::max(t, y.max_span); + + // update the delay + //if(i) { + // size_t d = duration_cast( + // s.beg - _timeline.segments[w][l][i-1].end + // ).count(); + // ws.total_delay += d; + // ws.min_delay = (i == 1) ? d : std::min(ws.min_delay, d); + // ws.max_delay = (i == 1) ? 
d : std::max(ws.max_delay, d); + //} + } + summary.wsum.push_back(ws); + } + } + + end_of_summary: + + size_t view = 0; + if(view_beg && view_end) { + view = duration_cast(*view_end - *view_beg).count(); + } + + os << "==Observer " << _timeline.uid << ": " + << num_workers() << " workers completed " + << num_tasks() << " tasks in " + << view << " us\n"; + + summary.dump(os); +} + +// Procedure: summary +inline std::string TFProfObserver::summary() const { + std::ostringstream oss; + summary(oss); + return oss.str(); +} + // Function: num_tasks inline size_t TFProfObserver::num_tasks() const { - return std::accumulate( - _timeline.segments.begin(), _timeline.segments.end(), size_t{0}, - [](size_t sum, const auto& exe){ - return sum + exe.size(); + size_t s = 0; + for(size_t w=0; w<_timeline.segments.size(); ++w) { + for(size_t l=0; l<_timeline.segments[w].size(); ++l) { + s += _timeline.segments[w][l].size(); } - ); + } + return s; } + +// Function: num_workers +inline size_t TFProfObserver::num_workers() const { + size_t w = 0; + for(size_t i=0; i<_timeline.segments.size(); ++i) { + w += (!_timeline.segments[i].empty()); + } + return w; +} + // ---------------------------------------------------------------------------- // TFProfManager @@ -682,11 +985,11 @@ inline TFProfManager::~TFProfManager() { for(size_t i=0; i<_observers.size(); ++i) { data.timelines.push_back(std::move(_observers[i]->_timeline)); } - Serializer serializer(ofs); + Serializer serializer(ofs); serializer(data); } // .json - else { + else { // if(_fpath.rfind(".json") != std::string::npos) { ofs << "[\n"; for(size_t i=0; i<_observers.size(); ++i) { if(i) ofs << ','; @@ -695,6 +998,14 @@ inline TFProfManager::~TFProfManager() { ofs << "]\n"; } } + // do a summary report in stderr for each observer + else { + std::ostringstream oss; + for(size_t i=0; i<_observers.size(); ++i) { + _observers[i]->summary(oss); + } + fprintf(stderr, "%s", oss.str().c_str()); + } } // Function: get diff --git a/lib/taskflow/core/semaphore.hpp b/lib/taskflow/core/semaphore.hpp index 75d49be..12d6069 100644 --- a/lib/taskflow/core/semaphore.hpp +++ b/lib/taskflow/core/semaphore.hpp @@ -5,7 +5,7 @@ #include "declarations.hpp" -/** +/** @file semaphore.hpp @brief semaphore include file */ @@ -23,16 +23,16 @@ namespace tf { A semaphore creates a constraint that limits the maximum concurrency, i.e., the number of workers, in a set of tasks. -You can let a task acquire/release one or multiple semaphores before/after +You can let a task acquire/release one or multiple semaphores before/after executing its work. -A task can acquire and release a semaphore, -or just acquire or just release it. +A task can acquire and release a semaphore, +or just acquire or just release it. A tf::Semaphore object starts with an initial count. As long as that count is above 0, tasks can acquire the semaphore and do their work. If the count is 0 or less, a task trying to acquire the semaphore will not run but goes to a waiting list of that semaphore. -When the semaphore is released by another task, +When the semaphore is released by another task, it reschedules all tasks on that waiting list. @code{.cpp} @@ -62,7 +62,7 @@ Under normal circumstances, the five tasks would be executed concurrently. However, this example has a semaphore with initial count 1, and all tasks need to acquire that semaphore before running and release that semaphore after they are done. -This organization limits the number of concurrently running tasks to only one. 
+This arrangement limits the number of concurrently running tasks to only one.
*/
class Semaphore {

  friend class Node;

  public:
-
+
    /**
    @brief constructs a semaphore with the given counter
+
+    A semaphore creates a constraint that limits the maximum concurrency,
+    i.e., the number of workers, in a set of tasks.
+
+    @code{.cpp}
+    tf::Semaphore semaphore(4);  // concurrency constraint of 4 workers
+    @endcode
    */
-    explicit Semaphore(int max_workers);
-
+    explicit Semaphore(size_t max_workers);
+
    /**
    @brief queries the counter value (not thread-safe during the run)
    */
-    int count() const;
-
+    size_t count() const;
+
  private:

    std::mutex _mtx;
-    int _counter;
+    size_t _counter;

    std::vector<Node*> _waiters;
-
+
    bool _try_acquire_or_wait(Node*);

    std::vector<Node*> _release();
};

-inline Semaphore::Semaphore(int max_workers) :
+inline Semaphore::Semaphore(size_t max_workers) :
  _counter(max_workers) {
}
-
+
inline bool Semaphore::_try_acquire_or_wait(Node* me) {
  std::lock_guard<std::mutex> lock(_mtx);
  if(_counter > 0) {
@@ -117,7 +124,7 @@ inline std::vector<Node*> Semaphore::_release() {
  return r;
}

-inline int Semaphore::count() const {
+inline size_t Semaphore::count() const {
  return _counter;
}

diff --git a/lib/taskflow/core/task.hpp b/lib/taskflow/core/task.hpp
index 2cc4621..cd10b73 100644
--- a/lib/taskflow/core/task.hpp
+++ b/lib/taskflow/core/task.hpp
@@ -2,7 +2,7 @@

#include "graph.hpp"

-/**
+/**
@file task.hpp
@brief task include file
*/
@@ -19,45 +19,61 @@ namespace tf {
@brief enumeration of all task types
*/
enum class TaskType : int {
+  /** @brief placeholder task type */
  PLACEHOLDER = 0,
-  CUDAFLOW,
+  /** @brief static task type */
  STATIC,
+  /** @brief dynamic (subflow) task type */
  DYNAMIC,
+  /** @brief condition task type */
  CONDITION,
+  /** @brief module task type */
  MODULE,
+  /** @brief asynchronous task type */
  ASYNC,
-  UNDEFINED
+  /** @brief undefined task type (for internal use only) */
+  UNDEFINED
};

/**
+@private
@brief array of all task types (used for iterating task types)
*/
-inline constexpr std::array<TaskType, 7> TASK_TYPES = {
+inline constexpr std::array<TaskType, 6> TASK_TYPES = {
  TaskType::PLACEHOLDER,
-  TaskType::CUDAFLOW,
  TaskType::STATIC,
  TaskType::DYNAMIC,
  TaskType::CONDITION,
  TaskType::MODULE,
-  TaskType::ASYNC
+  TaskType::ASYNC,
};

/**
@brief convert a task type to a human-readable string
+
+The name of each task type is the lower-case string of its enumerator.
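+
+For a quick illustration (an added sketch; it assumes a taskflow holding
+one static task), tf::to_string pairs naturally with tf::Task::type:
+
+@code{.cpp}
+tf::Task t = taskflow.emplace([](){});
+std::cout << tf::to_string(t.type());  // prints "static"
+@endcode
+
+The complete mapping is: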
+ +@code{.cpp} +TaskType::PLACEHOLDER -> "placeholder" +TaskType::STATIC -> "static" +TaskType::DYNAMIC -> "subflow" +TaskType::CONDITION -> "condition" +TaskType::MODULE -> "module" +TaskType::ASYNC -> "async" +@endcode */ inline const char* to_string(TaskType type) { const char* val; switch(type) { - case TaskType::PLACEHOLDER: val = "placeholder"; break; - case TaskType::CUDAFLOW: val = "cudaflow"; break; - case TaskType::STATIC: val = "static"; break; - case TaskType::DYNAMIC: val = "subflow"; break; - case TaskType::CONDITION: val = "condition"; break; - case TaskType::MODULE: val = "module"; break; - case TaskType::ASYNC: val = "async"; break; - default: val = "undefined"; break; + case TaskType::PLACEHOLDER: val = "placeholder"; break; + case TaskType::STATIC: val = "static"; break; + case TaskType::DYNAMIC: val = "subflow"; break; + case TaskType::CONDITION: val = "condition"; break; + case TaskType::MODULE: val = "module"; break; + case TaskType::ASYNC: val = "async"; break; + default: val = "undefined"; break; } return val; @@ -68,39 +84,51 @@ inline const char* to_string(TaskType type) { // ---------------------------------------------------------------------------- /** -@brief determines if a callable is a static task +@brief determines if a callable is a dynamic task -A static task is a callable object constructible from std::function. +A dynamic task is a callable object constructible from std::function. */ template -constexpr bool is_static_task_v = std::is_invocable_r_v && - !std::is_invocable_r_v; +constexpr bool is_dynamic_task_v = + std::is_invocable_r_v && + !std::is_invocable_r_v; /** -@brief determines if a callable is a dynamic task +@brief determines if a callable is a condition task -A dynamic task is a callable object constructible from std::function. +A condition task is a callable object constructible from std::function +or std::function. */ template -constexpr bool is_dynamic_task_v = std::is_invocable_r_v; +constexpr bool is_condition_task_v = + (std::is_invocable_r_v || std::is_invocable_r_v) && + !is_dynamic_task_v; /** -@brief determines if a callable is a condition task +@brief determines if a callable is a multi-condition task -A condition task is a callable object constructible from std::function. +A multi-condition task is a callable object constructible from +std::function()> or +std::function(tf::Runtime&)>. */ template -constexpr bool is_condition_task_v = std::is_invocable_r_v; +constexpr bool is_multi_condition_task_v = + (std::is_invocable_r_v, C> || + std::is_invocable_r_v, C, Runtime&>) && + !is_dynamic_task_v; /** -@brief determines if a callable is a cudaflow task +@brief determines if a callable is a static task -A cudaFlow task is a callable object constructible from -std::function or std::function. +A static task is a callable object constructible from std::function +or std::function. */ template -constexpr bool is_cudaflow_task_v = std::is_invocable_r_v || - std::is_invocable_r_v; +constexpr bool is_static_task_v = + (std::is_invocable_r_v || std::is_invocable_r_v) && + !is_condition_task_v && + !is_multi_condition_task_v && + !is_dynamic_task_v; // ---------------------------------------------------------------------------- // Task @@ -109,19 +137,23 @@ constexpr bool is_cudaflow_task_v = std::is_invocable_r_v || /** @class Task -@brief handle to a node in a task dependency graph - -A Task is handle to manipulate a node in a taskflow graph. 
-It provides a set of methods for users to access and modify the attributes of -the associated graph node without directly touching internal node data. +@brief class to create a task handle over a node in a taskflow graph +A task is a wrapper over a node in a taskflow graph. +It provides a set of methods for users to access and modify the attributes of +the associated node in the taskflow graph. +A task is very lightweight object (i.e., only storing a node pointer) that +can be trivially copied around, +and it does not own the lifetime of the associated node. */ class Task { friend class FlowBuilder; + friend class Runtime; friend class Taskflow; friend class TaskView; - + friend class Executor; + public: /** @@ -133,12 +165,12 @@ class Task { @brief constructs the task with the copy of the other task */ Task(const Task& other); - + /** @brief replaces the contents with a copy of the other task */ Task& operator = (const Task&); - + /** @brief replaces the contents with a null pointer */ @@ -153,12 +185,12 @@ class Task { @brief compares if two tasks are not associated with the same graph node */ bool operator != (const Task& rhs) const; - + /** @brief queries the name of the task */ const std::string& name() const; - + /** @brief queries the number of successors of the task */ @@ -168,7 +200,7 @@ class Task { @brief queries the number of predecessors of the task */ size_t num_dependents() const; - + /** @brief queries the number of strong dependents of the task */ @@ -178,7 +210,7 @@ class Task { @brief queries the number of weak dependents of the task */ size_t num_weak_dependents() const; - + /** @brief assigns a name to the task @@ -193,22 +225,24 @@ class Task { @tparam C callable type - @param callable callable to construct one of the static, dynamic, condition, and cudaFlow tasks + @param callable callable to construct a task @return @c *this */ template Task& work(C&& callable); - + /** @brief creates a module task from a taskflow - @param taskflow a taskflow object for the module + @tparam T object type + @param object a custom object that defines @c T::graph() method @return @c *this */ - Task& composed_of(Taskflow& taskflow); - + template + Task& composed_of(T& object); + /** @brief adds precedence links from this to other tasks @@ -220,11 +254,11 @@ class Task { */ template Task& precede(Ts&&... 
tasks); - + /** @brief adds precedence links from other tasks to this - @tparam Ts parameter pack + @tparam Ts parameter pack @param tasks one or multiple tasks @@ -242,7 +276,54 @@ class Task { @brief makes the task acquire this semaphore */ Task& acquire(Semaphore& semaphore); + + /** + @brief assigns pointer to user data + + @param data pointer to user data + + The following example shows how to attach user data to a task and + run the task iteratively while changing the data value: + + @code{.cpp} + tf::Executor executor; + tf::Taskflow taskflow("attach data to a task"); + + int data; + + // create a task and attach it the data + auto A = taskflow.placeholder(); + A.data(&data).work([A](){ + auto d = *static_cast(A.data()); + std::cout << "data is " << d << std::endl; + }); + + // run the taskflow iteratively with changing data + for(data = 0; data<10; data++){ + executor.run(taskflow).wait(); + } + @endcode + + @return @c *this + */ + Task& data(void* data); + + /** + @brief assigns a priority value to the task + + A priority value can be one of the following three levels, + tf::TaskPriority::HIGH (numerically equivalent to 0), + tf::TaskPriority::NORMAL (numerically equivalent to 1), and + tf::TaskPriority::LOW (numerically equivalent to 2). + The smaller the priority value, the higher the priority. + */ + Task& priority(TaskPriority p); + /** + @brief queries the priority value of the task + */ + TaskPriority priority() const; + /** @brief resets the task handle to null */ @@ -262,13 +343,13 @@ class Task { @brief queries if the task has a work assigned */ bool has_work() const; - + /** @brief applies an visitor callable to each successor of the task */ template void for_each_successor(V&& visitor) const; - + /** @brief applies an visitor callable to each dependents of the task */ @@ -279,7 +360,7 @@ class Task { @brief obtains a hash value of the underlying node */ size_t hash_value() const; - + /** @brief returns the task type */ @@ -290,8 +371,14 @@ class Task { */ void dump(std::ostream& ostream) const; + /** + @brief queries pointer to user data + */ + void* data() const; + + private: - + Task(Node*); Node* _node {nullptr}; @@ -322,8 +409,9 @@ Task& Task::succeed(Ts&&... 
tasks) { } // Function: composed_of -inline Task& Task::composed_of(Taskflow& tf) { - _node->_handle.emplace(&tf); +template +Task& Task::composed_of(T& object) { + _node->_handle.emplace(object); return *this; } @@ -358,7 +446,6 @@ inline Task& Task::name(const std::string& name) { // Function: acquire inline Task& Task::acquire(Semaphore& s) { if(!_node->_semaphores) { - //_node->_semaphores.emplace(); _node->_semaphores = std::make_unique(); } _node->_semaphores->to_acquire.push_back(&s); @@ -423,15 +510,15 @@ inline bool Task::has_work() const { // Function: task_type inline TaskType Task::type() const { switch(_node->_handle.index()) { - case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; - case Node::STATIC: return TaskType::STATIC; - case Node::DYNAMIC: return TaskType::DYNAMIC; - case Node::CONDITION: return TaskType::CONDITION; - case Node::MODULE: return TaskType::MODULE; - case Node::ASYNC: return TaskType::ASYNC; - case Node::SILENT_ASYNC: return TaskType::ASYNC; - case Node::CUDAFLOW: return TaskType::CUDAFLOW; - default: return TaskType::UNDEFINED; + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; } } @@ -467,6 +554,7 @@ inline void Task::dump(std::ostream& os) const { // Function: work template Task& Task::work(C&& c) { + if constexpr(is_static_task_v) { _node->_handle.emplace(std::forward(c)); } @@ -476,8 +564,8 @@ Task& Task::work(C&& c) { else if constexpr(is_condition_task_v) { _node->_handle.emplace(std::forward(c)); } - else if constexpr(is_cudaflow_task_v) { - _node->_handle.emplace(std::forward(c)); + else if constexpr(is_multi_condition_task_v) { + _node->_handle.emplace(std::forward(c)); } else { static_assert(dependent_false_v, "invalid task callable"); @@ -485,18 +573,42 @@ Task& Task::work(C&& c) { return *this; } +// Function: data +inline void* Task::data() const { + return _node->_data; +} + +// Function: data +inline Task& Task::data(void* data) { + _node->_data = data; + return *this; +} + +// Function: priority +inline Task& Task::priority(TaskPriority p) { + _node->_priority = static_cast(p); + return *this; +} + +// Function: priority +inline TaskPriority Task::priority() const { + return static_cast(_node->_priority); +} + // ---------------------------------------------------------------------------- // global ostream // ---------------------------------------------------------------------------- /** -@brief overload of ostream inserter operator for cudaTask +@brief overload of ostream inserter operator for Task */ inline std::ostream& operator << (std::ostream& os, const Task& task) { task.dump(os); return os; } +// ---------------------------------------------------------------------------- +// Task View // ---------------------------------------------------------------------------- /** @@ -505,7 +617,7 @@ inline std::ostream& operator << (std::ostream& os, const Task& task) { @brief class to access task information from the observer interface */ class TaskView { - + friend class Executor; public: @@ -514,7 +626,7 @@ class TaskView { @brief queries the name of the task */ const std::string& name() const; - + /** @brief queries the number of successors of the task 
*/ @@ -524,7 +636,7 @@ class TaskView { @brief queries the number of predecessors of the task */ size_t num_dependents() const; - + /** @brief queries the number of strong dependents of the task */ @@ -540,7 +652,7 @@ class TaskView { */ template void for_each_successor(V&& visitor) const; - + /** @brief applies an visitor callable to each dependents of the task */ @@ -551,14 +663,14 @@ class TaskView { @brief queries the task type */ TaskType type() const; - + /** @brief obtains a hash value of the underlying node */ size_t hash_value() const; - + private: - + TaskView(const Node&); TaskView(const TaskView&) = default; @@ -597,18 +709,18 @@ inline size_t TaskView::num_successors() const { // Function: type inline TaskType TaskView::type() const { switch(_node._handle.index()) { - case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; - case Node::STATIC: return TaskType::STATIC; - case Node::DYNAMIC: return TaskType::DYNAMIC; - case Node::CONDITION: return TaskType::CONDITION; - case Node::MODULE: return TaskType::MODULE; - case Node::ASYNC: return TaskType::ASYNC; - case Node::SILENT_ASYNC: return TaskType::ASYNC; - case Node::CUDAFLOW: return TaskType::CUDAFLOW; - default: return TaskType::UNDEFINED; + case Node::PLACEHOLDER: return TaskType::PLACEHOLDER; + case Node::STATIC: return TaskType::STATIC; + case Node::DYNAMIC: return TaskType::DYNAMIC; + case Node::CONDITION: return TaskType::CONDITION; + case Node::MULTI_CONDITION: return TaskType::CONDITION; + case Node::MODULE: return TaskType::MODULE; + case Node::ASYNC: return TaskType::ASYNC; + case Node::DEPENDENT_ASYNC: return TaskType::ASYNC; + default: return TaskType::UNDEFINED; } } - + // Function: hash_value inline size_t TaskView::hash_value() const { return std::hash{}(&_node); @@ -618,7 +730,7 @@ inline size_t TaskView::hash_value() const { template void TaskView::for_each_successor(V&& visitor) const { for(size_t i=0; i<_node._successors.size(); ++i) { - visitor(TaskView(_node._successors[i])); + visitor(TaskView(*_node._successors[i])); } } @@ -626,7 +738,7 @@ void TaskView::for_each_successor(V&& visitor) const { template void TaskView::for_each_dependent(V&& visitor) const { for(size_t i=0; i<_node._dependents.size(); ++i) { - visitor(TaskView(_node._dependents[i])); + visitor(TaskView(*_node._dependents[i])); } } diff --git a/lib/taskflow/core/taskflow.hpp b/lib/taskflow/core/taskflow.hpp index 00b26f3..ff836f5 100644 --- a/lib/taskflow/core/taskflow.hpp +++ b/lib/taskflow/core/taskflow.hpp @@ -2,8 +2,8 @@ #include "flow_builder.hpp" -/** -@file core/taskflow.hpp +/** +@file taskflow/core/taskflow.hpp @brief taskflow include file */ @@ -12,47 +12,55 @@ namespace tf { // ---------------------------------------------------------------------------- /** -@class Taskflow +@class Taskflow -@brief main entry to create a task dependency graph +@brief class to create a taskflow object -A %taskflow manages a task dependency graph where each task represents a -callable object (e.g., @std_lambda, @std_function) and an edge represents a +A %taskflow manages a task dependency graph where each task represents a +callable object (e.g., @std_lambda, @std_function) and an edge represents a dependency between two tasks. A task is one of the following types: - - 1. static task: the callable constructible from - @c std::function - 2. dynamic task: the callable constructible from - @c std::function - 3. condition task: the callable constructible from - @c std::function - 4. module task: the task constructed from tf::Taskflow::composed_of - 5. 
%cudaFlow task: the callable constructible from
-     @c std::function<void(tf::cudaFlow&)> or
-     @c std::function<void(tf::cudaFlowCapturer&)>
+
+  1. static task         : the callable constructible from
+                           @c std::function<void()>
+  2. dynamic task        : the callable constructible from
+                           @c std::function<void(tf::Subflow&)>
+  3. condition task      : the callable constructible from
+                           @c std::function<int()>
+  4. multi-condition task: the callable constructible from
+                           @c %std::function<tf::SmallVector<int>()>
+  5. module task         : the task constructed from tf::Taskflow::composed_of

Each task is a basic computation unit and is run by one worker thread
from an executor.
-The following example creates a simple taskflow graph of four static tasks,
+The following example creates a simple taskflow graph of four static tasks,
@c A, @c B, @c C, and @c D, where
-@c A runs before @c B and @c C and
+@c A runs before @c B and @c C and
@c D runs after @c B and @c C.

@code{.cpp}
tf::Executor executor;
tf::Taskflow taskflow("simple");

-tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; });
+tf::Task A = taskflow.emplace([](){ std::cout << "TaskA\n"; });
tf::Task B = taskflow.emplace([](){ std::cout << "TaskB\n"; });
tf::Task C = taskflow.emplace([](){ std::cout << "TaskC\n"; });
tf::Task D = taskflow.emplace([](){ std::cout << "TaskD\n"; });

A.precede(B, C);  // A runs before B and C
D.succeed(B, C);  // D runs after B and C
-
-executor.run(taskflow).wait();
+
+executor.run(taskflow).wait();
@endcode

+The taskflow object itself is NOT thread-safe. You should not
+modify the graph while it is running, for example by adding new tasks,
+adding new dependencies, or moving the taskflow to another object.
+To minimize the overhead of task creation,
+our runtime leverages a global object pool to recycle
+tasks in a thread-safe manner.
+
Please refer to @ref Cookbook to learn more about each task type
and how to submit a taskflow to an executor.
*/
@@ -63,14 +71,20 @@ class Taskflow : public FlowBuilder {
  friend class FlowBuilder;

  struct Dumper {
-    std::stack<const Taskflow*> stack;
-    std::unordered_set<const Taskflow*> visited;
+    size_t id;
+    std::stack<std::pair<const Node*, const Graph*>> stack;
+    std::unordered_map<const Graph*, size_t> visited;
  };

  public:

    /**
    @brief constructs a taskflow with the given name
+
+    @code{.cpp}
+    tf::Taskflow taskflow("My Taskflow");
+    std::cout << taskflow.name();         // "My Taskflow"
+    @endcode
    */
    Taskflow(const std::string& name);

@@ -79,52 +93,140 @@ class Taskflow : public FlowBuilder {
    */
    Taskflow();

+    /**
+    @brief constructs a taskflow from a moved taskflow
+
+    Constructing a taskflow @c taskflow1 from a moved taskflow @c taskflow2 will
+    migrate the graph of @c taskflow2 to @c taskflow1.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    tf::Taskflow taskflow1(std::move(taskflow2));
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that @c taskflow2 should not be running in an executor
+    during the move operation, or the behavior is undefined.
+    */
+    Taskflow(Taskflow&& rhs);
+
+    /**
+    @brief move assignment operator
+
+    Moving a taskflow @c taskflow2 to another taskflow @c taskflow1 will destroy
+    the existing graph of @c taskflow1 and assign it the graph of @c taskflow2.
+    After the move, @c taskflow2 will become empty.
+
+    @code{.cpp}
+    taskflow1 = std::move(taskflow2);
+    assert(taskflow2.empty());
+    @endcode
+
+    Notice that both @c taskflow1 and @c taskflow2 should not be running
+    in an executor during the move operation, or the behavior is undefined.
+    */
+    Taskflow& operator = (Taskflow&& rhs);
+
    /**
    @brief default destructor

    When the destructor is called, all tasks and their associated data
    (e.g., captured data) will be destroyed.
- It is your responsibility to ensure all submitted execution of this + It is your responsibility to ensure all submitted execution of this taskflow have completed before destroying it. + For instance, the following code results in undefined behavior + since the executor may still be running the taskflow while + it is destroyed after the block. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow); + } + @endcode + + To fix the problem, we must wait for the execution to complete + before destroying the taskflow. + + @code{.cpp} + { + tf::Taskflow taskflow; + executor.run(taskflow).wait(); + } + @endcode */ ~Taskflow() = default; /** @brief dumps the taskflow to a DOT format through a std::ostream target + + @code{.cpp} + taskflow.dump(std::cout); // dump the graph to the standard output + + std::ofstream ofs("output.dot"); + taskflow.dump(ofs); // dump the graph to the file output.dot + @endcode + + For dynamically spawned tasks, such as module tasks, subflow tasks, + and GPU tasks, you need to run the taskflow first before you can + dump the entire graph. + + @code{.cpp} + tf::Task parent = taskflow.emplace([](tf::Subflow sf){ + sf.emplace([](){ std::cout << "child\n"; }); + }); + taskflow.dump(std::cout); // this dumps only the parent tasks + executor.run(taskflow).wait(); + taskflow.dump(std::cout); // this dumps both parent and child tasks + @endcode */ void dump(std::ostream& ostream) const; - + /** @brief dumps the taskflow to a std::string of DOT format + + This method is similar to tf::Taskflow::dump(std::ostream& ostream), + but returning a string of the graph in DOT format. */ std::string dump() const; - + /** @brief queries the number of tasks */ size_t num_tasks() const; - + /** @brief queries the emptiness of the taskflow + + An empty taskflow has no tasks. That is the return of + tf::Taskflow::num_tasks is zero. */ bool empty() const; /** @brief assigns a name to the taskflow + + @code{.cpp} + taskflow.name("assign another name"); + @endcode */ - void name(const std::string&); + void name(const std::string&); /** @brief queries the name of the taskflow + + @code{.cpp} + std::cout << "my name is: " << taskflow.name(); + @endcode */ - const std::string& name() const ; - + const std::string& name() const; + /** @brief clears the associated task dependency graph - + When you clear a taskflow, all tasks and their associated data - (e.g., captured data) will be destroyed. - You should never clean a taskflow while it is being run by an executor. + (e.g., captured data in task callables) will be destroyed. + The behavior of clearing a running taskflow is undefined. */ void clear(); @@ -144,23 +246,34 @@ class Taskflow : public FlowBuilder { template void for_each_task(V&& visitor) const; + /** + @brief returns a reference to the underlying graph object + + A graph object (of type tf::Graph) is the ultimate storage for the + task dependency graph and should only be used as an opaque + data structure to interact with the executor (e.g., composition). 
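+
+  For example (an added sketch; @c MyModule is a hypothetical user-defined
+  type), any object that exposes a @c graph() method can be composed
+  through tf::Taskflow::composed_of:
+
+  @code{.cpp}
+  struct MyModule {
+    tf::Taskflow taskflow;                           // owns the actual graph
+    tf::Graph& graph() { return taskflow.graph(); }  // expose it for composition
+  };
+
+  MyModule module;
+  tf::Taskflow top;
+  top.composed_of(module).name("module");            // composes the module's graph
+  @endcode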
+ */ + Graph& graph(); + private: - + + mutable std::mutex _mutex; + std::string _name; - - Graph _graph; - std::mutex _mtx; + Graph _graph; std::queue> _topologies; - - void _dump(std::ostream&, const Taskflow*) const; + + std::optional::iterator> _satellite; + + void _dump(std::ostream&, const Graph*) const; void _dump(std::ostream&, const Node*, Dumper&) const; - void _dump(std::ostream&, const Graph&, Dumper&) const; + void _dump(std::ostream&, const Graph*, Dumper&) const; }; // Constructor -inline Taskflow::Taskflow(const std::string& name) : +inline Taskflow::Taskflow(const std::string& name) : FlowBuilder {_graph}, _name {name} { } @@ -169,9 +282,35 @@ inline Taskflow::Taskflow(const std::string& name) : inline Taskflow::Taskflow() : FlowBuilder{_graph} { } +// Move constructor +inline Taskflow::Taskflow(Taskflow&& rhs) : FlowBuilder{_graph} { + + std::scoped_lock lock(rhs._mutex); + + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + + rhs._satellite.reset(); +} + +// Move assignment +inline Taskflow& Taskflow::operator = (Taskflow&& rhs) { + if(this != &rhs) { + std::scoped_lock lock(_mutex, rhs._mutex); + _name = std::move(rhs._name); + _graph = std::move(rhs._graph); + _topologies = std::move(rhs._topologies); + _satellite = rhs._satellite; + rhs._satellite.reset(); + } + return *this; +} + // Procedure: inline void Taskflow::clear() { - _graph.clear(); + _graph._clear(); } // Function: num_tasks @@ -194,6 +333,11 @@ inline const std::string& Taskflow::name() const { return _name; } +// Function: graph +inline Graph& Taskflow::graph() { + return _graph; +} + // Function: for_each_task template void Taskflow::for_each_task(V&& visitor) const { @@ -212,28 +356,40 @@ inline std::string Taskflow::dump() const { // Function: dump inline void Taskflow::dump(std::ostream& os) const { os << "digraph Taskflow {\n"; - _dump(os, this); + _dump(os, &_graph); os << "}\n"; } // Procedure: _dump -inline void Taskflow::_dump(std::ostream& os, const Taskflow* top) const { - +inline void Taskflow::_dump(std::ostream& os, const Graph* top) const { + Dumper dumper; - - dumper.stack.push(top); - dumper.visited.insert(top); + + dumper.id = 0; + dumper.stack.push({nullptr, top}); + dumper.visited[top] = dumper.id++; while(!dumper.stack.empty()) { - - auto f = dumper.stack.top(); + + auto [p, f] = dumper.stack.top(); dumper.stack.pop(); - - os << "subgraph cluster_p" << f << " {\nlabel=\"Taskflow: "; - if(f->_name.empty()) os << 'p' << f; - else os << f->_name; + + os << "subgraph cluster_p" << f << " {\nlabel=\""; + + // n-level module + if(p) { + os << 'm' << dumper.visited[f]; + } + // top-level taskflow graph + else { + os << "Taskflow: "; + if(_name.empty()) os << 'p' << this; + else os << _name; + } + os << "\";\n"; - _dump(os, f->_graph, dumper); + + _dump(os, f, dumper); os << "}\n"; } } @@ -252,60 +408,49 @@ inline void Taskflow::_dump( switch(node->_handle.index()) { case Node::CONDITION: + case Node::MULTI_CONDITION: os << "shape=diamond color=black fillcolor=aquamarine style=filled"; break; - case Node::CUDAFLOW: - os << " style=\"filled\"" - << " color=\"black\" fillcolor=\"purple\"" - << " fontcolor=\"white\"" - << " shape=\"folder\""; - break; - default: break; } os << "];\n"; - + for(size_t s=0; s_successors.size(); ++s) { - if(node->_handle.index() == Node::CONDITION) { + if(node->_is_conditioner()) { // case edge is dashed - os << 'p' << node << " -> p" << node->_successors[s] + os << 'p' << 
node << " -> p" << node->_successors[s] << " [style=dashed label=\"" << s << "\"];\n"; - } - else { + } else { os << 'p' << node << " -> p" << node->_successors[s] << ";\n"; } } - + // subflow join node - if(node->_parent && node->_successors.size() == 0) { + if(node->_parent && node->_parent->_handle.index() == Node::DYNAMIC && + node->_successors.size() == 0 + ) { os << 'p' << node << " -> p" << node->_parent << ";\n"; } + // node info switch(node->_handle.index()) { case Node::DYNAMIC: { - auto& sbg = std::get(node->_handle).subgraph; + auto& sbg = std::get_if(&node->_handle)->subgraph; if(!sbg.empty()) { os << "subgraph cluster_p" << node << " {\nlabel=\"Subflow: "; if(node->_name.empty()) os << 'p' << node; else os << node->_name; os << "\";\n" << "color=blue\n"; - _dump(os, sbg, dumper); + _dump(os, &sbg, dumper); os << "}\n"; } } break; - - case Node::CUDAFLOW: { - std::get(node->_handle).graph->dump( - os, node, node->_name - ); - } - break; default: break; @@ -314,10 +459,10 @@ inline void Taskflow::_dump( // Procedure: _dump inline void Taskflow::_dump( - std::ostream& os, const Graph& graph, Dumper& dumper + std::ostream& os, const Graph* graph, Dumper& dumper ) const { - - for(const auto& n : graph._nodes) { + + for(const auto& n : graph->_nodes) { // regular task if(n->_handle.index() != Node::MODULE) { @@ -325,22 +470,20 @@ inline void Taskflow::_dump( } // module task else { - - auto module = std::get(n->_handle).module; + //auto module = &(std::get_if(&n->_handle)->module); + auto module = &(std::get_if(&n->_handle)->graph); os << 'p' << n << "[shape=box3d, color=blue, label=\""; - if(n->_name.empty()) os << n; + if(n->_name.empty()) os << 'p' << n; else os << n->_name; - os << " [Taskflow: "; - if(module->_name.empty()) os << 'p' << module; - else os << module->_name; - os << "]\"];\n"; if(dumper.visited.find(module) == dumper.visited.end()) { - dumper.visited.insert(module); - dumper.stack.push(module); + dumper.visited[module] = dumper.id++; + dumper.stack.push({n, module}); } + os << " [m" << dumper.visited[module] << "]\"];\n"; + for(const auto s : n->_successors) { os << 'p' << n << "->" << 'p' << s << ";\n"; } @@ -355,12 +498,12 @@ inline void Taskflow::_dump( /** @class Future -@brief class to access the result of task execution +@brief class to access the result of an execution tf::Future is a derived class from std::future that will eventually hold the -execution result of a submitted taskflow (e.g., tf::Executor::run) -or an asynchronous task (e.g., tf::Executor::async). -In addition to base methods of std::future, +execution result of a submitted taskflow (tf::Executor::run) +or an asynchronous task (tf::Executor::async, tf::Executor::silent_async). +In addition to the base methods inherited from std::future, you can call tf::Future::cancel to cancel the execution of the running taskflow associated with this future object. 
The following example cancels a submission of a taskflow that contains @@ -371,7 +514,7 @@ tf::Executor executor; tf::Taskflow taskflow; for(int i=0; i<1000; i++) { - taskflow.emplace([](){ + taskflow.emplace([](){ std::this_thread::sleep_for(std::chrono::seconds(1)); }); } @@ -391,17 +534,14 @@ class Future : public std::future { friend class Executor; friend class Subflow; - + friend class Runtime; + using handle_t = std::variant< - std::monostate, std::weak_ptr, std::weak_ptr + std::monostate, std::weak_ptr >; - // variant index - constexpr static auto ASYNC = get_index_v, handle_t>; - constexpr static auto TASKFLOW = get_index_v, handle_t>; - public: - + /** @brief default constructor */ @@ -411,12 +551,12 @@ class Future : public std::future { @brief disabled copy constructor */ Future(const Future&) = delete; - + /** @brief default move constructor */ Future(Future&&) = default; - + /** @brief disabled copy assignment */ @@ -428,16 +568,21 @@ class Future : public std::future { Future& operator = (Future&&) = default; /** - @brief cancels the execution of the running taskflow associated with + @brief cancels the execution of the running taskflow associated with this future object @return @c true if the execution can be cancelled or @c false if the execution has already completed + + When you request a cancellation, the executor will stop scheduling + any tasks onwards. Tasks that are already running will continue to finish + (non-preemptive). + You can call tf::Future::wait to wait for the cancellation to complete. */ bool cancel(); private: - + handle_t _handle; template @@ -462,7 +607,7 @@ bool Future::cancel() { else { auto ptr = arg.lock(); if(ptr) { - ptr->_is_cancelled = true; + ptr->_is_cancelled.store(true, std::memory_order_relaxed); return true; } return false; @@ -472,7 +617,3 @@ bool Future::cancel() { } // end of namespace tf. --------------------------------------------------- - - - - diff --git a/lib/taskflow/core/topology.hpp b/lib/taskflow/core/topology.hpp index a9b8e51..b4d9eab 100644 --- a/lib/taskflow/core/topology.hpp +++ b/lib/taskflow/core/topology.hpp @@ -6,30 +6,25 @@ namespace tf { // class: TopologyBase class TopologyBase { - + friend class Executor; friend class Node; - + template friend class Future; protected: - bool _is_cancelled { false }; + std::atomic _is_cancelled { false }; }; // ---------------------------------------------------------------------------- -// class: AsyncTopology -class AsyncTopology : public TopologyBase { -}; - -// ---------------------------------------------------------------------------- - // class: Topology class Topology : public TopologyBase { - + friend class Executor; + friend class Runtime; public: @@ -42,7 +37,7 @@ class Topology : public TopologyBase { std::promise _promise; - std::vector _sources; + SmallVector _sources; std::function _pred; std::function _call; @@ -52,7 +47,7 @@ class Topology : public TopologyBase { // Constructor template -Topology::Topology(Taskflow& tf, P&& p, C&& c): +Topology::Topology(Taskflow& tf, P&& p, C&& c): _taskflow(tf), _pred {std::forward
(p)}, _call {std::forward(c)} { diff --git a/lib/taskflow/core/tsq.hpp b/lib/taskflow/core/tsq.hpp index 0a13630..e4ea76c 100644 --- a/lib/taskflow/core/tsq.hpp +++ b/lib/taskflow/core/tsq.hpp @@ -1,31 +1,115 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include "../utility/macros.hpp" +#include "../utility/traits.hpp" + +/** +@file tsq.hpp +@brief task queue include file +*/ namespace tf { + +// ---------------------------------------------------------------------------- +// Task Types +// ---------------------------------------------------------------------------- + +/** +@enum TaskPriority + +@brief enumeration of all task priority values + +A priority is an enumerated value of type @c unsigned. +Currently, %Taskflow defines three priority levels, +@c HIGH, @c NORMAL, and @c LOW, starting from 0, 1, to 2. +That is, the lower the value, the higher the priority. + +*/ +enum class TaskPriority : unsigned { + /** @brief value of the highest priority (i.e., 0) */ + HIGH = 0, + /** @brief value of the normal priority (i.e., 1) */ + NORMAL = 1, + /** @brief value of the lowest priority (i.e., 2) */ + LOW = 2, + /** @brief conventional value for iterating priority values */ + MAX = 3 +}; + + + +// ---------------------------------------------------------------------------- +// Task Queue +// ---------------------------------------------------------------------------- + + /** @class: TaskQueue -@tparam T data type (must be a pointer) +@tparam T data type (must be a pointer type) +@tparam TF_MAX_PRIORITY maximum level of the priority -@brief Lock-free unbounded single-producer multiple-consumer queue. +@brief class to create a lock-free unbounded single-producer multiple-consumer queue -This class implements the work stealing queue described in the paper, -"Correct and Efficient Work-Stealing for Weak Memory Models," -available at https://www.di.ens.fr/~zappa/readings/ppopp13.pdf. +This class implements the work-stealing queue described in the paper, +Correct and Efficient Work-Stealing for Weak Memory Models, +and extends it to include priority. Only the queue owner can perform pop and push operations, -while others can steal data from the queue. +while others can steal data from the queue simultaneously. +Priority starts from zero (highest priority) to the template value +`TF_MAX_PRIORITY-1` (lowest priority). +All operations are associated with priority values to indicate +the corresponding queues to which an operation is applied. + +The default template value, `TF_MAX_PRIORITY`, is `TaskPriority::MAX` +which applies only three priority levels to the task queue. + +@code{.cpp} +auto [A, B, C, D, E] = taskflow.emplace( + [] () { }, + [&] () { + std::cout << "Task B: " << counter++ << '\n'; // 0 + }, + [&] () { + std::cout << "Task C: " << counter++ << '\n'; // 2 + }, + [&] () { + std::cout << "Task D: " << counter++ << '\n'; // 1 + }, + [] () { } +); + +A.precede(B, C, D); +E.succeed(B, C, D); + +B.priority(tf::TaskPriority::HIGH); +C.priority(tf::TaskPriority::LOW); +D.priority(tf::TaskPriority::NORMAL); + +executor.run(taskflow).wait(); +@endcode + +In the above example, we have a task graph of five tasks, +@c A, @c B, @c C, @c D, and @c E, in which @c B, @c C, and @c D +can run in simultaneously when @c A finishes. +Since we only uses one worker thread in the executor, +we can deterministically run @c B first, then @c D, and @c C +in order of their priority values. 
+The output is as follows: + +@code{.shell-session} +Task B: 0 +Task D: 1 +Task C: 2 +@endcode + */ -template +template (TaskPriority::MAX)> class TaskQueue { - + + static_assert(TF_MAX_PRIORITY > 0, "TF_MAX_PRIORITY must be at least one"); static_assert(std::is_pointer_v, "T must be a pointer type"); struct Array { @@ -34,7 +118,7 @@ class TaskQueue { int64_t M; std::atomic* S; - explicit Array(int64_t c) : + explicit Array(int64_t c) : C {c}, M {c-1}, S {new std::atomic[static_cast(C)]} { @@ -47,10 +131,9 @@ class TaskQueue { int64_t capacity() const noexcept { return C; } - - template - void push(int64_t i, O&& o) noexcept { - S[i & M].store(std::forward(o), std::memory_order_relaxed); + + void push(int64_t i, T o) noexcept { + S[i & M].store(o, std::memory_order_relaxed); } T pop(int64_t i) noexcept { @@ -67,133 +150,208 @@ class TaskQueue { }; - std::atomic _top; - std::atomic _bottom; - std::atomic _array; - std::vector _garbage; + // Doubling the alignment by 2 seems to generate the most + // decent performance. + CachelineAligned> _top[TF_MAX_PRIORITY]; + CachelineAligned> _bottom[TF_MAX_PRIORITY]; + std::atomic _array[TF_MAX_PRIORITY]; + std::vector _garbage[TF_MAX_PRIORITY]; + + //std::atomic _cache {nullptr}; public: - + /** @brief constructs the queue with a given capacity @param capacity the capacity of the queue (must be power of 2) */ - explicit TaskQueue(int64_t capacity = 1024); + explicit TaskQueue(int64_t capacity = 512); /** @brief destructs the queue */ ~TaskQueue(); - + /** @brief queries if the queue is empty at the time of this call */ bool empty() const noexcept; - + + /** + @brief queries if the queue is empty at a specific priority value + */ + bool empty(unsigned priority) const noexcept; + /** @brief queries the number of items at the time of this call */ size_t size() const noexcept; + /** + @brief queries the number of items with the given priority + at the time of this call + */ + size_t size(unsigned priority) const noexcept; + /** @brief queries the capacity of the queue */ int64_t capacity() const noexcept; + /** + @brief queries the capacity of the queue at a specific priority value + */ + int64_t capacity(unsigned priority) const noexcept; + /** @brief inserts an item to the queue - Only the owner thread can insert an item to the queue. - The operation can trigger the queue to resize its capacity + @param item the item to push to the queue + @param priority priority value of the item to push (default = 0) + + Only the owner thread can insert an item to the queue. + The operation can trigger the queue to resize its capacity if more space is required. - - @tparam O data type - - @param item the item to perfect-forward to the queue */ - void push(T item); - + TF_FORCE_INLINE void push(T item, unsigned priority); + /** @brief pops out an item from the queue - Only the owner thread can pop out an item from the queue. - The return can be a nullptr if this operation failed (empty queue). + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). */ T pop(); - + + /** + @brief pops out an item with a specific priority value from the queue + + @param priority priority of the item to pop + + Only the owner thread can pop out an item from the queue. + The return can be a @c nullptr if this operation failed (empty queue). + */ + TF_FORCE_INLINE T pop(unsigned priority); + /** @brief steals an item from the queue Any threads can try to steal an item from the queue. 
- The return can be a nullptr if this operation failed (not necessary empty). + The return can be a @c nullptr if this operation failed (not necessary empty). */ T steal(); + + /** + @brief steals an item with a specific priority value from the queue + + @param priority priority of the item to steal + + Any threads can try to steal an item from the queue. + The return can be a @c nullptr if this operation failed (not necessary empty). + */ + T steal(unsigned priority); + + private: + TF_NO_INLINE Array* resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t); }; // Constructor -template -TaskQueue::TaskQueue(int64_t c) { +template +TaskQueue::TaskQueue(int64_t c) { assert(c && (!(c & (c-1)))); - _top.store(0, std::memory_order_relaxed); - _bottom.store(0, std::memory_order_relaxed); - _array.store(new Array{c}, std::memory_order_relaxed); - _garbage.reserve(32); + unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ + _top[p].data.store(0, std::memory_order_relaxed); + _bottom[p].data.store(0, std::memory_order_relaxed); + _array[p].store(new Array{c}, std::memory_order_relaxed); + _garbage[p].reserve(32); + }); } // Destructor -template -TaskQueue::~TaskQueue() { - for(auto a : _garbage) { - delete a; +template +TaskQueue::~TaskQueue() { + unroll<0, TF_MAX_PRIORITY, 1>([&](auto p){ + for(auto a : _garbage[p]) { + delete a; + } + delete _array[p].load(); + }); +} + +// Function: empty +template +bool TaskQueue::empty() const noexcept { + for(unsigned i=0; i -bool TaskQueue::empty() const noexcept { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_relaxed); - return b <= t; +template +bool TaskQueue::empty(unsigned p) const noexcept { + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); + return (b <= t); +} + +// Function: size +template +size_t TaskQueue::size() const noexcept { + size_t s; + unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { s = i ? size(i) + s : size(i); }); + return s; } // Function: size -template -size_t TaskQueue::size() const noexcept { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_relaxed); +template +size_t TaskQueue::size(unsigned p) const noexcept { + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); return static_cast(b >= t ? 
b - t : 0); } // Function: push -template -void TaskQueue::push(T o) { - int64_t b = _bottom.load(std::memory_order_relaxed); - int64_t t = _top.load(std::memory_order_acquire); - Array* a = _array.load(std::memory_order_relaxed); +template +TF_FORCE_INLINE void TaskQueue::push(T o, unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_acquire); + Array* a = _array[p].load(std::memory_order_relaxed); // queue is full if(a->capacity() - 1 < (b - t)) { - Array* tmp = a->resize(b, t); - _garbage.push_back(a); - std::swap(a, tmp); - _array.store(a, std::memory_order_relaxed); + a = resize_array(a, p, b, t); } a->push(b, o); std::atomic_thread_fence(std::memory_order_release); - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } // Function: pop -template -T TaskQueue::pop() { - int64_t b = _bottom.load(std::memory_order_relaxed) - 1; - Array* a = _array.load(std::memory_order_relaxed); - _bottom.store(b, std::memory_order_relaxed); +template +T TaskQueue::pop() { + for(unsigned i=0; i +TF_FORCE_INLINE T TaskQueue::pop(unsigned p) { + + int64_t b = _bottom[p].data.load(std::memory_order_relaxed) - 1; + Array* a = _array[p].load(std::memory_order_relaxed); + _bottom[p].data.store(b, std::memory_order_relaxed); std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t t = _top.load(std::memory_order_relaxed); + int64_t t = _top[p].data.load(std::memory_order_relaxed); T item {nullptr}; @@ -201,36 +359,48 @@ T TaskQueue::pop() { item = a->pop(b); if(t == b) { // the last item just got stolen - if(!_top.compare_exchange_strong(t, t+1, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { item = nullptr; } - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } } else { - _bottom.store(b + 1, std::memory_order_relaxed); + _bottom[p].data.store(b + 1, std::memory_order_relaxed); } return item; } // Function: steal -template -T TaskQueue::steal() { - int64_t t = _top.load(std::memory_order_acquire); - std::atomic_thread_fence(std::memory_order_seq_cst); - int64_t b = _bottom.load(std::memory_order_acquire); +template +T TaskQueue::steal() { + for(unsigned i=0; i +T TaskQueue::steal(unsigned p) { + int64_t t = _top[p].data.load(std::memory_order_acquire); + std::atomic_thread_fence(std::memory_order_seq_cst); + int64_t b = _bottom[p].data.load(std::memory_order_acquire); + T item {nullptr}; if(t < b) { - Array* a = _array.load(std::memory_order_consume); + Array* a = _array[p].load(std::memory_order_consume); item = a->pop(t); - if(!_top.compare_exchange_strong(t, t+1, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { + if(!_top[p].data.compare_exchange_strong(t, t+1, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { return nullptr; } } @@ -239,9 +409,33 @@ T TaskQueue::steal() { } // Function: capacity -template -int64_t TaskQueue::capacity() const noexcept { - return _array.load(std::memory_order_relaxed)->capacity(); +template +int64_t TaskQueue::capacity() const noexcept { + size_t s; + unroll<0, TF_MAX_PRIORITY, 1>([&](auto i) { + s = i ? 
capacity(i) + s : capacity(i);
+  });
+  return s;
 }

+// Function: capacity
+template <typename T, unsigned TF_MAX_PRIORITY>
+int64_t TaskQueue<T, TF_MAX_PRIORITY>::capacity(unsigned p) const noexcept {
+  return _array[p].load(std::memory_order_relaxed)->capacity();
+}
+
+template <typename T, unsigned TF_MAX_PRIORITY>
+TF_NO_INLINE typename TaskQueue<T, TF_MAX_PRIORITY>::Array*
+TaskQueue<T, TF_MAX_PRIORITY>::resize_array(Array* a, unsigned p, std::int64_t b, std::int64_t t) {
+
+  Array* tmp = a->resize(b, t);
+  _garbage[p].push_back(a);
+  std::swap(a, tmp);
+  _array[p].store(a, std::memory_order_release);
+  // Note: the relaxed order used in the original paper causes t-san to complain
+  //_array.store(a, std::memory_order_relaxed);
+  return a;
+}
+
 } // end of namespace tf -----------------------------------------------------

diff --git a/lib/taskflow/core/worker.hpp b/lib/taskflow/core/worker.hpp
index 61b7bc8..47fcf81 100644
--- a/lib/taskflow/core/worker.hpp
+++ b/lib/taskflow/core/worker.hpp
@@ -4,31 +4,100 @@
 #include "tsq.hpp"
 #include "notifier.hpp"

-/**
+/**
 @file worker.hpp
 @brief worker include file
 */

 namespace tf {

+// ----------------------------------------------------------------------------
+// Class Definition: Worker
+// ----------------------------------------------------------------------------
+
 /**
-@private
+@class Worker
+
+@brief class to create a worker in an executor
+
+The class is primarily used by the executor to perform the work-stealing
+algorithm. Users can access a worker object and alter its properties
+(e.g., changing the thread affinity in a POSIX-like system)
+using tf::WorkerInterface.
 */
-struct Worker {
+class Worker {

   friend class Executor;
   friend class WorkerView;

+  public:
+
+  /**
+  @brief queries the worker id associated with its parent executor
+
+  A worker id is an unsigned integer in the range [0, N),
+  where @c N is the number of workers spawned at the construction
+  time of the executor.
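+
+  As a sketch (assuming the executor API of this release), a task can map
+  its calling thread back to a worker id through tf::Executor::this_worker_id,
+  which returns -1 if the caller is not a worker of that executor:
+
+  @code{.cpp}
+  tf::Executor executor(4);
+  executor.silent_async([&](){
+    // runs on a worker thread, so the printed id lies in [0, 4)
+    std::cout << executor.this_worker_id() << '\n';
+  });
+  executor.wait_for_all();
+  @endcode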
+ */ + inline size_t id() const { return _id; } + + /** + @brief acquires a pointer access to the underlying thread + */ + inline std::thread* thread() const { return _thread; } + + /** + @brief queries the size of the queue (i.e., number of enqueued tasks to + run) associated with the worker + */ + inline size_t queue_size() const { return _wsq.size(); } + + /** + @brief queries the current capacity of the queue + */ + inline size_t queue_capacity() const { return static_cast(_wsq.capacity()); } + private: size_t _id; size_t _vtm; Executor* _executor; + std::thread* _thread; Notifier::Waiter* _waiter; std::default_random_engine _rdgen { std::random_device{}() }; TaskQueue _wsq; + Node* _cache; }; +// ---------------------------------------------------------------------------- +// Class Definition: PerThreadWorker +// ---------------------------------------------------------------------------- + +/** +@private +*/ +//struct PerThreadWorker { +// +// Worker* worker; +// +// PerThreadWorker() : worker {nullptr} {} +// +// PerThreadWorker(const PerThreadWorker&) = delete; +// PerThreadWorker(PerThreadWorker&&) = delete; +// +// PerThreadWorker& operator = (const PerThreadWorker&) = delete; +// PerThreadWorker& operator = (PerThreadWorker&&) = delete; +//}; + +/** +@private +*/ +//inline PerThreadWorker& this_worker() { +// thread_local PerThreadWorker worker; +// return worker; +//} + + // ---------------------------------------------------------------------------- // Class Definition: WorkerView // ---------------------------------------------------------------------------- @@ -44,22 +113,22 @@ when a worker runs a task, and the view object is only accessible from an observer derived from tf::ObserverInterface. */ class WorkerView { - + friend class Executor; - + public: - + /** - @brief queries the worker id associated with the executor + @brief queries the worker id associated with its parent executor A worker id is a unsigned integer in the range [0, N), where @c N is the number of workers spawned at the construction time of the executor. */ size_t id() const; - + /** - @brief queries the size of the queue (i.e., number of pending tasks to + @brief queries the size of the queue (i.e., number of pending tasks to run) associated with the worker */ size_t queue_size() const; @@ -98,6 +167,94 @@ inline size_t WorkerView::queue_capacity() const { } +// ---------------------------------------------------------------------------- +// Class Definition: WorkerInterface +// ---------------------------------------------------------------------------- + +/** +@class WorkerInterface + +@brief class to configure worker behavior in an executor + +The tf::WorkerInterface class lets users interact with the executor +to customize the worker behavior, +such as calling custom methods before and after a worker enters and leaves +the loop. +When you create an executor, it spawns a set of workers to run tasks. +The interaction between the executor and its spawned workers looks like +the following: + +for(size_t n=0; n +std::shared_ptr make_worker_interface(ArgsT&&... 
args) { + static_assert( + std::is_base_of_v, + "T must be derived from WorkerInterface" + ); + return std::make_shared(std::forward(args)...); +} + } // end of namespact tf ----------------------------------------------------- diff --git a/lib/taskflow/taskflow.hpp b/lib/taskflow/taskflow.hpp index c815c23..38ac741 100644 --- a/lib/taskflow/taskflow.hpp +++ b/lib/taskflow/taskflow.hpp @@ -1,26 +1,27 @@ #pragma once #include "core/executor.hpp" -#include "core/algorithm/critical.hpp" -#include "core/algorithm/for_each.hpp" -#include "core/algorithm/reduce.hpp" -#include "core/algorithm/sort.hpp" +#include "core/async.hpp" +#include "algorithm/critical.hpp" - -/** @dir taskflow +/** +@dir taskflow @brief root taskflow include dir */ -/** @dir taskflow/core +/** +@dir taskflow/core @brief taskflow core include dir */ -/** @dir taskflow/cuda -@brief taskflow CUDA include dir +/** +@dir taskflow/algorithm +@brief taskflow algorithms include dir */ -/** @dir taskflow/cuda/cublas -@brief taskflow cuBLAS include dir +/** +@dir taskflow/cuda +@brief taskflow CUDA include dir */ /** @@ -32,8 +33,8 @@ // TF_VERSION / 100 % 1000 is the minor version // TF_VERSION / 100000 is the major version -// current version: 3.1.0 -#define TF_VERSION 300100 +// current version: 3.6.0 +#define TF_VERSION 300600 #define TF_MAJOR_VERSION TF_VERSION/100000 #define TF_MINOR_VERSION TF_VERSION/100%1000 @@ -44,11 +45,19 @@ */ namespace tf { +/** +@private +*/ +namespace detail { } + + /** @brief queries the version information in a string format @c major.minor.patch + +Release notes are available here: https://taskflow.github.io/taskflow/Releases.html */ constexpr const char* version() { - return "3.1.0"; + return "3.6.0"; } diff --git a/lib/taskflow/utility/iterator.hpp b/lib/taskflow/utility/iterator.hpp index e2aa5b5..6441391 100644 --- a/lib/taskflow/utility/iterator.hpp +++ b/lib/taskflow/utility/iterator.hpp @@ -5,18 +5,18 @@ namespace tf { -template -constexpr std::enable_if_t>::value, size_t> -distance(T beg, T end, T step) { - return (end - beg + step + (step > 0 ? -1 : 1)) / step; -} - template constexpr std::enable_if_t>::value, bool> is_range_invalid(T beg, T end, T step) { - return ((step == 0 && beg != end) || - (beg < end && step <= 0) || + return ((step == 0 && beg != end) || + (beg < end && step <= 0) || (beg > end && step >= 0)); } +template +constexpr std::enable_if_t>::value, size_t> +distance(T beg, T end, T step) { + return (end - beg + step + (step > 0 ? 
-1 : 1)) / step; +} + } // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/utility/macros.hpp b/lib/taskflow/utility/macros.hpp new file mode 100644 index 0000000..e7598cf --- /dev/null +++ b/lib/taskflow/utility/macros.hpp @@ -0,0 +1,17 @@ +#pragma once + +#if defined(_MSC_VER) + #define TF_FORCE_INLINE __forceinline +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_FORCE_INLINE __attribute__((__always_inline__)) inline +#else + #define TF_FORCE_INLINE inline +#endif + +#if defined(_MSC_VER) + #define TF_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) && __GNUC__ > 3 + #define TF_NO_INLINE __attribute__((__noinline__)) +#else + #define TF_NO_INLINE +#endif diff --git a/lib/taskflow/utility/math.hpp b/lib/taskflow/utility/math.hpp index b195b23..f80053e 100644 --- a/lib/taskflow/utility/math.hpp +++ b/lib/taskflow/utility/math.hpp @@ -42,7 +42,7 @@ template >, void>* = nullptr > constexpr bool is_pow2(const T& x) { - return x && (!(x&(x-1))); + return x && (!(x&(x-1))); } //// finds the ceil of x divided by b @@ -77,7 +77,7 @@ RandItr median_of_three(RandItr l, RandItr m, RandItr r, C cmp) { } /** -@brief finds the pseudo median of a range of items using spreaded +@brief finds the pseudo median of a range of items using spreaded nine numbers */ template @@ -121,6 +121,30 @@ T unique_id() { return counter.fetch_add(1, std::memory_order_relaxed); } +/** +@brief updates an atomic variable with a maximum value +*/ +template +inline void atomic_max(std::atomic& v, const T& max_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev < max_v && + !v.compare_exchange_weak(prev, max_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + +/** +@brief updates an atomic variable with a minimum value +*/ +template +inline void atomic_min(std::atomic& v, const T& min_v) noexcept { + T prev = v.load(std::memory_order_relaxed); + while(prev > min_v && + !v.compare_exchange_weak(prev, min_v, std::memory_order_relaxed, + std::memory_order_relaxed)) { + } +} + } // end of namespace tf ----------------------------------------------------- diff --git a/lib/taskflow/utility/object_pool.hpp b/lib/taskflow/utility/object_pool.hpp index a90478b..34d60fb 100644 --- a/lib/taskflow/utility/object_pool.hpp +++ b/lib/taskflow/utility/object_pool.hpp @@ -3,7 +3,7 @@ // // 2020/02/02 - modified by Tsung-Wei Huang // - new implementation motivated by Hoard -// +// // 2019/07/10 - modified by Tsung-Wei Huang // - replace raw pointer with smart pointer // @@ -28,7 +28,7 @@ namespace tf { // Class: ObjectPool // // The class implements an efficient thread-safe object pool motivated -// by the Hoard memory allocator algorithm. +// by the Hoard memory allocator algorithm. // Different from the normal memory allocator, object pool allocates // only one object at a time. 
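//
// A minimal usage sketch (illustrative only; `Foo` is a hypothetical type
// that embeds the `_object_pool_block` pointer recycle() relies on):
//
//   tf::ObjectPool<Foo> pool;
//   Foo* obj = pool.animate(/* Foo constructor arguments */);
//   pool.recycle(obj);  // runs ~Foo() and reclaims its slot
//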
// @@ -44,13 +44,13 @@ namespace tf { // M = 30 // F = 4 // W = (30+4-1)/4 = 8 -// +// // b0: 0, 1, 2, 3, 4, 5, 6, 7 // b1: 8, 9, 10, 11, 12, 13, 14, 15 // b2: 16, 17, 18, 19, 20, 21, 22, 23 // b3: 24, 25, 26, 27, 28, 29 // b4: 30 (anything equal to M) -// +// // Example scenario 2: // M = 32 // F = 4 @@ -62,14 +62,14 @@ namespace tf { // b4: 32 (anything equal to M) // template -class ObjectPool { - - // the data column must be sufficient to hold the pointer in freelist +class ObjectPool { + + // the data column must be sufficient to hold the pointer in freelist constexpr static size_t X = (std::max)(sizeof(T*), sizeof(T)); //constexpr static size_t X = sizeof(long double) + std::max(sizeof(T*), sizeof(T)); //constexpr static size_t M = (S - offsetof(Block, data)) / X; constexpr static size_t M = S / X; - constexpr static size_t F = 4; + constexpr static size_t F = 4; constexpr static size_t B = F + 1; constexpr static size_t W = (M + F - 1) / F; constexpr static size_t K = 4; @@ -81,7 +81,7 @@ class ObjectPool { static_assert( M >= 128, "block size S must be larger enough to pool at least 128 objects" ); - + struct Blocklist { Blocklist* prev; Blocklist* next; @@ -100,7 +100,7 @@ class ObjectPool { }; struct Block { - LocalHeap* heap; + std::atomic heap; Blocklist list_node; size_t i; size_t u; @@ -110,7 +110,7 @@ class ObjectPool { }; public: - + /** @brief constructs an object pool from a number of anticipated threads */ @@ -120,18 +120,18 @@ class ObjectPool { @brief destructs the object pool */ ~ObjectPool(); - + /** @brief acquires a pointer to a object constructed from a given argument list */ template T* animate(ArgsT&&... args); - + /** @brief recycles a object pointed by @c ptr and destroys it */ void recycle(T* ptr); - + size_t num_bins_per_local_heap() const; size_t num_objects_per_bin() const; size_t num_objects_per_block() const; @@ -141,7 +141,7 @@ class ObjectPool { size_t num_local_heaps() const; size_t num_global_heaps() const; size_t num_heaps() const; - + float emptiness_threshold() const; private: @@ -158,7 +158,7 @@ class ObjectPool { template constexpr size_t _offset_in_class(const Q P::*member) const; - + template constexpr P* _parent_class_of(Q*, const Q P::*member); @@ -194,7 +194,7 @@ class ObjectPool { void _for_each_block(Blocklist*, C&&); }; - + // ---------------------------------------------------------------------------- // ObjectPool definition // ---------------------------------------------------------------------------- @@ -224,18 +224,20 @@ ObjectPool::~ObjectPool() { // clear local heaps for(auto& h : _lheaps) { for(size_t i=0; i size_t ObjectPool::num_bins_per_local_heap() const { @@ -281,11 +283,11 @@ size_t ObjectPool::num_heaps() const { // Function: capacity template size_t ObjectPool::capacity() const { - + size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { n += M; }; @@ -302,9 +304,9 @@ template size_t ObjectPool::num_available_objects() const { size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { n += (M - _block_of(p)->u); }; @@ -318,11 +320,11 @@ size_t ObjectPool::num_available_objects() const { // Function: num_allocated_objects template size_t ObjectPool::num_allocated_objects() const { - + size_t n = 0; - + // global heap - for(auto p=_gheap.list.next; p!=&_gheap.list; p=p->next) { + for(auto p=_gheap.list.next; p!=&_gheap.list; 
p=p->next) { n += _block_of(p)->u; }; @@ -368,14 +370,14 @@ constexpr P* ObjectPool::_parent_class_of( // Function: _block_of template -constexpr typename ObjectPool::Block* +constexpr typename ObjectPool::Block* ObjectPool::_block_of(Blocklist* list) { return _parent_class_of(list, &Block::list_node); } // Function: _block_of template -constexpr typename ObjectPool::Block* +constexpr typename ObjectPool::Block* ObjectPool::_block_of(const Blocklist* list) const { return _parent_class_of(list, &Block::list_node); } @@ -389,7 +391,7 @@ void ObjectPool::_blocklist_init_head(Blocklist *list) { // Procedure: _blocklist_add_impl // Insert a new entry between two known consecutive entries. -// +// // This is only for internal list manipulation where we know // the prev/next entries already! template @@ -405,10 +407,10 @@ void ObjectPool::_blocklist_add_impl( // list_push_front - add a new entry // @curr: curr entry to be added // @head: list head to add it after -// +// // Insert a new entry after the specified head. // This is good for implementing stacks. -// +// template void ObjectPool::_blocklist_push_front( Blocklist *curr, Blocklist *head @@ -419,10 +421,10 @@ void ObjectPool::_blocklist_push_front( // list_add_tail - add a new entry // @curr: curr entry to be added // @head: list head to add it before -// +// // Insert a new entry before the specified head. // This is useful for implementing queues. -// +// template void ObjectPool::_blocklist_push_back( Blocklist *curr, Blocklist *head @@ -432,10 +434,10 @@ void ObjectPool::_blocklist_push_back( // Delete a list entry by making the prev/next entries // point to each other. -// +// // This is only for internal list manipulation where we know // the prev/next entries already! -// +// template void ObjectPool::_blocklist_del_impl( Blocklist * prev, Blocklist * next @@ -458,7 +460,7 @@ void ObjectPool::_blocklist_del(Blocklist *entry) { // list_replace - replace old entry by new one // @old : the element to be replaced // @curr : the new element to insert -// +// // If @old was empty, it will be overwritten. template void ObjectPool::_blocklist_replace( @@ -537,7 +539,7 @@ void ObjectPool::_for_each_block(Blocklist* head, C&& c) { c(_block_of(p)); } } - + // Procedure: _for_each_block_safe // Iterate each item of a list - safe to free template @@ -577,15 +579,15 @@ template T* ObjectPool::animate(ArgsT&&... args) { //std::cout << "construct a new item\n"; - + // my logically mapped heap - LocalHeap& h = _this_heap(); - + LocalHeap& h = _this_heap(); + Block* s {nullptr}; h.mutex.lock(); - - // scan the list of superblocks from most full to least + + // scan the list of superblocks from the most full to the least full int f = static_cast(F-1); for(; f>=0; f--) { if(!_blocklist_is_empty(&h.lists[f])) { @@ -593,16 +595,16 @@ T* ObjectPool::animate(ArgsT&&... args) { break; } } - + // no superblock found if(f == -1) { // check heap 0 for a superblock _gheap.mutex.lock(); if(!_blocklist_is_empty(&_gheap.list)) { - + s = _block_of(_gheap.list.next); - + //printf("get a superblock from global heap %lu\n", s->u); assert(s->u < M && s->heap == nullptr); f = static_cast(_bin(s->u + 1)); @@ -620,7 +622,8 @@ T* ObjectPool::animate(ArgsT&&... args) { //printf("create a new superblock\n"); _gheap.mutex.unlock(); f = 0; - s = static_cast(std::malloc(sizeof(Block))); + //s = static_cast(std::malloc(sizeof(Block))); + s = new Block(); if(s == nullptr) { throw std::bad_alloc(); @@ -636,7 +639,7 @@ T* ObjectPool::animate(ArgsT&&... 
args) { h.a = h.a + M; } } - + // the superblock must have at least one space //assert(s->u < M); //printf("%lu %lu %lu\n", h.u, h.a, s->u); @@ -647,9 +650,9 @@ T* ObjectPool::animate(ArgsT&&... args) { // take one item from the superblock T* mem = _allocate(s); - + int b = static_cast(_bin(s->u)); - + if(b != f) { //printf("move superblock from list[%d] to list[%d]\n", f, b); _blocklist_move_front(&s->list_node, &h.lists[b]); @@ -670,7 +673,7 @@ T* ObjectPool::animate(ArgsT&&... args) { return mem; } - + // Function: destruct template void ObjectPool::recycle(T* mem) { @@ -684,7 +687,7 @@ void ObjectPool::recycle(T* mem) { Block* s = static_cast(mem->_object_pool_block); mem->~T(); - + //printf("deallocate %p (s=%p) M=%lu W=%lu X=%lu\n", mem, s, M, W, X); // here we need a loop because when we lock the heap, @@ -692,8 +695,8 @@ void ObjectPool::recycle(T* mem) { bool sync = false; do { - auto h = s->heap; - + LocalHeap* h = s->heap.load(std::memory_order_relaxed); + // the block is in global heap if(h == nullptr) { std::lock_guard glock(_gheap.mutex); @@ -739,14 +742,14 @@ void ObjectPool::recycle(T* mem) { } } } while(!sync); - + //std::cout << "s.i " << s->i << '\n' // << "s.u " << s->u << '\n'; } - + // Function: _this_heap template -typename ObjectPool::LocalHeap& +typename ObjectPool::LocalHeap& ObjectPool::_this_heap() { // here we don't use thread local since object pool might be // created and destroyed multiple times @@ -760,16 +763,16 @@ ObjectPool::_this_heap() { // Function: _next_pow2 template -constexpr unsigned ObjectPool::_next_pow2(unsigned n) const { +constexpr unsigned ObjectPool::_next_pow2(unsigned n) const { if(n == 0) return 1; - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n++; - return n; -} + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n++; + return n; +} } // end namespace tf -------------------------------------------------------- diff --git a/lib/taskflow/utility/os.hpp b/lib/taskflow/utility/os.hpp index 433f6d8..23ac301 100644 --- a/lib/taskflow/utility/os.hpp +++ b/lib/taskflow/utility/os.hpp @@ -14,7 +14,7 @@ #define TF_OS_CNK 0 #define TF_OS_HURD 0 #define TF_OS_SOLARIS 0 -#define TF_OS_UNIX 0 /* disjunction of TF_OS_LINUX, TF_OS_DARWIN etc. */ +#define TF_OS_UNIX 0 #ifdef _WIN32 #undef TF_OS_WINDOWS @@ -80,7 +80,7 @@ TF_OS_LINUX + TF_OS_DRAGONFLY + TF_OS_FREEBSD + TF_OS_NETBSD + \ TF_OS_OPENBSD + TF_OS_DARWIN + TF_OS_WINDOWS + TF_OS_HURD + \ TF_OS_SOLARIS) -#error Unknown OS +#define TF_OS_UNKNOWN 1 #endif #if TF_OS_LINUX || TF_OS_DRAGONFLY || TF_OS_FREEBSD || TF_OS_NETBSD || \ @@ -89,14 +89,60 @@ #define TF_OS_UNIX 1 #endif + +//----------------------------------------------------------------------------- +// Cache line alignment +//----------------------------------------------------------------------------- +#if defined(__i386__) || defined(__x86_64__) + #define TF_CACHELINE_SIZE 64 +#elif defined(__powerpc64__) + // TODO + // This is the L1 D-cache line size of our Power7 machines. + // Need to check if this is appropriate for other PowerPC64 systems. + #define TF_CACHELINE_SIZE 128 +#elif defined(__arm__) + // Cache line sizes for ARM: These values are not strictly correct since + // cache line sizes depend on implementations, not architectures. + // There are even implementations with cache line sizes configurable + // at boot time. 
+ #if defined(__ARM_ARCH_5T__) + #define TF_CACHELINE_SIZE 32 + #elif defined(__ARM_ARCH_7A__) + #define TF_CACHELINE_SIZE 64 + #endif +#endif + +#ifndef TF_CACHELINE_SIZE +// A reasonable default guess. Note that overestimates tend to waste more +// space, while underestimates tend to waste more time. + #define TF_CACHELINE_SIZE 64 +#endif + + + +//----------------------------------------------------------------------------- +// pause +//----------------------------------------------------------------------------- +//#if __has_include () +// #define TF_HAS_MM_PAUSE 1 +// #include +//#endif + namespace tf { +// Struct: CachelineAligned +// Due to prefetch, we typically do 2x cacheline for the alignment. +template +struct CachelineAligned { + alignas (2*TF_CACHELINE_SIZE) T data; +}; + // Function: get_env inline std::string get_env(const std::string& str) { #ifdef _MSC_VER char *ptr = nullptr; size_t len = 0; - + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { std::string res(ptr, len); std::free(ptr); @@ -115,7 +161,7 @@ inline bool has_env(const std::string& str) { #ifdef _MSC_VER char *ptr = nullptr; size_t len = 0; - + if(_dupenv_s(&ptr, &len, str.c_str()) == 0 && ptr != nullptr) { std::string res(ptr, len); std::free(ptr); @@ -129,8 +175,12 @@ inline bool has_env(const std::string& str) { #endif } -// ---------------------------------------------------------------------------- - +// Procedure: relax_cpu +//inline void relax_cpu() { +//#ifdef TF_HAS_MM_PAUSE +// _mm_pause(); +//#endif +//} diff --git a/lib/taskflow/utility/serializer.hpp b/lib/taskflow/utility/serializer.hpp index 387ef43..aab00f2 100644 --- a/lib/taskflow/utility/serializer.hpp +++ b/lib/taskflow/utility/serializer.hpp @@ -1,6 +1,30 @@ #pragma once -#include "traits.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace tf { @@ -9,172 +33,172 @@ namespace tf { // ---------------------------------------------------------------------------- // std::basic_string -template +template struct is_std_basic_string : std::false_type {}; -template +template struct is_std_basic_string > : std::true_type {}; -template +template constexpr bool is_std_basic_string_v = is_std_basic_string::value; // std::array -template +template struct is_std_array : std::false_type {}; -template +template struct is_std_array > : std::true_type {}; -template +template constexpr bool is_std_array_v = is_std_array::value; // std::vector -template +template struct is_std_vector : std::false_type {}; -template +template struct is_std_vector > : std::true_type {}; -template +template constexpr bool is_std_vector_v = is_std_vector::value; // std::deque -template +template struct is_std_deque : std::false_type {}; -template +template struct is_std_deque > : std::true_type {}; -template +template constexpr bool is_std_deque_v = is_std_deque::value; // std::list -template +template struct is_std_list : std::false_type {}; -template +template struct is_std_list > : std::true_type {}; -template +template constexpr bool is_std_list_v = is_std_list::value; // std::forward_list -template +template struct is_std_forward_list : std::false_type {}; -template +template struct is_std_forward_list > : std::true_type {}; -template +template constexpr bool is_std_forward_list_v = is_std_forward_list::value; // std::map -template +template struct 
is_std_map : std::false_type {}; -template +template struct is_std_map > : std::true_type {}; -template +template constexpr bool is_std_map_v = is_std_map::value; // std::unordered_map -template +template struct is_std_unordered_map : std::false_type {}; -template +template struct is_std_unordered_map > : std::true_type {}; -template +template constexpr bool is_std_unordered_map_v = is_std_unordered_map::value; // std::set -template +template struct is_std_set : std::false_type {}; -template +template struct is_std_set > : std::true_type {}; -template +template constexpr bool is_std_set_v = is_std_set::value; // std::unordered_set -template +template struct is_std_unordered_set : std::false_type {}; -template +template struct is_std_unordered_set > : std::true_type {}; -template +template constexpr bool is_std_unordered_set_v = is_std_unordered_set::value; // std::variant -template +template struct is_std_variant : std::false_type {}; -template +template struct is_std_variant > : std::true_type {}; -template +template constexpr bool is_std_variant_v = is_std_variant::value; // std::optional -template +template struct is_std_optional : std::false_type {}; -template +template struct is_std_optional > : std::true_type {}; -template +template constexpr bool is_std_optional_v = is_std_optional::value; // std::unique_ptr -template +template struct is_std_unique_ptr : std::false_type {}; -template +template struct is_std_unique_ptr > : std::true_type {}; -template +template constexpr bool is_std_unique_ptr_v = is_std_unique_ptr::value; // std::shared_ptr -template +template struct is_std_shared_ptr : std::false_type {}; -template +template struct is_std_shared_ptr > : std::true_type {}; -template +template constexpr bool is_std_shared_ptr_v = is_std_shared_ptr::value; // std::duration template struct is_std_duration : std::false_type {}; -template +template struct is_std_duration> : std::true_type {}; -template +template constexpr bool is_std_duration_v = is_std_duration::value; // std::time_point -template +template struct is_std_time_point : std::false_type {}; -template +template struct is_std_time_point> : std::true_type {}; -template +template constexpr bool is_std_time_point_v = is_std_time_point::value; // std::tuple -template +template struct is_std_tuple : std::false_type {}; -template +template struct is_std_tuple> : std::true_type {}; -template +template constexpr bool is_std_tuple_v = is_std_tuple::value; //----------------------------------------------------------------------------- @@ -182,7 +206,7 @@ constexpr bool is_std_tuple_v = is_std_tuple::value; //----------------------------------------------------------------------------- // ExtractType: forward declaration -template +template struct ExtractType; // ExtractType_t: alias interface @@ -211,23 +235,23 @@ struct ExtractType > : ExtractType> { // ---------------------------------------------------------------------------- // Struct: SizeTag -// Class that wraps a given size item which can be customized. +// Class that wraps a given size item which can be customized. 
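//
// A sketch of the intended call pattern (make_size_tag is defined below;
// `ar` stands for a Serializer archive):
//
//   std::vector<int> v = {1, 2, 3};
//   ar(tf::make_size_tag(v.size()));  // archives the size ahead of the items
//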
template class SizeTag { - public: - + public: + using type = std::conditional_t, T, std::decay_t>; - + SizeTag(T&& item) : _item(std::forward(item)) {} - + SizeTag& operator = (const SizeTag&) = delete; inline const T& get() const {return _item;} template auto save(ArchiverT & ar) const { return ar(_item); } - + template auto load(ArchiverT & ar) { return ar(_item); } @@ -249,9 +273,9 @@ SizeTag make_size_tag(T&& t) { // Class: MapItem template class MapItem { - + public: - + using KeyType = std::conditional_t , KeyT, std::decay_t>; using ValueType = std::conditional_t , ValueT, std::decay_t>; @@ -263,7 +287,7 @@ class MapItem { template auto save(ArchiverT & ar) const { return ar(_key, _value); } - + template auto load(ArchiverT & ar) { return ar(_key, _value); } @@ -284,7 +308,7 @@ MapItem make_kv_pair(KeyT&& k, ValueT&& v) { // ---------------------------------------------------------------------------- template -constexpr auto is_default_serializable_v = +constexpr auto is_default_serializable_v = ( std::is_arithmetic_v || std::is_enum_v || is_std_basic_string_v || @@ -301,166 +325,169 @@ constexpr auto is_default_serializable_v = is_std_variant_v || is_std_optional_v || is_std_tuple_v || - is_std_array_v; + is_std_array_v +); // Class: Serializer -template +template class Serializer { public: - - Serializer(Device& device); - + + Serializer(Stream& stream); + template SizeType operator()(T&&... items); - + private: - Device& _device; - - template >, void>* = nullptr + > + SizeType _save(T&&); + + template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template > || - is_std_list_v>, + is_std_list_v>, void >* = nullptr > SizeType _save(T&&); - - template >, + is_std_forward_list_v>, void >* = nullptr > SizeType _save(T&&); - - template > || - is_std_unordered_map_v>, + is_std_unordered_map_v>, void >* = nullptr > SizeType _save(T&&); - - template > || - is_std_unordered_set_v>, + is_std_unordered_set_v>, void >* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr > SizeType _save(T&&); - - template >, void>* = nullptr - > - SizeType _save(T&&); + + }; // Constructor -template -Serializer::Serializer(Device& device) : _device(device) { +template +Serializer::Serializer(Stream& stream) : _stream(stream) { } // Operator () -template +template template -SizeType Serializer::operator() (T&&... items) { +SizeType Serializer::operator() (T&&... 
items) { return (_save(std::forward(items)) + ...); } // arithmetic data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { - _device.write(reinterpret_cast(std::addressof(t)), sizeof(t)); +SizeType Serializer::_save(T&& t) { + _stream.write(reinterpret_cast(std::addressof(t)), sizeof(t)); return sizeof(t); } // std::basic_string -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; auto sz = _save(make_size_tag(t.size())); - _device.write( - reinterpret_cast(t.data()), + _stream.write( + reinterpret_cast(t.data()), t.size()*sizeof(typename U::value_type) ); return sz + t.size()*sizeof(typename U::value_type); } // std::vector -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; - + auto sz = _save(make_size_tag(t.size())); if constexpr (std::is_arithmetic_v) { - _device.write( - reinterpret_cast(t.data()), + _stream.write( + reinterpret_cast(t.data()), t.size() * sizeof(typename U::value_type) ); sz += t.size() * sizeof(typename U::value_type); @@ -474,12 +501,12 @@ SizeType Serializer::_save(T&& t) { } // std::list and std::deque -template -template +template > || is_std_list_v>, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& item : t) { sz += _save(item); @@ -488,11 +515,11 @@ SizeType Serializer::_save(T&& t) { } // std::forward_list -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(std::distance(t.begin(), t.end()))); for(auto&& item : t) { sz += _save(item); @@ -501,13 +528,13 @@ SizeType Serializer::_save(T&& t) { } // std::map and std::unordered_map -template +template template > || - is_std_unordered_map_v>, + is_std_unordered_map_v>, void >*> -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& [k, v] : t) { sz += _save(make_kv_pair(k, v)); @@ -516,13 +543,13 @@ SizeType Serializer::_save(T&& t) { } // std::set and std::unordered_set -template +template template > || - is_std_unordered_set_v>, + is_std_unordered_set_v>, void >*> -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { auto sz = _save(make_size_tag(t.size())); for(auto&& item : t) { sz += _save(item); @@ -531,39 +558,39 @@ SizeType Serializer::_save(T&& t) { } // enum data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; return _save(static_cast>(t)); } // duration data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return _save(t.count()); } // time point data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return _save(t.time_since_epoch()); } // optional data type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { if(bool flag = t.has_value(); flag) { return _save(flag) + _save(*t); } @@ -573,35 +600,35 @@ SizeType Serializer::_save(T&& t) { } // variant type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { - return _save(t.index()) + +SizeType Serializer::_save(T&& t) { + return _save(t.index()) + 
std::visit([&] (auto&& arg){ return _save(arg);}, t); } // tuple type -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return std::apply( [&] (auto&&... args) { - return (_save(std::forward(args)) + ... + 0); + return (_save(std::forward(args)) + ... + 0); }, std::forward(t) ); } // array -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { using U = std::decay_t; @@ -610,9 +637,9 @@ SizeType Serializer::_save(T&& t) { SizeType sz; if constexpr(std::is_arithmetic_v) { - _device.write(reinterpret_cast(t.data()), sizeof(t)); + _stream.write(reinterpret_cast(t.data()), sizeof(t)); sz = sizeof(t); - } + } else { sz = 0; for(auto&& item : t) { @@ -623,12 +650,12 @@ SizeType Serializer::_save(T&& t) { return sz; } -// custom save method -template -template +template >, void>* > -SizeType Serializer::_save(T&& t) { +SizeType Serializer::_save(T&& t) { return t.save(*this); } @@ -637,7 +664,7 @@ SizeType Serializer::_save(T&& t) { // ---------------------------------------------------------------------------- template -constexpr auto is_default_deserializable_v = +constexpr auto is_default_deserializable_v = std::is_arithmetic_v || std::is_enum_v || is_std_basic_string_v || @@ -657,199 +684,199 @@ constexpr auto is_default_deserializable_v = is_std_array_v; // Class: Deserializer -template +template class Deserializer { public: - - Deserializer(Device& device); - + + Deserializer(Stream& stream); + template SizeType operator()(T&&... items); - + private: - Device& _device; - + Stream& _stream; + // Function: _variant_helper template < - size_t I = 0, typename... ArgsT, + size_t I = 0, typename... ArgsT, std::enable_if_t* = nullptr > SizeType _variant_helper(size_t, std::variant&); - + // Function: _variant_helper template < - size_t I = 0, typename... ArgsT, + size_t I = 0, typename... ArgsT, std::enable_if_t* = nullptr > SizeType _variant_helper(size_t, std::variant&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template > || is_std_list_v> || - is_std_forward_list_v>, + is_std_forward_list_v>, void >* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); - - template >, void>* = nullptr > SizeType _load(T&&); }; // Constructor -template -Deserializer::Deserializer(Device& device) : _device(device) { +template +Deserializer::Deserializer(Stream& stream) : _stream(stream) { } // Operator () -template +template template -SizeType Deserializer::operator() (T&&... items) { +SizeType Deserializer::operator() (T&&... 
items) { return (_load(std::forward(items)) + ...); } // Function: _variant_helper -template +template template *> -SizeType Deserializer::_variant_helper(size_t, std::variant&) { +SizeType Deserializer::_variant_helper(size_t, std::variant&) { return 0; } // Function: _variant_helper -template +template template *> -SizeType Deserializer::_variant_helper(size_t i, std::variant& v) { +SizeType Deserializer::_variant_helper(size_t i, std::variant& v) { if(i == 0) { using type = ExtractType_t>; if(v.index() != I) { static_assert( - std::is_default_constructible::value, + std::is_default_constructible::value, "Failed to archive variant (type should be default constructible T())" ); v = type(); } - return _load(std::get(v)); + return _load(*std::get_if(&v)); } return _variant_helper(i-1, v); } // arithmetic data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - _device.read(reinterpret_cast(std::addressof(t)), sizeof(t)); +SizeType Deserializer::_load(T&& t) { + _stream.read(reinterpret_cast(std::addressof(t)), sizeof(t)); return sizeof(t); } // std::basic_string -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::size_type num_chars; auto sz = _load(make_size_tag(num_chars)); t.resize(num_chars); - _device.read(reinterpret_cast(t.data()), num_chars*sizeof(typename U::value_type)); + _stream.read(reinterpret_cast(t.data()), num_chars*sizeof(typename U::value_type)); return sz + num_chars*sizeof(typename U::value_type); } // std::vector -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; - + typename U::size_type num_data; - + auto sz = _load(make_size_tag(num_data)); if constexpr(std::is_arithmetic_v) { t.resize(num_data); - _device.read(reinterpret_cast(t.data()), num_data * sizeof(typename U::value_type)); + _stream.read(reinterpret_cast(t.data()), num_data * sizeof(typename U::value_type)); sz += num_data * sizeof(typename U::value_type); - } + } else { t.resize(num_data); for(auto && v : t) { @@ -860,15 +887,15 @@ SizeType Deserializer::_load(T&& t) { } // std::list and std::deque -template -template +template > || is_std_list_v> || is_std_forward_list_v>, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; - + typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); @@ -879,21 +906,21 @@ SizeType Deserializer::_load(T&& t) { return sz; } -// std::map -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); - + t.clear(); auto hint = t.begin(); - + typename U::key_type k; typename U::mapped_type v; @@ -905,11 +932,11 @@ SizeType Deserializer::_load(T&& t) { } // std::unordered_map -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); @@ -924,17 +951,17 @@ SizeType Deserializer::_load(T&& t) { sz += _load(make_kv_pair(k, v)); t.emplace(std::move(k), std::move(v)); } - + return sz; } -// std::set -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; typename U::size_type 
num_data; @@ -942,46 +969,46 @@ SizeType Deserializer::_load(T&& t) { t.clear(); auto hint = t.begin(); - + typename U::key_type k; - for(size_t i=0; i -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; - + typename U::size_type num_data; auto sz = _load(make_size_tag(num_data)); t.clear(); t.reserve(num_data); - + typename U::key_type k; - for(size_t i=0; i -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; std::underlying_type_t k; auto sz = _load(k); @@ -990,11 +1017,11 @@ SizeType Deserializer::_load(T&& t) { } // duration data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::rep count; auto s = _load(count); @@ -1003,11 +1030,11 @@ SizeType Deserializer::_load(T&& t) { } // time point data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; typename U::duration elapsed; auto s = _load(elapsed); @@ -1016,12 +1043,12 @@ SizeType Deserializer::_load(T&& t) { } // optional data type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { - +SizeType Deserializer::_load(T&& t) { + using U = std::decay_t; bool has_value; @@ -1033,53 +1060,53 @@ SizeType Deserializer::_load(T&& t) { s += _load(*t); } else { - t.reset(); + t.reset(); } return s; } // variant type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { std::decay_t idx; auto s = _load(idx); return s + _variant_helper(idx, t); } // tuple type -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { return std::apply( [&] (auto&&... args) { - return (_load(std::forward(args)) + ... + 0); + return (_load(std::forward(args)) + ... 
+ 0); }, std::forward(t) ); } // array -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { using U = std::decay_t; static_assert(std::tuple_size::value > 0, "Array size can't be zero"); SizeType sz; - + if constexpr(std::is_arithmetic_v) { - _device.read(reinterpret_cast(t.data()), sizeof(t)); + _stream.read(reinterpret_cast(t.data()), sizeof(t)); sz = sizeof(t); - } + } else { sz = 0; for(auto && v : t) { @@ -1090,12 +1117,12 @@ SizeType Deserializer::_load(T&& t) { return sz; } -// custom save method -template -template +template >, void>* > -SizeType Deserializer::_load(T&& t) { +SizeType Deserializer::_load(T&& t) { return t.load(*this); } diff --git a/lib/taskflow/utility/singleton.hpp b/lib/taskflow/utility/singleton.hpp index 01d521c..aab50bc 100644 --- a/lib/taskflow/utility/singleton.hpp +++ b/lib/taskflow/utility/singleton.hpp @@ -11,7 +11,7 @@ template class Singleton { public: - + /** @brief get a reference to the singleton object */ diff --git a/lib/taskflow/utility/small_vector.hpp b/lib/taskflow/utility/small_vector.hpp new file mode 100644 index 0000000..a42c264 --- /dev/null +++ b/lib/taskflow/utility/small_vector.hpp @@ -0,0 +1,1048 @@ +// small vector modified from llvm + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) + #define TF_LIKELY(x) (__builtin_expect((x), 1)) + #define TF_UNLIKELY(x) (__builtin_expect((x), 0)) +#else + #define TF_LIKELY(x) (x) + #define TF_UNLIKELY(x) (x) +#endif + +/** +@file small_vector.hpp +@brief small vector include file +*/ + +namespace tf { namespace detail { + +/** +@private +@brief NextCapacity - Returns the next power of two (in 64-bits) + that is strictly greater than A. Returns zero on overflow. + this function assumes A to be positive +*/ +inline uint64_t NextCapacity(uint64_t A) { + A |= (A >> 1); + A |= (A >> 2); + A |= (A >> 4); + A |= (A >> 8); + A |= (A >> 16); + A |= (A >> 32); + return A + 1; +} + +}} // end of namespace tf::detail -------------------------------------------- + + +namespace tf { + +/** +@private +*/ +template +struct IsPod : std::integral_constant::value && + std::is_trivial::value> {}; + +/** +@private +*/ +class SmallVectorBase { +protected: + void *BeginX, *EndX, *CapacityX; + +protected: + SmallVectorBase(void *FirstEl, size_t Size) + : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {} + + /// This is an implementation of the grow() method which only works + /// on POD-like data types and is out of line to reduce code duplication. + void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize){ + size_t CurSizeBytes = size_in_bytes(); + size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow. + if (NewCapacityInBytes < MinSizeInBytes) { + NewCapacityInBytes = MinSizeInBytes; + } + + void *NewElts; + if (BeginX == FirstEl) { + NewElts = std::malloc(NewCapacityInBytes); + + // Copy the elements over. No need to run dtors on PODs. + memcpy(NewElts, this->BeginX, CurSizeBytes); + } else { + // If this wasn't grown from the inline copy, grow the allocated space. + NewElts = realloc(this->BeginX, NewCapacityInBytes); + } + //assert(NewElts && "Out of memory"); + + this->EndX = (char*)NewElts+CurSizeBytes; + this->BeginX = NewElts; + this->CapacityX = (char*)this->BeginX + NewCapacityInBytes; + } + +public: + /// This returns size()*sizeof(T). 
+ size_t size_in_bytes() const { + return size_t((char*)EndX - (char*)BeginX); + } + + /// capacity_in_bytes - This returns capacity()*sizeof(T). + size_t capacity_in_bytes() const { + return size_t((char*)CapacityX - (char*)BeginX); + } + + bool empty() const { return BeginX == EndX; } +}; + +/** +@private +*/ +template struct SmallVectorStorage; + +/** +@private +*/ +template +class SmallVectorTemplateCommon : public SmallVectorBase { + + private: + template friend struct SmallVectorStorage; + + template + struct AlignedUnionType { + alignas(X) std::byte buff[std::max(sizeof(std::byte), sizeof(X))]; + }; + + // Allocate raw space for N elements of type T. If T has a ctor or dtor, we + // don't want it to be automatically run, so we need to represent the space as + // something else. Use an array of char of sufficient alignment. + + // deprecated in c++23 + //typedef typename std::aligned_union<1, T>::type U; + typedef AlignedUnionType U; + + U FirstEl; + // Space after 'FirstEl' is clobbered, do not add any instance vars after it. + + protected: + SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {} + + void grow_pod(size_t MinSizeInBytes, size_t TSize) { + SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize); + } + + /// Return true if this is a smallvector which has not had dynamic + /// memory allocated for it. + bool isSmall() const { + return BeginX == static_cast(&FirstEl); + } + + /// Put this vector in a state of being small. + void resetToSmall() { + BeginX = EndX = CapacityX = &FirstEl; + } + + void setEnd(T *P) { this->EndX = P; } + + public: + typedef size_t size_type; + typedef ptrdiff_t difference_type; + typedef T value_type; + typedef T *iterator; + typedef const T *const_iterator; + + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + + typedef T &reference; + typedef const T &const_reference; + typedef T *pointer; + typedef const T *const_pointer; + + // forward iterator creation methods. + inline iterator begin() { return (iterator)this->BeginX; } + inline const_iterator begin() const { return (const_iterator)this->BeginX; } + inline iterator end() { return (iterator)this->EndX; } + inline const_iterator end() const { return (const_iterator)this->EndX; } + + protected: + + iterator capacity_ptr() { return (iterator)this->CapacityX; } + const_iterator capacity_ptr() const { return (const_iterator)this->CapacityX;} + + public: + + // reverse iterator creation methods. + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin());} + + inline size_type size() const { return end()-begin(); } + inline size_type max_size() const { return size_type(-1) / sizeof(T); } + + /// Return the total number of elements in the currently allocated buffer. + size_t capacity() const { return capacity_ptr() - begin(); } + + /// Return a pointer to the vector's buffer, even if empty(). + pointer data() { return pointer(begin()); } + /// Return a pointer to the vector's buffer, even if empty(). 
+ const_pointer data() const { return const_pointer(begin()); } + + inline reference operator[](size_type idx) { + //assert(idx < size()); + return begin()[idx]; + } + + inline const_reference operator[](size_type idx) const { + //assert(idx < size()); + return begin()[idx]; + } + + reference front() { + //assert(!empty()); + return begin()[0]; + } + + const_reference front() const { + //assert(!empty()); + return begin()[0]; + } + + reference back() { + //assert(!empty()); + return end()[-1]; + } + + const_reference back() const { + //assert(!empty()); + return end()[-1]; + } +}; + +/** +@private +*/ +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { + +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + static void destroy_range(T *S, T *E) { + while (S != E) { + --E; + E->~T(); + } + } + + /// Move the range [I, E) into the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(std::make_move_iterator(I), + std::make_move_iterator(E), Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory starting with "Dest", + /// constructing elements as needed. + template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + std::uninitialized_copy(I, E, Dest); + } + + /// Grow the allocated memory (without initializing new elements), doubling + /// the size of the allocated memory. Guarantees space for at least one more + /// element, or MinSize more elements if specified. + void grow(size_t MinSize = 0); + +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(Elt); + this->setEnd(this->end()+1); + } + + void push_back(T &&Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void*) this->end()) T(::std::move(Elt)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + this->end()->~T(); + } +}; + +/** +@private +*/ +template +void SmallVectorTemplateBase::grow(size_t MinSize) { + size_t CurCapacity = this->capacity(); + size_t CurSize = this->size(); + // Always grow, even from zero. + size_t NewCapacity = size_t(tf::detail::NextCapacity(CurCapacity+2)); + if (NewCapacity < MinSize) + NewCapacity = MinSize; + T *NewElts = static_cast(std::malloc(NewCapacity*sizeof(T))); + + // Move the elements over. + this->uninitialized_move(this->begin(), this->end(), NewElts); + + // Destroy the original elements. + destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + std::free(this->begin()); + + this->setEnd(NewElts+CurSize); + this->BeginX = NewElts; + this->CapacityX = this->begin()+NewCapacity; +} + +/** +@private +*/ +template +class SmallVectorTemplateBase : public SmallVectorTemplateCommon { +protected: + SmallVectorTemplateBase(size_t Size) : SmallVectorTemplateCommon(Size) {} + + // No need to do a destroy loop for POD's. + static void destroy_range(T *, T *) {} + + /// Move the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_move(It1 I, It1 E, It2 Dest) { + // Just do a copy. + uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. 
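+  /// (generic-iterator fallback; the pointer overload below lowers this
+  /// to a single memcpy for POD element types)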
+ template + static void uninitialized_copy(It1 I, It1 E, It2 Dest) { + // Arbitrary iterator types; just use the basic implementation. + std::uninitialized_copy(I, E, Dest); + } + + /// Copy the range [I, E) onto the uninitialized memory + /// starting with "Dest", constructing elements into it as needed. + template + static void uninitialized_copy( + T1 *I, T1 *E, T2 *Dest, + typename std::enable_if::type, + T2>::value>::type * = nullptr) { + // Use memcpy for PODs iterated by pointers (which includes SmallVector + // iterators): std::uninitialized_copy optimizes to memmove, but we can + // use memcpy here. Note that I and E are iterators and thus might be + // invalid for memcpy if they are equal. + if (I != E) + memcpy(Dest, I, (E - I) * sizeof(T)); + } + + /// Double the size of the allocated memory, guaranteeing space for at + /// least one more element or MinSize if specified. + void grow(size_t MinSize = 0) { + this->grow_pod(MinSize*sizeof(T), sizeof(T)); + } +public: + void push_back(const T &Elt) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + memcpy(this->end(), &Elt, sizeof(T)); + this->setEnd(this->end()+1); + } + + void pop_back() { + this->setEnd(this->end()-1); + } +}; + +/** +@private +*/ +template +class SmallVectorImpl : public SmallVectorTemplateBase::value> { + typedef SmallVectorTemplateBase::value> SuperClass; + + SmallVectorImpl(const SmallVectorImpl&) = delete; + +public: + typedef typename SuperClass::iterator iterator; + typedef typename SuperClass::const_iterator const_iterator; + typedef typename SuperClass::size_type size_type; + +protected: + // Default ctor - Initialize to empty. + explicit SmallVectorImpl(unsigned N) + : SmallVectorTemplateBase::value>(N*sizeof(T)) { + } + +public: + ~SmallVectorImpl() { + // Destroy the constructed elements in the vector. + this->destroy_range(this->begin(), this->end()); + + // If this wasn't grown from the inline copy, deallocate the old space. + if (!this->isSmall()) + std::free(this->begin()); + } + + + void clear() { + this->destroy_range(this->begin(), this->end()); + this->EndX = this->BeginX; + } + + void resize(size_type N) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + for (auto I = this->end(), E = this->begin() + N; I != E; ++I) + new (&*I) T(); + this->setEnd(this->begin()+N); + } + } + + void resize(size_type N, const T &NV) { + if (N < this->size()) { + this->destroy_range(this->begin()+N, this->end()); + this->setEnd(this->begin()+N); + } else if (N > this->size()) { + if (this->capacity() < N) + this->grow(N); + std::uninitialized_fill(this->end(), this->begin()+N, NV); + this->setEnd(this->begin()+N); + } + } + + void reserve(size_type N) { + if (this->capacity() < N) + this->grow(N); + } + + T pop_back_val() { + T Result = ::std::move(this->back()); + this->pop_back(); + return Result; + } + + void swap(SmallVectorImpl &RHS); + + /// Add the specified range to the end of the SmallVector. + template + void append(in_iter in_start, in_iter in_end) { + size_type NumInputs = std::distance(in_start, in_end); + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + this->uninitialized_copy(in_start, in_end, this->end()); + this->setEnd(this->end() + NumInputs); + } + + /// Add the specified range to the end of the SmallVector. 
+ void append(size_type NumInputs, const T &Elt) { + // Grow allocated space if needed. + if (NumInputs > size_type(this->capacity_ptr()-this->end())) + this->grow(this->size()+NumInputs); + + // Copy the new elements over. + std::uninitialized_fill_n(this->end(), NumInputs, Elt); + this->setEnd(this->end() + NumInputs); + } + + void append(std::initializer_list IL) { + append(IL.begin(), IL.end()); + } + + void assign(size_type NumElts, const T &Elt) { + clear(); + if (this->capacity() < NumElts) + this->grow(NumElts); + this->setEnd(this->begin()+NumElts); + std::uninitialized_fill(this->begin(), this->end(), Elt); + } + + void assign(std::initializer_list IL) { + clear(); + append(IL); + } + + iterator erase(const_iterator CI) { + // Just cast away constness because this is a non-const member function. + iterator I = const_cast(CI); + + //assert(I >= this->begin() && "Iterator to erase is out of bounds."); + //assert(I < this->end() && "Erasing at past-the-end iterator."); + + iterator N = I; + // Shift all elts down one. + std::move(I+1, this->end(), I); + // Drop the last elt. + this->pop_back(); + return(N); + } + + iterator erase(const_iterator CS, const_iterator CE) { + // Just cast away constness because this is a non-const member function. + iterator S = const_cast(CS); + iterator E = const_cast(CE); + + //assert(S >= this->begin() && "Range to erase is out of bounds."); + //assert(S <= E && "Trying to erase invalid range."); + //assert(E <= this->end() && "Trying to erase past the end."); + + iterator N = S; + // Shift all elts down. + iterator I = std::move(E, this->end(), S); + // Drop the last elts. + this->destroy_range(I, this->end()); + this->setEnd(I); + return(N); + } + + iterator insert(iterator I, T &&Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(::std::move(Elt)); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + + ::new ((void*) this->end()) T(::std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. + T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = ::std::move(*EltPtr); + return I; + } + + iterator insert(iterator I, const T &Elt) { + if (I == this->end()) { // Important special case for empty vector. + this->push_back(Elt); + return this->end()-1; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + if (this->EndX >= this->CapacityX) { + size_t EltNo = I-this->begin(); + this->grow(); + I = this->begin()+EltNo; + } + ::new ((void*) this->end()) T(std::move(this->back())); + // Push everything else over. + std::move_backward(I, this->end()-1, this->end()); + this->setEnd(this->end()+1); + + // If we just moved the element we're inserting, be sure to update + // the reference. 
+ const T *EltPtr = &Elt; + if (I <= EltPtr && EltPtr < this->EndX) + ++EltPtr; + + *I = *EltPtr; + return I; + } + + iterator insert(iterator I, size_type NumToInsert, const T &Elt) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(NumToInsert, Elt); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::fill_n(I, NumToInsert, Elt); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. + std::fill_n(I, NumOverwritten, Elt); + + // Insert the non-overwritten middle part. + std::uninitialized_fill_n(OldEnd, NumToInsert-NumOverwritten, Elt); + return I; + } + + template + iterator insert(iterator I, ItTy From, ItTy To) { + // Convert iterator to elt# to avoid invalidating iterator when we reserve() + size_t InsertElt = I - this->begin(); + + if (I == this->end()) { // Important special case for empty vector. + append(From, To); + return this->begin()+InsertElt; + } + + //assert(I >= this->begin() && "Insertion iterator is out of bounds."); + //assert(I <= this->end() && "Inserting past the end of the vector."); + + size_t NumToInsert = std::distance(From, To); + + // Ensure there is enough space. + reserve(this->size() + NumToInsert); + + // Uninvalidate the iterator. + I = this->begin()+InsertElt; + + // If there are more elements between the insertion point and the end of the + // range than there are being inserted, we can use a simple approach to + // insertion. Since we already reserved space, we know that this won't + // reallocate the vector. + if (size_t(this->end()-I) >= NumToInsert) { + T *OldEnd = this->end(); + append(std::move_iterator(this->end() - NumToInsert), + std::move_iterator(this->end())); + + // Copy the existing elements that get replaced. + std::move_backward(I, OldEnd-NumToInsert, OldEnd); + + std::copy(From, To, I); + return I; + } + + // Otherwise, we're inserting more elements than exist already, and we're + // not inserting at the end. + + // Move over the elements that we're about to overwrite. + T *OldEnd = this->end(); + this->setEnd(this->end() + NumToInsert); + size_t NumOverwritten = OldEnd-I; + this->uninitialized_move(I, OldEnd, this->end()-NumOverwritten); + + // Replace the overwritten part. 
+ for (T *J = I; NumOverwritten > 0; --NumOverwritten) { + *J = *From; + ++J; ++From; + } + + // Insert the non-overwritten middle part. + this->uninitialized_copy(From, To, OldEnd); + return I; + } + + void insert(iterator I, std::initializer_list IL) { + insert(I, IL.begin(), IL.end()); + } + + template void emplace_back(ArgTypes &&... Args) { + if (TF_UNLIKELY(this->EndX >= this->CapacityX)) + this->grow(); + ::new ((void *)this->end()) T(std::forward(Args)...); + this->setEnd(this->end() + 1); + } + + SmallVectorImpl &operator=(const SmallVectorImpl &RHS); + + SmallVectorImpl &operator=(SmallVectorImpl &&RHS); + + bool operator==(const SmallVectorImpl &RHS) const { + if (this->size() != RHS.size()) return false; + return std::equal(this->begin(), this->end(), RHS.begin()); + } + bool operator!=(const SmallVectorImpl &RHS) const { + return !(*this == RHS); + } + + bool operator<(const SmallVectorImpl &RHS) const { + return std::lexicographical_compare(this->begin(), this->end(), + RHS.begin(), RHS.end()); + } + + /// Set the array size to \p N, which the current array must have enough + /// capacity for. + /// + /// This does not construct or destroy any elements in the vector. + /// + /// Clients can use this in conjunction with capacity() to write past the end + /// of the buffer when they know that more elements are available, and only + /// update the size later. This avoids the cost of value initializing elements + /// which will only be overwritten. + void set_size(size_type N) { + //assert(N <= this->capacity()); + this->setEnd(this->begin() + N); + } +}; + + +template +void SmallVectorImpl::swap(SmallVectorImpl &RHS) { + if (this == &RHS) return; + + // We can only avoid copying elements if neither vector is small. + if (!this->isSmall() && !RHS.isSmall()) { + std::swap(this->BeginX, RHS.BeginX); + std::swap(this->EndX, RHS.EndX); + std::swap(this->CapacityX, RHS.CapacityX); + return; + } + if (RHS.size() > this->capacity()) + this->grow(RHS.size()); + if (this->size() > RHS.capacity()) + RHS.grow(this->size()); + + // Swap the shared elements. + size_t NumShared = this->size(); + if (NumShared > RHS.size()) NumShared = RHS.size(); + for (size_type i = 0; i != NumShared; ++i) + std::swap((*this)[i], RHS[i]); + + // Copy over the extra elts. + if (this->size() > RHS.size()) { + size_t EltDiff = this->size() - RHS.size(); + this->uninitialized_copy(this->begin()+NumShared, this->end(), RHS.end()); + RHS.setEnd(RHS.end()+EltDiff); + this->destroy_range(this->begin()+NumShared, this->end()); + this->setEnd(this->begin()+NumShared); + } else if (RHS.size() > this->size()) { + size_t EltDiff = RHS.size() - this->size(); + this->uninitialized_copy(RHS.begin()+NumShared, RHS.end(), this->end()); + this->setEnd(this->end() + EltDiff); + this->destroy_range(RHS.begin()+NumShared, RHS.end()); + RHS.setEnd(RHS.begin()+NumShared); + } +} + +template +SmallVectorImpl &SmallVectorImpl:: + operator=(const SmallVectorImpl &RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd; + if (RHSSize) + NewEnd = std::copy(RHS.begin(), RHS.begin()+RHSSize, this->begin()); + else + NewEnd = this->begin(); + + // Destroy excess elements. + this->destroy_range(NewEnd, this->end()); + + // Trim. 
+ this->setEnd(NewEnd); + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: don't do this if they're efficiently moveable. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::copy(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Copy construct the new elements in place. + this->uninitialized_copy(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + return *this; +} + +template +SmallVectorImpl &SmallVectorImpl::operator=(SmallVectorImpl &&RHS) { + // Avoid self-assignment. + if (this == &RHS) return *this; + + // If the RHS isn't small, clear this vector and then steal its buffer. + if (!RHS.isSmall()) { + this->destroy_range(this->begin(), this->end()); + if (!this->isSmall()) std::free(this->begin()); + this->BeginX = RHS.BeginX; + this->EndX = RHS.EndX; + this->CapacityX = RHS.CapacityX; + RHS.resetToSmall(); + return *this; + } + + // If we already have sufficient space, assign the common elements, then + // destroy any excess. + size_t RHSSize = RHS.size(); + size_t CurSize = this->size(); + if (CurSize >= RHSSize) { + // Assign common elements. + iterator NewEnd = this->begin(); + if (RHSSize) + NewEnd = std::move(RHS.begin(), RHS.end(), NewEnd); + + // Destroy excess elements and trim the bounds. + this->destroy_range(NewEnd, this->end()); + this->setEnd(NewEnd); + + // Clear the RHS. + RHS.clear(); + + return *this; + } + + // If we have to grow to have enough elements, destroy the current elements. + // This allows us to avoid copying them during the grow. + // FIXME: this may not actually make any sense if we can efficiently move + // elements. + if (this->capacity() < RHSSize) { + // Destroy current elements. + this->destroy_range(this->begin(), this->end()); + this->setEnd(this->begin()); + CurSize = 0; + this->grow(RHSSize); + } else if (CurSize) { + // Otherwise, use assignment for the already-constructed elements. + std::move(RHS.begin(), RHS.begin()+CurSize, this->begin()); + } + + // Move-construct the new elements in place. + this->uninitialized_move(RHS.begin()+CurSize, RHS.end(), + this->begin()+CurSize); + + // Set end. + this->setEnd(this->begin()+RHSSize); + + RHS.clear(); + return *this; +} + +/** +@private +*/ +template +struct SmallVectorStorage { + /** + @private + */ + typename SmallVectorTemplateCommon::U InlineElts[N - 1]; +}; + +/** +@private +*/ +template struct SmallVectorStorage {}; + +/** +@private +*/ +template struct SmallVectorStorage {}; + +/** +@brief class to define a vector optimized for small array + +@tparam T data type +@tparam N threshold of the number of elements in the initial storage + +The class defines a C++ STL-styled vector (a variable-sized array) +optimized for the case when the array is small. +It contains some number of elements in-place, +which allows it to avoid heap allocation when the actual number of +elements is below that threshold. This allows normal @em small cases to be +fast without losing generality for large inputs. +All the methods in [std::vector](https://en.cppreference.com/w/cpp/container/vector) +can apply to this class. + +The class is stripped from the LLVM codebase. 
+*/ +template +class SmallVector : public SmallVectorImpl { + /// Inline space for elements which aren't stored in the base class. + SmallVectorStorage Storage; + +public: + + /** + @brief constructs an empty vector + */ + SmallVector() : SmallVectorImpl(N) { + } + + /** + @brief constructs a vector with @c Size copies of elements with value @c value + */ + explicit SmallVector(size_t Size, const T &Value = T()) + : SmallVectorImpl(N) { + this->assign(Size, Value); + } + + /** + @brief constructs a vector with the contents of the range + [S, E) + */ + template + SmallVector(ItTy S, ItTy E) : SmallVectorImpl(N) { + this->append(S, E); + } + + //template + //explicit SmallVector(const tf::iterator_range &R) + // : SmallVectorImpl(N) { + // this->append(R.begin(), R.end()); + //} + + /** + @brief constructs a vector with the contents of the initializer list @c IL + */ + SmallVector(std::initializer_list IL) : SmallVectorImpl(N) { + this->assign(IL); + } + + /** + @brief constructs the vector with the copy of the contents of @c RHS + */ + SmallVector(const SmallVector &RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(RHS); + } + + /** + @brief constructs the vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVector &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with a copy of the contents of @c RHS + */ + const SmallVector &operator=(const SmallVector &RHS) { + SmallVectorImpl::operator=(RHS); + return *this; + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVector &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief constructs a vector with the contents of @c RHS using move semantics + */ + SmallVector(SmallVectorImpl &&RHS) : SmallVectorImpl(N) { + if (!RHS.empty()) + SmallVectorImpl::operator=(::std::move(RHS)); + } + + /** + @brief replaces the contents with the contents of @c RHS using move semantics + */ + const SmallVector &operator=(SmallVectorImpl &&RHS) { + SmallVectorImpl::operator=(::std::move(RHS)); + return *this; + } + + /** + @brief replaces the contents with the copy of the contents of an initializer list @c IL + */ + const SmallVector &operator=(std::initializer_list IL) { + this->assign(IL); + return *this; + } +}; + +template +static inline size_t capacity_in_bytes(const SmallVector &X) { + return X.capacity_in_bytes(); +} + +} // end tf namespace --------------------------------------------------------- + +namespace std { + /// Implement std::swap in terms of SmallVector swap. + template + inline void + swap(tf::SmallVectorImpl &LHS, tf::SmallVectorImpl &RHS) { + LHS.swap(RHS); + } + + /// Implement std::swap in terms of SmallVector swap. 
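+  /// (this overload matches the concrete tf::SmallVector type and likewise
+  /// forwards to the member swap)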
+ template + inline void + swap(tf::SmallVector &LHS, tf::SmallVector &RHS) { + LHS.swap(RHS); + } +} // end of namespace std ---------------------------------------------------- + + diff --git a/lib/taskflow/utility/stream.hpp b/lib/taskflow/utility/stream.hpp index 6063f8c..320aa6c 100644 --- a/lib/taskflow/utility/stream.hpp +++ b/lib/taskflow/utility/stream.hpp @@ -8,7 +8,7 @@ namespace tf { // Procedure: ostreamize template void ostreamize(std::ostream& os, T&& token) { - os << std::forward(token); + os << std::forward(token); } // Procedure: ostreamize diff --git a/lib/taskflow/utility/traits.hpp b/lib/taskflow/utility/traits.hpp index 58b3839..196b147 100644 --- a/lib/taskflow/utility/traits.hpp +++ b/lib/taskflow/utility/traits.hpp @@ -14,23 +14,20 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include #include #include #include #include -#include +#include #include #include -#include +#include "os.hpp" namespace tf { @@ -38,14 +35,49 @@ namespace tf { // Traits //----------------------------------------------------------------------------- -// Struct: dependent_false -template -struct dependent_false { - static constexpr bool value = false; +//// Struct: dependent_false +//template +//struct dependent_false { +// static constexpr bool value = false; +//}; +// +//template +//constexpr auto dependent_false_v = dependent_false::value; + +template inline constexpr bool dependent_false_v = false; + +// ---------------------------------------------------------------------------- +// is_pod +//----------------------------------------------------------------------------- +template +struct is_pod { + static const bool value = std::is_trivial_v && + std::is_standard_layout_v; }; -template -constexpr auto dependent_false_v = dependent_false::value; +template +constexpr bool is_pod_v = is_pod::value; + +//----------------------------------------------------------------------------- +// NoInit +//----------------------------------------------------------------------------- + +template +struct NoInit { + + //static_assert(is_pod_v, "NoInit only supports POD type"); + + // constructor without initialization + NoInit () noexcept {} + + // implicit conversion T -> NoInit + constexpr NoInit (T value) noexcept : v{value} {} + + // implicit conversion NoInit -> T + constexpr operator T () const noexcept { return v; } + + T v; +}; //----------------------------------------------------------------------------- // Move-On-Copy @@ -59,8 +91,8 @@ struct MoC { MoC(const MoC& other) : object(std::move(other.object)) {} T& get() { return object; } - - mutable T object; + + mutable T object; }; template @@ -72,146 +104,14 @@ auto make_moc(T&& m) { // Visitors. //----------------------------------------------------------------------------- -// Overloadded. -template -struct Visitors : Ts... { - using Ts::operator()... ; -}; - -template -Visitors(Ts...) 
-> Visitors; - -// ---------------------------------------------------------------------------- -// Function Traits -// reference: https://github.com/ros2/rclcpp -// ---------------------------------------------------------------------------- - -template -struct tuple_tail; - -template -struct tuple_tail> { - using type = std::tuple; -}; - -// std::function -template -struct function_traits -{ - using arguments = typename tuple_tail< - typename function_traits::argument_tuple_type - >::type; - - static constexpr size_t arity = std::tuple_size_v; - - template - struct argument { - static_assert(N < arity, "error: invalid parameter index."); - using type = std::tuple_element_t; - }; - - template - using argument_t = typename argument::type; - - using return_type = typename function_traits::return_type; -}; - -// Free functions -template -struct function_traits { - - using return_type = R; - using argument_tuple_type = std::tuple; - - static constexpr size_t arity = sizeof...(Args); - - template - struct argument { - static_assert(N < arity, "error: invalid parameter index."); - using type = std::tuple_element_t>; - }; - - template - using argument_t = typename argument::type; -}; - -// function pointer -template -struct function_traits : function_traits { -}; - -// function reference -template -struct function_traits : function_traits { -}; - -// immutable lambda -template -struct function_traits - : function_traits -{}; - -// mutable lambda -template -struct function_traits - : function_traits -{}; - -/*// std::bind for object methods -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined _GLIBCXX_RELEASE // glibc++ (GNU C++ >= 7.1) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits(FArgs ...)>> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits< - std::_Binder> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; - -// std::bind for object const methods -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined _GLIBCXX_RELEASE // glibc++ (GNU C++ >= 7.1) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits(FArgs ...)>> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits< - std::_Binder> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; - -// std::bind for free functions -template -#if defined _LIBCPP_VERSION // libc++ (Clang) -struct function_traits> -#elif defined __GLIBCXX__ // glibc++ (GNU C++) -struct function_traits> -#elif defined _MSC_VER // MS Visual Studio -struct function_traits> -#else -#error "Unsupported C++ compiler / standard library" -#endif - : function_traits -{}; */ - -// decay to the raw type -template -struct function_traits : function_traits {}; - -template -struct function_traits : function_traits {}; - +//// Overloadded. +//template +//struct Visitors : Ts... { +// using Ts::operator()... ; +//}; +// +//template +//Visitors(Ts...) 
-> Visitors; // ---------------------------------------------------------------------------- // std::variant @@ -219,51 +119,21 @@ struct function_traits : function_traits {}; template struct get_index; -template +template struct get_index_impl {}; -template +template struct get_index_impl : std::integral_constant{}; -template +template struct get_index_impl : get_index_impl{}; -template +template struct get_index> : get_index_impl<0, T, Ts...>{}; template constexpr auto get_index_v = get_index::value; -// ---------------------------------------------------------------------------- -// is_pod -//----------------------------------------------------------------------------- -template -struct is_pod { - static const bool value = std::is_trivial_v && - std::is_standard_layout_v; -}; - -template -constexpr bool is_pod_v = is_pod::value; - -// ---------------------------------------------------------------------------- -// bit_cast -//----------------------------------------------------------------------------- -template -typename std::enable_if< - (sizeof(To) == sizeof(From)) && - std::is_trivially_copyable_v && - std::is_trivial_v, - // this implementation requires that To is trivially default constructible - To ->::type -// constexpr support needs compiler magic -bit_cast(const From &src) noexcept { - To dst; - std::memcpy(&dst, &src, sizeof(To)); - return dst; -} - // ---------------------------------------------------------------------------- // unwrap_reference // ---------------------------------------------------------------------------- @@ -293,7 +163,7 @@ struct stateful_iterator { using TB = std::decay_t>; using TE = std::decay_t>; - + static_assert(std::is_same_v, "decayed iterator types must match"); using type = TB; @@ -313,11 +183,11 @@ struct stateful_index { static_assert( std::is_integral_v, "decayed beg index must be an integral type" ); - + static_assert( std::is_integral_v, "decayed end index must be an integral type" ); - + static_assert( std::is_integral_v, "decayed step must be an integral type" ); @@ -333,6 +203,95 @@ struct stateful_index { template using stateful_index_t = typename stateful_index::type; +// ---------------------------------------------------------------------------- +// visit a tuple with a functor at runtime +// ---------------------------------------------------------------------------- + +template +void visit_tuple(Func func, Tuple& tup, size_t idx) { + if (N == idx) { + std::invoke(func, std::get(tup)); + return; + } + if constexpr (N + 1 < std::tuple_size_v) { + return visit_tuple(func, tup, idx); + } +} + +// ---------------------------------------------------------------------------- +// unroll loop +// ---------------------------------------------------------------------------- + +// Template unrolled looping construct. +template +struct Unroll { + template + static void eval(F f) { + f(beg); + Unroll::eval(f); + } +}; + +template +struct Unroll { + template + static void eval(F) { } +}; + +template +void unroll(F f) { + Unroll::eval(f); +} + +// ---------------------------------------------------------------------------- +// make types of variant unique +// ---------------------------------------------------------------------------- + +template +struct filter_duplicates { using type = T; }; + +template