From 7c227cd871ce9e8a5690db93b8d8b46d6e973d51 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Wed, 12 Jun 2019 18:44:27 +0200 Subject: [PATCH] Unify interface of algorithms. Now all algorithms are used without the parallel pre-/suffix and the for_each method has a specialization for integer ranges (for_each_range). --- app/benchmark_fft/main.cpp | 2 +- app/benchmark_matrix/main.cpp | 4 ++-- app/benchmark_prefix/main.cpp | 2 +- app/benchmark_unbalanced/main.cpp | 12 +++++++----- app/invoke_parallel/main.cpp | 2 +- app/playground/main.cpp | 4 +++- lib/pls/CMakeLists.txt | 12 ++++++------ lib/pls/include/pls/algorithms/for_each.h | 18 ++++++++++++++++++ lib/pls/include/pls/algorithms/for_each_impl.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/algorithms/invoke.h | 23 +++++++++++++++++++++++ lib/pls/include/pls/algorithms/invoke_impl.h | 40 ++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/algorithms/invoke_parallel.h | 23 ----------------------- lib/pls/include/pls/algorithms/invoke_parallel_impl.h | 40 ---------------------------------------- lib/pls/include/pls/algorithms/parallel_for.h | 24 ------------------------ lib/pls/include/pls/algorithms/parallel_for_impl.h | 43 ------------------------------------------- lib/pls/include/pls/algorithms/parallel_scan.h | 15 --------------- lib/pls/include/pls/algorithms/parallel_scan_impl.h | 79 ------------------------------------------------------------------------------- lib/pls/include/pls/algorithms/scan.h | 15 +++++++++++++++ lib/pls/include/pls/algorithms/scan_impl.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/pls.h | 12 ++++++------ 20 files changed, 262 insertions(+), 247 deletions(-) create mode 100644 lib/pls/include/pls/algorithms/for_each.h create mode 100644 lib/pls/include/pls/algorithms/for_each_impl.h create mode 100644 lib/pls/include/pls/algorithms/invoke.h create mode 100644 
lib/pls/include/pls/algorithms/invoke_impl.h delete mode 100644 lib/pls/include/pls/algorithms/invoke_parallel.h delete mode 100644 lib/pls/include/pls/algorithms/invoke_parallel_impl.h delete mode 100644 lib/pls/include/pls/algorithms/parallel_for.h delete mode 100644 lib/pls/include/pls/algorithms/parallel_for_impl.h delete mode 100644 lib/pls/include/pls/algorithms/parallel_scan.h delete mode 100644 lib/pls/include/pls/algorithms/parallel_scan_impl.h create mode 100644 lib/pls/include/pls/algorithms/scan.h create mode 100644 lib/pls/include/pls/algorithms/scan_impl.h diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp index cbb8445..aa21658 100644 --- a/app/benchmark_fft/main.cpp +++ b/app/benchmark_fft/main.cpp @@ -49,7 +49,7 @@ void fft(complex_vector::iterator data, int n) { fft(data, n / 2); fft(data + n / 2, n / 2); } else { - pls::invoke_parallel( + pls::invoke( [&] { fft(data, n / 2); }, [&] { fft(data + n / 2, n / 2); } ); diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp index d536e6c..327cfca 100644 --- a/app/benchmark_matrix/main.cpp +++ b/app/benchmark_matrix/main.cpp @@ -9,12 +9,12 @@ class matrix { public: T data[SIZE][SIZE]; - matrix(T i = 1) { + explicit matrix(T i = 1) { std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i); } void multiply(const matrix &a, const matrix &b) { - pls::algorithm::parallel_for(0, SIZE, [&](int i) { + pls::algorithm::for_each_range(0, SIZE, [&](int i) { this->multiply_column(i, a, b); }); } diff --git a/app/benchmark_prefix/main.cpp b/app/benchmark_prefix/main.cpp index 3c9c8a0..4667c1d 100644 --- a/app/benchmark_prefix/main.cpp +++ b/app/benchmark_prefix/main.cpp @@ -18,7 +18,7 @@ int main() { } pls::internal::helpers::run_mini_benchmark([&] { - pls::parallel_scan(vec.begin(), vec.end(), out.begin(), std::plus(), 0.0); + pls::scan(vec.begin(), vec.end(), out.begin(), std::plus(), 0.0); }, 8, 1000); PROFILE_SAVE("test_profile.prof") diff --git 
a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp index e3e29d4..75f5daa 100644 --- a/app/benchmark_unbalanced/main.cpp +++ b/app/benchmark_unbalanced/main.cpp @@ -22,9 +22,11 @@ int count_child_nodes(uts::node &node) { std::vector results(children.size()); for (size_t i = 0; i < children.size(); i++) { size_t index = i; - auto lambda = [&, index] { results[index] = count_child_nodes(children[index]); }; - pls::lambda_task_by_value sub_task(lambda); - pls::scheduler::spawn_child(sub_task); + auto lambda = [&, index] { + results[index] = count_child_nodes(children[index]); + }; + using child_type = pls::lambda_task_by_value; + pls::scheduler::spawn_child(lambda); } pls::scheduler::wait_for_all(); for (auto result : results) { @@ -41,8 +43,8 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi uts::node root(seed, root_children, q, normal_children); result = count_child_nodes(root); }; - pls::lambda_task_by_reference sub_task(lambda); - pls::scheduler::spawn_child(sub_task); + using child_type = pls::lambda_task_by_reference; + pls::scheduler::spawn_child(lambda); pls::scheduler::wait_for_all(); return result; diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp index 4382168..903f704 100644 --- a/app/invoke_parallel/main.cpp +++ b/app/invoke_parallel/main.cpp @@ -54,7 +54,7 @@ void fft(complex_vector::iterator data, int n) { fft(data, n / 2); fft(data + n / 2, n / 2); } else { - pls::invoke_parallel( + pls::invoke( [&] { fft(data, n / 2); }, [&] { fft(data + n / 2, n / 2); } ); diff --git a/app/playground/main.cpp b/app/playground/main.cpp index cde3abd..65be9ea 100644 --- a/app/playground/main.cpp +++ b/app/playground/main.cpp @@ -10,5 +10,7 @@ #include int main() { - + auto range = boost::irange(0, 10); + bool test = std::is_integral::type>::value; + std::cout << test << std::endl; } diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index f054b45..050054d 100644 --- 
a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -2,12 +2,12 @@ add_library(pls STATIC include/pls/pls.h src/pls.cpp - include/pls/algorithms/invoke_parallel.h - include/pls/algorithms/invoke_parallel_impl.h - include/pls/algorithms/parallel_for.h - include/pls/algorithms/parallel_for_impl.h - include/pls/algorithms/parallel_scan.h - include/pls/algorithms/parallel_scan_impl.h + include/pls/algorithms/invoke.h + include/pls/algorithms/invoke_impl.h + include/pls/algorithms/for_each.h + include/pls/algorithms/for_each_impl.h + include/pls/algorithms/scan.h + include/pls/algorithms/scan_impl.h include/pls/internal/base/spin_lock.h include/pls/internal/base/tas_spin_lock.h src/internal/base/tas_spin_lock.cpp diff --git a/lib/pls/include/pls/algorithms/for_each.h b/lib/pls/include/pls/algorithms/for_each.h new file mode 100644 index 0000000..9d0ed6a --- /dev/null +++ b/lib/pls/include/pls/algorithms/for_each.h @@ -0,0 +1,18 @@ + +#ifndef PLS_PARALLEL_FOR_H +#define PLS_PARALLEL_FOR_H + +namespace pls { +namespace algorithm { + +template +void for_each_range(size_t first, size_t last, const Function &function); + +template +void for_each(RandomIt first, RandomIt last, const Function &function); + +} +} +#include "for_each_impl.h" + +#endif //PLS_PARALLEL_FOR_H diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h new file mode 100644 index 0000000..41c90c3 --- /dev/null +++ b/lib/pls/include/pls/algorithms/for_each_impl.h @@ -0,0 +1,60 @@ + +#ifndef PLS_PARALLEL_FOR_IMPL_H +#define PLS_PARALLEL_FOR_IMPL_H + +#include "pls/internal/scheduling/task.h" +#include "pls/internal/scheduling/scheduler.h" + +#include "pls/internal/helpers/unique_id.h" + +// TODO: Replace with own integer iterator to remove dependency +#include + +namespace pls { +namespace algorithm { +namespace internal { + +template +void for_each(RandomIt first, RandomIt last, const Function &function) { + using namespace 
::pls::internal::scheduling; + constexpr long min_elements = 4; + + long num_elements = std::distance(first, last); + if (num_elements <= min_elements) { + // calculate last elements in loop to avoid overhead + for (auto current = first; current != last; current++) { + function(*current); + } + } else { + // Cut in half recursively + long middle_index = num_elements / 2; + + auto second_half_body = + [first, middle_index, last, &function] { internal::for_each(first + middle_index, last, function); }; + using second_half_t = lambda_task_by_reference; + scheduler::spawn_child(std::move(second_half_body)); + + auto first_half_body = + [first, middle_index, last, &function] { internal::for_each(first, first + middle_index, function); }; + using first_half_t = lambda_task_by_reference; + scheduler::spawn_child_and_wait(std::move(first_half_body)); + } +} + +} + +template +void for_each_range(size_t first, size_t last, const Function &function) { + auto range = boost::irange(first, last); + internal::for_each(range.begin(), range.end(), function); +} + +template +void for_each(RandomIt first, RandomIt last, const Function &function) { + internal::for_each(first, last, function); +} + +} +} + +#endif //PLS_INVOKE_PARALLEL_IMPL_H diff --git a/lib/pls/include/pls/algorithms/invoke.h b/lib/pls/include/pls/algorithms/invoke.h new file mode 100644 index 0000000..3197a97 --- /dev/null +++ b/lib/pls/include/pls/algorithms/invoke.h @@ -0,0 +1,23 @@ + +#ifndef PLS_PARALLEL_INVOKE_H +#define PLS_PARALLEL_INVOKE_H + +#include "pls/internal/scheduling/task.h" +#include "pls/internal/scheduling/scheduler.h" + +namespace pls { +namespace algorithm { + +template +void invoke(const Function1 &function1, const Function2 &function2); + +template +void invoke(const Function1 &function1, const Function2 &function2, const Function3 &function3); + +// ...and so on, add more if we decide to keep this design + +} +} +#include "invoke_impl.h" + +#endif //PLS_PARALLEL_INVOKE_H diff --git 
a/lib/pls/include/pls/algorithms/invoke_impl.h b/lib/pls/include/pls/algorithms/invoke_impl.h new file mode 100644 index 0000000..fe64cd7 --- /dev/null +++ b/lib/pls/include/pls/algorithms/invoke_impl.h @@ -0,0 +1,40 @@ + +#ifndef PLS_INVOKE_PARALLEL_IMPL_H +#define PLS_INVOKE_PARALLEL_IMPL_H + +#include "pls/internal/scheduling/task.h" +#include "pls/internal/scheduling/lambda_task.h" +#include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/thread_state.h" + +namespace pls { +namespace algorithm { + +template +void invoke(Function1 &&function1, Function2 &&function2) { + using namespace ::pls::internal::scheduling; + + using task_1_t = lambda_task_by_value; + using task_2_t = lambda_task_by_value; + + scheduler::spawn_child(std::forward(function2)); + scheduler::spawn_child_and_wait(std::forward(function1)); +} + +template +void invoke(Function1 &&function1, Function2 &&function2, Function3 &&function3) { + using namespace ::pls::internal::scheduling; + + using task_1_t = lambda_task_by_value; + using task_2_t = lambda_task_by_value; + using task_3_t = lambda_task_by_value; + + scheduler::spawn_child(std::forward(function3)); + scheduler::spawn_child(std::forward(function2)); + scheduler::spawn_child_and_wait(std::forward(function1)); +} + +} +} + +#endif //PLS_INVOKE_PARALLEL_IMPL_H diff --git a/lib/pls/include/pls/algorithms/invoke_parallel.h b/lib/pls/include/pls/algorithms/invoke_parallel.h deleted file mode 100644 index e311a71..0000000 --- a/lib/pls/include/pls/algorithms/invoke_parallel.h +++ /dev/null @@ -1,23 +0,0 @@ - -#ifndef PLS_PARALLEL_INVOKE_H -#define PLS_PARALLEL_INVOKE_H - -#include "pls/internal/scheduling/task.h" -#include "pls/internal/scheduling/scheduler.h" - -namespace pls { -namespace algorithm { - -template -void invoke_parallel(const Function1 &function1, const Function2 &function2); - -template -void invoke_parallel(const Function1 &function1, const Function2 &function2, const Function3 &function3); - -// 
...and so on, add more if we decide to keep this design - -} -} -#include "invoke_parallel_impl.h" - -#endif //PLS_PARALLEL_INVOKE_H diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h deleted file mode 100644 index a25336e..0000000 --- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h +++ /dev/null @@ -1,40 +0,0 @@ - -#ifndef PLS_INVOKE_PARALLEL_IMPL_H -#define PLS_INVOKE_PARALLEL_IMPL_H - -#include "pls/internal/scheduling/task.h" -#include "pls/internal/scheduling/lambda_task.h" -#include "pls/internal/scheduling/scheduler.h" -#include "pls/internal/scheduling/thread_state.h" - -namespace pls { -namespace algorithm { - -template -void invoke_parallel(Function1 &&function1, Function2 &&function2) { - using namespace ::pls::internal::scheduling; - - using task_1_t = lambda_task_by_value; - using task_2_t = lambda_task_by_value; - - scheduler::spawn_child(std::forward(function2)); - scheduler::spawn_child_and_wait(std::forward(function1)); -} - -template -void invoke_parallel(Function1 &&function1, Function2 &&function2, Function3 &&function3) { - using namespace ::pls::internal::scheduling; - - using task_1_t = lambda_task_by_value; - using task_2_t = lambda_task_by_value; - using task_3_t = lambda_task_by_value; - - scheduler::spawn_child(std::forward(function3)); - scheduler::spawn_child(std::forward(function2)); - scheduler::spawn_child_and_wait(std::forward(function1)); -} - -} -} - -#endif //PLS_INVOKE_PARALLEL_IMPL_H diff --git a/lib/pls/include/pls/algorithms/parallel_for.h b/lib/pls/include/pls/algorithms/parallel_for.h deleted file mode 100644 index 6860863..0000000 --- a/lib/pls/include/pls/algorithms/parallel_for.h +++ /dev/null @@ -1,24 +0,0 @@ - -#ifndef PLS_PARALLEL_FOR_H -#define PLS_PARALLEL_FOR_H - -// TODO: Replace with own integer iterator to remove dependency -#include - -namespace pls { -namespace algorithm { - -template -void parallel_for(RandomIt first, RandomIt 
last, const Function &function); - -template -void parallel_for(size_t first, size_t last, const Function &function) { - auto range = boost::irange(first, last); - parallel_for(range.begin(), range.end(), function); -} - -} -} -#include "parallel_for_impl.h" - -#endif //PLS_PARALLEL_FOR_H diff --git a/lib/pls/include/pls/algorithms/parallel_for_impl.h b/lib/pls/include/pls/algorithms/parallel_for_impl.h deleted file mode 100644 index 215f597..0000000 --- a/lib/pls/include/pls/algorithms/parallel_for_impl.h +++ /dev/null @@ -1,43 +0,0 @@ - -#ifndef PLS_PARALLEL_FOR_IMPL_H -#define PLS_PARALLEL_FOR_IMPL_H - -#include "pls/internal/scheduling/task.h" -#include "pls/internal/scheduling/scheduler.h" - -#include "pls/internal/helpers/unique_id.h" - -namespace pls { -namespace algorithm { - -template -void parallel_for(RandomIt first, RandomIt last, const Function &function) { - using namespace ::pls::internal::scheduling; - constexpr long min_elements = 4; - - long num_elements = std::distance(first, last); - if (num_elements <= min_elements) { - // calculate last elements in loop to avoid overhead - for (auto current = first; current != last; current++) { - function(*current); - } - } else { - // Cut in half recursively - long middle_index = num_elements / 2; - - auto second_half_body = - [first, middle_index, last, &function] { parallel_for(first + middle_index, last, function); }; - using second_half_t = lambda_task_by_reference; - scheduler::spawn_child(std::move(second_half_body)); - - auto first_half_body = - [first, middle_index, last, &function] { parallel_for(first, first + middle_index, function); }; - using first_half_t = lambda_task_by_reference; - scheduler::spawn_child_and_wait(std::move(first_half_body)); - } -} - -} -} - -#endif //PLS_INVOKE_PARALLEL_IMPL_H diff --git a/lib/pls/include/pls/algorithms/parallel_scan.h b/lib/pls/include/pls/algorithms/parallel_scan.h deleted file mode 100644 index 9149371..0000000 --- 
a/lib/pls/include/pls/algorithms/parallel_scan.h +++ /dev/null @@ -1,15 +0,0 @@ - -#ifndef PLS_PARALLEL_SCAN_H_ -#define PLS_PARALLEL_SCAN_H_ - -namespace pls { -namespace algorithm { - -template -void parallel_scan(InIter in_start, const InIter in_end, OutIter out, BinaryOp op, Type neutral_elem); - -} -} -#include "parallel_scan_impl.h" - -#endif //PLS_PARALLEL_SCAN_H_ diff --git a/lib/pls/include/pls/algorithms/parallel_scan_impl.h b/lib/pls/include/pls/algorithms/parallel_scan_impl.h deleted file mode 100644 index 3afd8bf..0000000 --- a/lib/pls/include/pls/algorithms/parallel_scan_impl.h +++ /dev/null @@ -1,79 +0,0 @@ - -#ifndef PLS_PARALLEL_SCAN_IMPL_H_ -#define PLS_PARALLEL_SCAN_IMPL_H_ - -#include -#include - -#include "pls/pls.h" -#include "pls/internal/scheduling/thread_state.h" - -namespace pls { -namespace algorithm { -namespace internal { -template -void serial_scan(InIter input_start, const InIter input_end, OutIter output, BinaryOp op, Type neutral_element) { - auto current_input = input_start; - auto current_output = output; - auto last_value = neutral_element; - while (current_input != input_end) { - last_value = op(last_value, *current_input); - *current_output = last_value; - - current_input++; - current_output++; - } -} - -} - -template -void parallel_scan(InIter in_start, const InIter in_end, OutIter out, BinaryOp op, Type neutral_elem) { - constexpr auto chunks_per_thread = 4; - using namespace pls::internal::scheduling; - - // TODO: This must be dynamic to make sense, as it has a far bigger influence than any other cutoff. - // The current strategy is static partitioning, and suboptimal in inballanced workloads. 
- auto size = std::distance(in_start, in_end); - auto num_threads = thread_state::get()->scheduler_->num_threads(); - auto chunks = num_threads * chunks_per_thread; - auto items_per_chunk = std::max(1l, size / chunks); - - scheduler::allocate_on_stack(sizeof(Type) * (chunks), [&](void *memory) { - Type *chunk_sums = reinterpret_cast(memory); - - // First Pass = calculate each chunks individual prefix sum - parallel_for(0, chunks, [&](int i) { - auto chunk_start = in_start + items_per_chunk * i; - auto chunk_end = std::min(in_end, chunk_start + items_per_chunk); - auto chunk_output = out + items_per_chunk * i; - - internal::serial_scan(chunk_start, chunk_end, chunk_output, op, neutral_elem); - chunk_sums[i] = *(out + std::distance(chunk_start, chunk_end) - 1); - }); - - // Calculate prefix sums of each chunks sum - // (effectively the prefix sum at the end of each chunk, then used to correct the following chunk). - internal::serial_scan(chunk_sums, chunk_sums + chunks, chunk_sums, std::plus(), 0); - - // Second Pass = Use results from first pass to correct each chunks sum - auto output_start = out; - auto output_end = out + size; - parallel_for(1, chunks, [&](int i) { - auto chunk_start = output_start + items_per_chunk * i; - auto chunk_end = std::min(output_end, chunk_start + items_per_chunk); - - for (; chunk_start != chunk_end; chunk_start++) { - *chunk_start = op(*chunk_start, chunk_sums[i - 1]); - } - }); - }); - - // End this work section by cleaning up stack and tasks - scheduler::wait_for_all(); -} - -} -} - -#endif //PLS_PARALLEL_SCAN_IMPL_H_ diff --git a/lib/pls/include/pls/algorithms/scan.h b/lib/pls/include/pls/algorithms/scan.h new file mode 100644 index 0000000..1db358b --- /dev/null +++ b/lib/pls/include/pls/algorithms/scan.h @@ -0,0 +1,15 @@ + +#ifndef PLS_PARALLEL_SCAN_H_ +#define PLS_PARALLEL_SCAN_H_ + +namespace pls { +namespace algorithm { + +template +void scan(InIter in_start, const InIter in_end, OutIter out, BinaryOp op, Type neutral_elem); + 
+} +} +#include "scan_impl.h" + +#endif //PLS_PARALLEL_SCAN_H_ diff --git a/lib/pls/include/pls/algorithms/scan_impl.h b/lib/pls/include/pls/algorithms/scan_impl.h new file mode 100644 index 0000000..007198a --- /dev/null +++ b/lib/pls/include/pls/algorithms/scan_impl.h @@ -0,0 +1,79 @@ + +#ifndef PLS_PARALLEL_SCAN_IMPL_H_ +#define PLS_PARALLEL_SCAN_IMPL_H_ + +#include +#include + +#include "pls/pls.h" +#include "pls/internal/scheduling/thread_state.h" + +namespace pls { +namespace algorithm { +namespace internal { +template +void serial_scan(InIter input_start, const InIter input_end, OutIter output, BinaryOp op, Type neutral_element) { + auto current_input = input_start; + auto current_output = output; + auto last_value = neutral_element; + while (current_input != input_end) { + last_value = op(last_value, *current_input); + *current_output = last_value; + + current_input++; + current_output++; + } +} + +} + +template +void scan(InIter in_start, const InIter in_end, OutIter out, BinaryOp op, Type neutral_elem) { + constexpr auto chunks_per_thread = 4; + using namespace pls::internal::scheduling; + + // TODO: This must be dynamic to make sense, as it has a far bigger influence than any other cutoff. + // The current strategy is static partitioning, and suboptimal in imbalanced workloads. 
+ auto size = std::distance(in_start, in_end); + auto num_threads = thread_state::get()->scheduler_->num_threads(); + auto chunks = num_threads * chunks_per_thread; + auto items_per_chunk = std::max(1l, size / chunks); + + scheduler::allocate_on_stack(sizeof(Type) * (chunks), [&](void *memory) { + Type *chunk_sums = reinterpret_cast(memory); + + // First Pass = calculate each chunks individual prefix sum + for_each_range(0, chunks, [&](int i) { + auto chunk_start = in_start + items_per_chunk * i; + auto chunk_end = std::min(in_end, chunk_start + items_per_chunk); + auto chunk_output = out + items_per_chunk * i; + + internal::serial_scan(chunk_start, chunk_end, chunk_output, op, neutral_elem); + chunk_sums[i] = *(out + std::distance(chunk_start, chunk_end) - 1); + }); + + // Calculate prefix sums of each chunks sum + // (effectively the prefix sum at the end of each chunk, then used to correct the following chunk). + internal::serial_scan(chunk_sums, chunk_sums + chunks, chunk_sums, std::plus(), 0); + + // Second Pass = Use results from first pass to correct each chunks sum + auto output_start = out; + auto output_end = out + size; + for_each_range(1, chunks, [&](int i) { + auto chunk_start = output_start + items_per_chunk * i; + auto chunk_end = std::min(output_end, chunk_start + items_per_chunk); + + for (; chunk_start != chunk_end; chunk_start++) { + *chunk_start = op(*chunk_start, chunk_sums[i - 1]); + } + }); + }); + + // End this work section by cleaning up stack and tasks + scheduler::wait_for_all(); +} + +} +} + +#endif //PLS_PARALLEL_SCAN_IMPL_H_ diff --git a/lib/pls/include/pls/pls.h b/lib/pls/include/pls/pls.h index c6aa33f..9308710 100644 --- a/lib/pls/include/pls/pls.h +++ b/lib/pls/include/pls/pls.h @@ -1,9 +1,9 @@ #ifndef PLS_LIBRARY_H #define PLS_LIBRARY_H -#include "pls/algorithms/invoke_parallel.h" -#include "pls/algorithms/parallel_for.h" -#include "pls/algorithms/parallel_scan.h" +#include "pls/algorithms/invoke.h" +#include 
"pls/algorithms/for_each.h" +#include "pls/algorithms/scan.h" #include "pls/internal/scheduling/task.h" #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/helpers/unique_id.h" @@ -22,9 +22,9 @@ using internal::scheduling::lambda_task_by_reference; using internal::scheduling::lambda_task_by_value; using internal::scheduling::task; -using algorithm::invoke_parallel; -using algorithm::parallel_for; -using algorithm::parallel_scan; +using algorithm::invoke; +using algorithm::for_each; +using algorithm::scan; } #endif -- libgit2 0.26.0