From 9fa9296a2e99c2eaf09831e8663c7f6d7c4996b1 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Sun, 28 Jun 2020 20:06:09 +0200 Subject: [PATCH] Final version ready for benchmarking. --- CMakeLists.txt | 3 +++ app/benchmark_fft/CMakeLists.txt | 2 ++ app/benchmark_fib/CMakeLists.txt | 2 ++ app/benchmark_matrix/CMakeLists.txt | 2 ++ app/benchmark_matrix/main.cpp | 11 +++++++++++ app/benchmark_matrix_div_conquer/CMakeLists.txt | 2 ++ app/benchmark_matrix_div_conquer/main.cpp | 10 +++------- app/benchmark_unbalanced/CMakeLists.txt | 2 ++ app/benchmark_unbalanced/main.cpp | 2 +- extern/benchmark_base/CMakeLists.txt | 2 +- extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h | 71 +++++++++++++++++++++++------------------------------------------------ extern/benchmark_base/src/matrix_div_conquer.cpp | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ extern/benchmark_runner/benchmark_runner.h | 2 +- lib/pls/CMakeLists.txt | 2 +- lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h | 5 ----- lib/pls/include/pls/algorithms/for_each.h | 8 ++++---- lib/pls/include/pls/algorithms/for_each_impl.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- lib/pls/include/pls/internal/base/error_handling.h | 5 +++++ lib/pls/include/pls/internal/base/stack_allocator.h | 1 + lib/pls/include/pls/internal/scheduling/base_task.h | 9 +++++---- lib/pls/include/pls/internal/scheduling/lock_free/task.h | 4 +--- lib/pls/include/pls/internal/scheduling/scheduler_impl.h | 22 ++++++++++++++++++---- lib/pls/include/pls/pls.h | 3 +++ lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp | 38 ++++++++++++++++++++++---------------- lib/pls/src/internal/scheduling/lock_free/task.cpp | 32 ++++++++++++-------------------- lib/pls/src/internal/scheduling/lock_free/task_manager.cpp | 9 +++++---- lib/pls/src/internal/scheduling/scheduler.cpp | 15 ++++++++------- 27 files changed, 255 insertions(+), 149 deletions(-) create mode 100644 extern/benchmark_base/src/matrix_div_conquer.cpp delete mode 100644 lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 215aa65..12cb45e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,9 @@ ADD_CUSTOM_TARGET(install.pls -P ${CMAKE_BINARY_DIR}/cmake_install.cmake) ADD_DEPENDENCIES(install.pls context_switcher pls) +# ... second custom target to only build the benchmarks. +ADD_CUSTOM_TARGET(benchmark.pls) + # Include examples add_subdirectory(app/playground) add_subdirectory(app/benchmark_fft) diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt index cfef00b..c351434 100644 --- a/app/benchmark_fft/CMakeLists.txt +++ b/app/benchmark_fft/CMakeLists.txt @@ -1,5 +1,7 @@ add_executable(benchmark_fft_pls_v3 main.cpp) target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base) +ADD_DEPENDENCIES(benchmark.pls benchmark_fft_pls_v3) + if (EASY_PROFILER) target_link_libraries(benchmark_fft_pls_v3 easy_profiler) endif () diff --git a/app/benchmark_fib/CMakeLists.txt b/app/benchmark_fib/CMakeLists.txt index 5233f4f..1369f79 100644 --- a/app/benchmark_fib/CMakeLists.txt +++ b/app/benchmark_fib/CMakeLists.txt @@ -1,5 +1,7 @@ add_executable(benchmark_fib_pls_v3 main.cpp) target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base) +ADD_DEPENDENCIES(benchmark.pls benchmark_fib_pls_v3) + if (EASY_PROFILER) target_link_libraries(benchmark_fib_pls_v3 easy_profiler) endif () diff --git a/app/benchmark_matrix/CMakeLists.txt b/app/benchmark_matrix/CMakeLists.txt index fe40f88..e2a6619 100644 --- a/app/benchmark_matrix/CMakeLists.txt +++ b/app/benchmark_matrix/CMakeLists.txt @@ -1,5 +1,7 @@ add_executable(benchmark_matrix_pls_v3 main.cpp) target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base) +ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_pls_v3) + if (EASY_PROFILER) target_link_libraries(benchmark_matrix_pls_v3 easy_profiler) endif () diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp index 4ff4075..aa4348b 100644 --- a/app/benchmark_matrix/main.cpp +++ b/app/benchmark_matrix/main.cpp @@ -37,6 +37,12 @@ int main(int argc, char **argv) { pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif + printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); @@ -45,6 +51,11 @@ int main(int argc, char **argv) { scheduler.perform_work([&]() { result.multiply(a, b); }); + }, [&]() {}, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif }); runner.commit_results(true); } else { diff --git a/app/benchmark_matrix_div_conquer/CMakeLists.txt b/app/benchmark_matrix_div_conquer/CMakeLists.txt index 4d510c0..a8ed2ce 100644 --- a/app/benchmark_matrix_div_conquer/CMakeLists.txt +++ b/app/benchmark_matrix_div_conquer/CMakeLists.txt @@ -1,5 +1,7 @@ add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp) target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base) +ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_div_conquer_pls_v3) + if (EASY_PROFILER) target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler) endif () diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp index 050090f..0175d0f 100644 --- a/app/benchmark_matrix_div_conquer/main.cpp +++ b/app/benchmark_matrix_div_conquer/main.cpp @@ -92,12 +92,10 @@ void multiply_div_conquer(const std::vector>>> div_conquer_temp_arrays; @@ -138,7 +134,7 @@ int main(int argc, char **argv) { while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) { auto &depth_buffers = div_conquer_temp_arrays.emplace_back(); buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_); - for (int thread_id = 0; thread_id < buffers_needed; thread_id++) { + for (size_t thread_id = 0; thread_id < buffers_needed; thread_id++) { auto &depth_thread_buffers = depth_buffers.emplace_back(); for (int i = 0; i < 8; i++) { size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2); @@ -159,7 +155,7 @@ int main(int argc, char **argv) { string full_directory = settings.output_directory_ + "/PLS_v3/"; benchmark_runner runner{full_directory, test_name}; - pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; + pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { printf("Running isolated measurement...\n"); diff --git a/app/benchmark_unbalanced/CMakeLists.txt b/app/benchmark_unbalanced/CMakeLists.txt index 9c86805..26f4649 100644 --- a/app/benchmark_unbalanced/CMakeLists.txt +++ b/app/benchmark_unbalanced/CMakeLists.txt @@ -1,5 +1,7 @@ add_executable(benchmark_unbalanced_pls_v3 main.cpp) target_link_libraries(benchmark_unbalanced_pls_v3 benchmark_runner benchmark_base pls) +ADD_DEPENDENCIES(benchmark.pls benchmark_unbalanced_pls_v3) + if (EASY_PROFILER) target_link_libraries(benchmark_unbalanced_pls_v3 easy_profiler) endif () diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp index d4936ac..c482090 100644 --- a/app/benchmark_unbalanced/main.cpp +++ b/app/benchmark_unbalanced/main.cpp @@ -31,7 +31,7 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi return count_child_nodes(root); } -constexpr int MAX_NUM_TASKS = 256; +constexpr int MAX_NUM_TASKS = 180; constexpr int MAX_STACK_SIZE = 4096 * 1; int main(int argc, char **argv) { diff --git a/extern/benchmark_base/CMakeLists.txt b/extern/benchmark_base/CMakeLists.txt index da76867..03de16c 100644 --- a/extern/benchmark_base/CMakeLists.txt +++ b/extern/benchmark_base/CMakeLists.txt @@ -9,7 +9,7 @@ add_library(benchmark_base STATIC include/benchmark_base/unbalanced.h src/unbalanced.cpp include/benchmark_base/range.h include/benchmark_base/fib.h - include/benchmark_base/matrix_div_conquer.h) + include/benchmark_base/matrix_div_conquer.h src/matrix_div_conquer.cpp) target_include_directories(benchmark_base PUBLIC diff --git a/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h index 9002de1..f5e5221 100644 --- a/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h +++ b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h @@ -1,51 +1,27 @@ #ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H #define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H -#include +#include +#include +#include namespace comparison_benchmarks { namespace base { namespace matrix_div_conquer { -const int MATRIX_SIZE = 128; const int CUTOFF_SIZE = 8; -const int NUM_ITERATIONS = 100; -const int WARMUP_ITERATIONS = 10; - -// Helpers to directly index into blocked matrices -const size_t MAX_SIZE = 128; -std::array, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN -void fill_block_lookup(size_t size = MAX_SIZE) { - if (size <= 1) { - BLOCK_LOOKUP[0][0] = 0; - return; - } - - fill_block_lookup(size / 2); - - size_t elements_per_quarter = (size / 2) * (size / 2); - for (size_t row = 0; row < size / 2; row++) { - for (size_t column = 0; column < size / 2; column++) { - BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter; - BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter; - BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter; - } - } -} - class blocked_matrix_view { public: - blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} - - void fill_default_data() { - for (size_t row = 0; row < size_; row++) { - for (size_t column = 0; column < size_; column++) { - at(row, column) = row; - } + blocked_matrix_view(double *data, size_t size) : data_{data}, + size_{size} { + if (size > BLOCK_LOOKUPS_SIZE) { + init_block_lookup(size); } } + void fill_default_data(); + blocked_matrix_view quadrant_1_1() { size_t elements_per_quarter = (size_ / 2) * (size_ / 2); return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); @@ -64,7 +40,7 @@ class blocked_matrix_view { } double &at(size_t row, size_t column) { - return data_[BLOCK_LOOKUP[row][column]]; + return data_[BLOCK_LOOKUPS[block_lookup_at(row, column)]]; } double *get_data() { @@ -72,22 +48,21 @@ class blocked_matrix_view { } private: - double *data_; - size_t size_; -}; + double *const data_; + const size_t size_; -void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { - for (size_t i = 0; i < size; i++) { - for (size_t j = 0; j < size; j++) { - result.at(i, j) = 0; - } - for (size_t j = 0; j < size; j++) { - for (size_t k = 0; k < size; k++) { - result.at(i, j) += a.at(i, k) * b.at(k, j); - } - } + // Lookup indices for non divide-conquer block lookups + static std::vector BLOCK_LOOKUPS; + static size_t BLOCK_LOOKUPS_SIZE; + + static void fill_block_lookup(size_t size, std::vector &BLOCK_LOOKUP); + static void init_block_lookup(size_t max_size); + static size_t block_lookup_at(size_t row, size_t column) { + return row * BLOCK_LOOKUPS_SIZE + column; } -} +}; + +void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b); } } diff --git a/extern/benchmark_base/src/matrix_div_conquer.cpp b/extern/benchmark_base/src/matrix_div_conquer.cpp new file mode 100644 index 0000000..8155588 --- /dev/null +++ b/extern/benchmark_base/src/matrix_div_conquer.cpp @@ -0,0 +1,64 @@ +#include +#include +#include "benchmark_base/matrix_div_conquer.h" + +namespace comparison_benchmarks { +namespace base { +namespace matrix_div_conquer { + +void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < size; j++) { + result.at(i, j) = 0; + } + for (size_t j = 0; j < size; j++) { + for (size_t k = 0; k < size; k++) { + result.at(i, j) += a.at(i, k) * b.at(k, j); + } + } + } +} + +void blocked_matrix_view::fill_default_data() { + for (size_t row = 0; row < size_; row++) { + for (size_t column = 0; column < size_; column++) { + at(row, column) = row; + } + } +} + +std::vector blocked_matrix_view::BLOCK_LOOKUPS; +size_t blocked_matrix_view::BLOCK_LOOKUPS_SIZE; + +void blocked_matrix_view::fill_block_lookup(size_t size, std::vector &BLOCK_LOOKUP) { + if (size <= 1) { + BLOCK_LOOKUP[block_lookup_at(0, 0)] = 0; + return; + } + + fill_block_lookup(size / 2, BLOCK_LOOKUP); + + size_t elements_per_quarter = (size / 2) * (size / 2); + for (size_t row = 0; row < size / 2; row++) { + for (size_t column = 0; column < size / 2; column++) { + BLOCK_LOOKUP[block_lookup_at(row, size / 2 + column)] = + BLOCK_LOOKUP[block_lookup_at(row, column)] + elements_per_quarter; + BLOCK_LOOKUP[block_lookup_at(size / 2 + row, column)] = + BLOCK_LOOKUP[block_lookup_at(row, column)] + 2 * elements_per_quarter; + BLOCK_LOOKUP[block_lookup_at(size / 2 + row, size / 2 + column)] = + BLOCK_LOOKUP[block_lookup_at(row, column)] + 3 * elements_per_quarter; + } + } +} + +void blocked_matrix_view::init_block_lookup(size_t max_size) { + if (BLOCK_LOOKUPS.size() < max_size) { + BLOCK_LOOKUPS = std::vector(max_size * max_size); + BLOCK_LOOKUPS_SIZE = max_size; + fill_block_lookup(max_size, BLOCK_LOOKUPS); + } +} + +} +} +} diff --git a/extern/benchmark_runner/benchmark_runner.h b/extern/benchmark_runner/benchmark_runner.h index 1a89990..36a8f55 100644 --- a/extern/benchmark_runner/benchmark_runner.h +++ b/extern/benchmark_runner/benchmark_runner.h @@ -299,7 +299,7 @@ class benchmark_runner { long wall_time_us = 0; wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l; wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l; - printf("Difference: %d\n", wall_time_us - times_[current_iteration]); + printf("Difference: %ld\n", wall_time_us - times_[current_iteration]); times_[current_iteration] = wall_time_us; if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) { diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 8065e84..e127f98 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -49,7 +49,7 @@ add_library(pls STATIC include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp - include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp include/pls/algorithms/divide_and_conquer_buffers.h) + include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp) # Dependencies for pls target_link_libraries(pls Threads::Threads) diff --git a/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h b/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h deleted file mode 100644 index 8786c30..0000000 --- a/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h +++ /dev/null @@ -1,5 +0,0 @@ - -#ifndef PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ -#define PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ - -#endif //PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ diff --git a/lib/pls/include/pls/algorithms/for_each.h b/lib/pls/include/pls/algorithms/for_each.h index 7430ea5..9a1e300 100644 --- a/lib/pls/include/pls/algorithms/for_each.h +++ b/lib/pls/include/pls/algorithms/for_each.h @@ -7,14 +7,14 @@ namespace pls::algorithm { template -static void for_each_range(unsigned long first, - unsigned long last, +static void for_each_range(long first, + long last, const Function &function, ExecutionStrategy &execution_strategy); template -static void for_each_range(unsigned long first, - unsigned long last, +static void for_each_range(long first, + long last, const Function &function); template diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h index 722cb69..9ba4479 100644 --- a/lib/pls/include/pls/algorithms/for_each_impl.h +++ b/lib/pls/include/pls/algorithms/for_each_impl.h @@ -10,10 +10,10 @@ namespace pls::algorithm { namespace internal { template -static void for_each(const RandomIt first, - const RandomIt last, - const Function &function, - const long min_elements) { +static void for_each_iterator(const RandomIt first, + const RandomIt last, + const Function &function, + const size_t min_elements) { using namespace ::pls::internal::scheduling; const long num_elements = std::distance(first, last); @@ -27,16 +27,48 @@ static void for_each(const RandomIt first, const long middle_index = num_elements / 2; scheduler::spawn([first, middle_index, last, &function, min_elements] { - internal::for_each(first, - first + middle_index, - function, - min_elements); + internal::for_each_iterator(first, + first + middle_index, + function, + min_elements); + }); + scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] { + internal::for_each_iterator(first + middle_index, + last, + function, + min_elements); + }); + } +} + +template +static void for_each_range(const long first, + const long last, + const Function &function, + const size_t min_elements) { + using namespace ::pls::internal::scheduling; + + const long num_elements = last - first; + if (num_elements <= min_elements) { + // calculate last elements in loop to avoid overhead + for (auto current = first; current != last; current++) { + function(current); + } + } else { + // Cut in half recursively + const long middle_index = num_elements / 2; + + scheduler::spawn([first, middle_index, last, &function, min_elements] { + internal::for_each_range(first, + first + middle_index, + function, + min_elements); }); scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] { - internal::for_each(first + middle_index, - last, - function, - min_elements); + internal::for_each_range(first + middle_index, + last, + function, + min_elements); }); } } @@ -44,15 +76,14 @@ static void for_each(const RandomIt first, } template -static void for_each(RandomIt - first, +static void for_each(RandomIt first, RandomIt last, const Function &function, ExecutionStrategy execution_strategy) { long num_elements = std::distance(first, last); return - internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements)); + internal::for_each_iterator(first, last, function, execution_strategy.calculate_min_elements(num_elements)); } template @@ -61,20 +92,19 @@ static void for_each(RandomIt first, RandomIt last, const Function &function) { } template -static void for_each_range(unsigned long first, - unsigned long last, +static void for_each_range(long first, + long last, const Function &function, ExecutionStrategy execution_strategy) { - auto range = pls::internal::helpers::range(first, last); - return for_each(range.begin(), range.end(), function, execution_strategy); + long num_elements = last - first; + return internal::for_each_range(first, last, function, execution_strategy.calculate_min_elements(num_elements)); } template -static void for_each_range(unsigned long first, - unsigned long last, +static void for_each_range(long first, + long last, const Function &function) { - auto range = pls::internal::helpers::range(first, last); - return for_each(range.begin(), range.end(), function); + return for_each_range(first, last, function, dynamic_strategy{4}); } } diff --git a/lib/pls/include/pls/internal/base/error_handling.h b/lib/pls/include/pls/internal/base/error_handling.h index 8704cc8..2a3e156 100644 --- a/lib/pls/include/pls/internal/base/error_handling.h +++ b/lib/pls/include/pls/internal/base/error_handling.h @@ -18,4 +18,9 @@ void pls_error(const char *msg); // TODO: Distinguish between debug/internal asserts and production asserts. #define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); } +// Enable/Disable more expensive asserts. +// On very small workloads also the 'normal' asserts can be disabled for more performance. +//#define PLS_ASSERT_EXPENSIVE(cond, msg) if (!(cond)) { pls_error(msg); } +#define PLS_ASSERT_EXPENSIVE(cond, msg) + #endif //PLS_ERROR_HANDLING_H diff --git a/lib/pls/include/pls/internal/base/stack_allocator.h b/lib/pls/include/pls/internal/base/stack_allocator.h index ec99bf5..6296131 100644 --- a/lib/pls/include/pls/internal/base/stack_allocator.h +++ b/lib/pls/include/pls/internal/base/stack_allocator.h @@ -5,6 +5,7 @@ #include namespace pls::internal::base { + class stack_allocator { public: virtual char *allocate_stack(size_t size) = 0; diff --git a/lib/pls/include/pls/internal/scheduling/base_task.h b/lib/pls/include/pls/internal/scheduling/base_task.h index 231c402..3e744f0 100644 --- a/lib/pls/include/pls/internal/scheduling/base_task.h +++ b/lib/pls/include/pls/internal/scheduling/base_task.h @@ -56,12 +56,13 @@ struct base_task { } // General task information - unsigned depth_; - unsigned thread_id_; + const unsigned depth_; + const unsigned thread_id_; // Stack/continuation management - char *stack_memory_; - size_t stack_size_; + char * const stack_memory_; + const size_t stack_size_; + context_switcher::continuation continuation_; bool is_synchronized_; bool is_serial_section_; diff --git a/lib/pls/include/pls/internal/scheduling/lock_free/task.h b/lib/pls/include/pls/internal/scheduling/lock_free/task.h index 723a067..967688d 100644 --- a/lib/pls/include/pls/internal/scheduling/lock_free/task.h +++ b/lib/pls/include/pls/internal/scheduling/lock_free/task.h @@ -30,11 +30,9 @@ struct task : public base_task { static task *find_task(unsigned id, unsigned depth); private: - std::atomic num_resources_{}; - // STAMP = thread id of 'owning' thread before task was inserted into stack. // VALUE = next item in stack, indicated by thread ID. - std::atomic resource_stack_next_{{0, 0}}; + PLS_CACHE_ALIGN std::atomic resource_stack_next_{{0, 0}}; // STAMP = CAS stamp, half CAS length (16 or 32 Bit) // VALUE = Root of the actual stack, indicated by thread ID (16 or 32 Bit) diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h index 3e1077b..8101c05 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h @@ -63,6 +63,20 @@ scheduler::scheduler(unsigned int num_threads, work_thread_main_loop(); }); } + + // Make sure all threads are created and touched their stacks. + // Executing a work section ensures one wakeup/sleep cycle of all workers + // and explicitly forcing one task per worker forces them to initialize their stacks. + std::atomic num_spawned; + this->perform_work([&]() { + for (unsigned i = 0; i < num_threads; i++) { + spawn([&]() { + num_spawned++; + while (num_spawned < num_threads) std::this_thread::yield(); + }); + } + sync(); + }); } class scheduler::init_function { @@ -195,7 +209,7 @@ void scheduler::spawn_internal(Function &&lambda) { #if PLS_SLEEP_WORKERS_ON_EMPTY // TODO: relax atomic operations on empty flag data_structures::stamped_integer queue_empty_flag = spawning_state.get_queue_empty_flag().load(); - switch (queue_empty_flag.value) { + switch (queue_empty_flag.value_) { case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: { // The queue was not found empty, ignore it. break; @@ -203,9 +217,9 @@ void scheduler::spawn_internal(Function &&lambda) { case EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY: { // Someone tries to mark us empty and might be re-stealing right now. data_structures::stamped_integer - queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; + queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; auto actual_empty_flag = spawning_state.get_queue_empty_flag().exchange(queue_non_empty_flag); - if (actual_empty_flag.value == EMPTY_QUEUE_STATE::QUEUE_EMPTY) { + if (actual_empty_flag.value_ == EMPTY_QUEUE_STATE::QUEUE_EMPTY) { spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake(); } break; @@ -213,7 +227,7 @@ void scheduler::spawn_internal(Function &&lambda) { case EMPTY_QUEUE_STATE::QUEUE_EMPTY: { // Someone already marked the queue empty, we must revert its action on the central queue. data_structures::stamped_integer - queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; + queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; spawning_state.get_queue_empty_flag().store(queue_non_empty_flag); spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake(); break; diff --git a/lib/pls/include/pls/pls.h b/lib/pls/include/pls/pls.h index 9755229..7f5895c 100644 --- a/lib/pls/include/pls/pls.h +++ b/lib/pls/include/pls/pls.h @@ -10,6 +10,7 @@ #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/strain_local_resource.h" +#include "pls/internal/base/stack_allocator.h" #include "pls/internal/helpers/range.h" #include "pls/internal/helpers/member_function.h" @@ -18,6 +19,8 @@ namespace pls { // 'basic' for-join APIs using internal::scheduling::scheduler; +using internal::base::heap_stack_allocator; +using internal::base::mmap_stack_allocator; template static void spawn(Function &&function) { scheduler::spawn(std::forward(function)); diff --git a/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp b/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp index f57c6b6..9bf2e4e 100644 --- a/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp +++ b/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp @@ -5,7 +5,7 @@ namespace pls::internal::scheduling::lock_free { traded_cas_field external_trading_deque::peek_traded_object(task *target_task) { - traded_cas_field current_cas = target_task->external_trading_deque_cas_.load(); + traded_cas_field current_cas = target_task->external_trading_deque_cas_.load(std::memory_order_relaxed); return current_cas; } @@ -17,7 +17,9 @@ task *external_trading_deque::get_trade_object(task *target_task, traded_cas_field empty_cas = peeked_cas; empty_cas.make_empty(); - if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas, empty_cas)) { + if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas, + empty_cas, + std::memory_order_acq_rel)) { task *result = task::find_task(result_id, target_task->depth_); return result; } @@ -50,8 +52,8 @@ void external_trading_deque::reset_bot_and_top() { bot_internal_.value_ = 0; bot_internal_.stamp_++; - bot_.store(0); - top_.store({bot_internal_.stamp_, 0}); + bot_.store(0, std::memory_order_release); + top_.store({bot_internal_.stamp_, 0}, std::memory_order_release); } task *external_trading_deque::pop_bot() { @@ -83,11 +85,11 @@ task *external_trading_deque::pop_bot() { } external_trading_deque::peek_result external_trading_deque::peek_top() { - auto local_top = top_.load(); - auto local_bot = bot_.load(); + auto local_top = top_.load(std::memory_order_acquire); + auto local_bot = bot_.load(std::memory_order_acquire); if (local_top.value_ < local_bot) { - return peek_result{entries_[local_top.value_].traded_task_, local_top}; + return peek_result{entries_[local_top.value_].traded_task_.load(std::memory_order_relaxed), local_top}; } else { return peek_result{nullptr, local_top}; } @@ -95,7 +97,7 @@ external_trading_deque::peek_result external_trading_deque::peek_top() { task *external_trading_deque::pop_top(task *offered_task, peek_result peek_result) { stamped_integer expected_top = peek_result.top_pointer_; - auto local_bot = bot_.load(); + auto local_bot = bot_.load(std::memory_order_acquire); if (expected_top.value_ >= local_bot) { return nullptr; } @@ -103,8 +105,8 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul auto &target_entry = entries_[expected_top.value_]; // Read our potential result - task *result = target_entry.traded_task_.load(); - unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load(); + task *result = target_entry.traded_task_.load(std::memory_order_relaxed); + unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load(std::memory_order_relaxed); if (result == nullptr) { return nullptr; @@ -112,7 +114,7 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul if (forwarding_stamp != expected_top.stamp_) { // ...we failed because the top tag lags behind...try to fix it. // This means only updating the tag, as this location can still hold data we need. - top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_}); + top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_}, std::memory_order_relaxed); return nullptr; } @@ -123,16 +125,20 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul traded_cas_field offered_field = expected_sync_cas_field; offered_field.fill_with_task(offered_task->thread_id_); - if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, offered_field)) { + if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, + offered_field, + std::memory_order_acq_rel)) { // We got it, for sure move the top pointer forward. - top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1}); + top_.compare_exchange_strong(expected_top, + {expected_top.stamp_ + 1, expected_top.value_ + 1}, + std::memory_order_acq_rel); return result; } else { - // TODO: Re-Check this condition for forwarding the stamp! Should only happen if another top-stealer took the - // slot that we where interested in! if (expected_sync_cas_field.is_filled_with_object() && expected_sync_cas_field.get_stamp() == expected_top.stamp_ && expected_sync_cas_field.get_trade_request_thread_id() == thread_id_) { - top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1}); + top_.compare_exchange_strong(expected_top, + {expected_top.stamp_ + 1, expected_top.value_ + 1}, + std::memory_order_relaxed); } return nullptr; } diff --git a/lib/pls/src/internal/scheduling/lock_free/task.cpp b/lib/pls/src/internal/scheduling/lock_free/task.cpp index 7100b46..d077693 100644 --- a/lib/pls/src/internal/scheduling/lock_free/task.cpp +++ b/lib/pls/src/internal/scheduling/lock_free/task.cpp @@ -15,14 +15,12 @@ void task::prepare_for_push(unsigned int pushing_thread_id) { } bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { - num_resources_++; - PLS_ASSERT(this->thread_id_ != spare_task_chain->thread_id_, "Makes no sense to push task onto itself, as it is not clean by definition."); PLS_ASSERT(this->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth."); - data_structures::stamped_integer current_root; + data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed); data_structures::stamped_integer target_root; data_structures::stamped_integer expected_next_field; @@ -30,10 +28,8 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { expected_next_field.stamp_ = pushing_thread_id + 1; expected_next_field.value_ = 0; - int iteration = 0; do { - iteration++; - current_root = this->resource_stack_root_.load(); + // current_root implicitly re-loaded by CAS in loop target_root.stamp_ = current_root.stamp_ + 1; target_root.value_ = spare_task_chain->thread_id_ + 1; @@ -50,53 +46,49 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { target_next_field.value_ = current_root_task->thread_id_ + 1; } - if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field, target_next_field)) { - num_resources_--; + if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field, + target_next_field, + std::memory_order_relaxed)) { return false; } else { expected_next_field = target_next_field; } - } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root)); + } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel)); return true; } void task::reset_task_chain(task *expected_content) { - num_resources_--; - - data_structures::stamped_integer current_root = this->resource_stack_root_.load(); + data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed); PLS_ASSERT(current_root.value_ == expected_content->thread_id_ + 1, "Must only reset the task chain if we exactly know its state! (current_root.value_)"); data_structures::stamped_integer target_root; target_root.stamp_ = current_root.stamp_ + 1; - bool success = this->resource_stack_root_.compare_exchange_strong(current_root, target_root); - PLS_ASSERT(success, "Must always succeed in resetting the chain, as we must be the sole one operating on it!"); + this->resource_stack_root_.store(target_root, std::memory_order_relaxed); } task *task::pop_task_chain() { - data_structures::stamped_integer current_root; + data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed); data_structures::stamped_integer target_root; task *output_task; do { - current_root = this->resource_stack_root_.load(); + // current_root implicitly re-loaded by CAS in loop if (current_root.value_ == 0) { // Empty... return nullptr; } else { // Found something, try to pop it auto *current_root_task = find_task(current_root.value_ - 1, this->depth_); - auto next_stack_cas = current_root_task->resource_stack_next_.load(); + auto next_stack_cas = current_root_task->resource_stack_next_.load(std::memory_order_relaxed); target_root.stamp_ = current_root.stamp_ + 1; target_root.value_ = next_stack_cas.value_; output_task = current_root_task; } - } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root)); - - PLS_ASSERT(num_resources_.fetch_add(-1) > 0, "Must only return an task from the chain if there are items!"); + } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel)); output_task->resource_stack_next_.store({0, 0}); return output_task; diff --git a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp index e1e2da9..251f705 100644 --- a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp +++ b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp @@ -44,7 +44,7 @@ base_task *task_manager::pop_local_task() { std::tuple task_manager::steal_task(thread_state &stealing_state) { PLS_ASSERT(stealing_state.get_active_task()->depth_ == 0, "Must only steal with clean task chain."); - PLS_ASSERT(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain."); + PLS_ASSERT_EXPENSIVE(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain."); auto peek = deque_.peek_top(); if (peek.top_task_) { @@ -83,7 +83,6 @@ std::tuple task_manager::steal_task(thread_state return std::tuple{stolen_task, chain_after_stolen_task, true}; } else { - // TODO: traded task resource_stack_next_ field is de-marked from being mine return std::tuple{nullptr, nullptr, false}; } } else { @@ -94,15 +93,17 @@ std::tuple task_manager::steal_task(thread_state base_task *task_manager::pop_clean_task_chain(base_task *base_task) { task *target_task = static_cast(base_task); + traded_cas_field peeked_task_cas_before, peeked_task_cas_after; + peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task); while (true) { // Try to get a clean resource chain to go back to the main stealing loop - auto peeked_task_cas_before = external_trading_deque::peek_traded_object(target_task); + peeked_task_cas_before = peeked_task_cas_after; task *pop_result = target_task->pop_task_chain(); if (pop_result) { PLS_ASSERT(scheduler::check_task_chain_backward(*pop_result), "Must only pop proper task chains."); return pop_result; // Got something, so we are simply done here } - auto peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task); + peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task); if (peeked_task_cas_before != peeked_task_cas_after) { continue; diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp index f2f5986..6596c6f 100644 --- a/lib/pls/src/internal/scheduling/scheduler.cpp +++ b/lib/pls/src/internal/scheduling/scheduler.cpp @@ -5,6 +5,7 @@ #include "pls/internal/scheduling/strain_local_resource.h" #include "pls/internal/build_flavour.h" #include "pls/internal/base/error_handling.h" +#include "pls/internal/base/futex_wrapper.h" #include @@ -57,7 +58,7 @@ void scheduler::work_thread_work_section() { #if PLS_PROFILING_ENABLED my_state.get_scheduler().profiler_.stealing_start(my_state.get_thread_id()); #endif - PLS_ASSERT(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain."); + PLS_ASSERT_EXPENSIVE(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain."); size_t target; do { @@ -91,7 +92,7 @@ void scheduler::work_thread_work_section() { auto *stolen_resources = stolen_task->attached_resources_.load(std::memory_order_relaxed); strain_local_resource::acquire_locally(stolen_resources, my_state.get_thread_id()); - PLS_ASSERT(check_task_chain_forward(*my_state.get_active_task()), + PLS_ASSERT_EXPENSIVE(check_task_chain_forward(*my_state.get_active_task()), "We are sole owner of this chain, it has to be valid!"); // Execute the stolen task by jumping to it's continuation. @@ -117,12 +118,12 @@ void scheduler::work_thread_work_section() { my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), false); #endif #if PLS_SLEEP_WORKERS_ON_EMPTY - switch (target_queue_empty_flag.value) { + switch (target_queue_empty_flag.value_) { case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: { // We found the queue empty, but the flag says it should still be full. // We want to declare it empty, bet we need to re-check the queue in a sub-step to avoid races. data_structures::stamped_integer - maybe_empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY}; + maybe_empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY}; if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, maybe_empty_flag)) { goto queue_empty_flag_retry_steal; @@ -133,7 +134,7 @@ void scheduler::work_thread_work_section() { // We found the queue empty and it was already marked as maybe empty. // We can safely mark it empty and increment the central counter. data_structures::stamped_integer - empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY}; + empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY}; if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, empty_flag)) { // We marked it empty, now its our duty to modify the central counter my_state.get_scheduler().empty_queue_increase_counter(); @@ -216,8 +217,8 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat "Resources must only reside in the correct depth!"); PLS_ASSERT(last_task != clean_chain, "We want to swap out the last task and its chain to use a clean one, thus they must differ."); - PLS_ASSERT(check_task_chain_backward(*clean_chain), - "Can only acquire clean chains for clean returns!"); + PLS_ASSERT_EXPENSIVE(check_task_chain_backward(*clean_chain), + "Can only acquire clean chains for clean returns!"); // Acquire it/merge it with our task chain. this_task->prev_ = clean_chain; -- libgit2 0.26.0