diff --git a/PERFORMANCE.md b/PERFORMANCE-v1.md similarity index 100% rename from PERFORMANCE.md rename to PERFORMANCE-v1.md diff --git a/PERFORMANCE-v2.md b/PERFORMANCE-v2.md new file mode 100644 index 0000000..b8a27df --- /dev/null +++ b/PERFORMANCE-v2.md @@ -0,0 +1,22 @@ +# Notes on performance measures during development + +#### Commit e34ea267 - 05.12.2019 - First Version of new Algorithm - Scaling Problems + +The first version of our memory trading work stealing algorithm works. It still shows scaling issues over +the hyperthreading mark, very similar to what we have seen in version 1. This indicates some sort of +contention between the threads when running the FFT algorithm. + +Analyzing the current version we find an issue with the frequent call to `thread_state_for(id)` in +the stealing loop. + +![](./media/e34ea267_thread_state_for.png) + +It is obvious that the method takes some amount of runtime, as FFT has a structure that tends to only +work on the continuations at the end of the computation (the critical path of FFT can only be executed +after most parallel tasks are done). + +![](./media/e34ea267_fft_execution_pattern.png) + +What we can see here is the long tail of continuations running at the end of the computation. During +this time the non working threads constantly steal, thus repeatedly calling the `thread_state_for(id)` +virtual method, potentially hindering other threads from doing their work properly.
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp index e46370d..68a587c 100644 --- a/app/benchmark_fft/main.cpp +++ b/app/benchmark_fft/main.cpp @@ -1,6 +1,7 @@ #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/parallel_result.h" #include "pls/internal/scheduling/scheduler_memory.h" +#include "pls/internal/helpers/profiler.h" using namespace pls::internal::scheduling; #include @@ -9,7 +10,7 @@ using namespace pls::internal::scheduling; #include static constexpr int CUTOFF = 16; -static constexpr int INPUT_SIZE = 8192; +static constexpr int INPUT_SIZE = 16384; typedef std::vector> complex_vector; void divide(complex_vector::iterator data, int n) { @@ -90,8 +91,8 @@ complex_vector prepare_input(int input_size) { return data; } -static constexpr int NUM_ITERATIONS = 1000; -constexpr size_t NUM_THREADS = 5; +static constexpr int NUM_ITERATIONS = 500; +constexpr size_t NUM_THREADS = 2; constexpr size_t NUM_TASKS = 128; @@ -99,6 +100,7 @@ constexpr size_t NUM_CONTS = 128; constexpr size_t MAX_CONT_SIZE = 512; int main() { + PROFILE_ENABLE; complex_vector initial_input = prepare_input(INPUT_SIZE); static_scheduler_memory{0}; }); }); + PROFILE_LOCK("DONE"); } auto end = std::chrono::steady_clock::now(); std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() << std::endl; + PROFILE_SAVE("test_profile.prof"); start = std::chrono::steady_clock::now(); for (int i = 0; i < NUM_ITERATIONS; i++) { diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp index ed6d874..9a14a57 100644 --- a/app/benchmark_matrix/main.cpp +++ b/app/benchmark_matrix/main.cpp @@ -1,6 +1,8 @@ -#include -#include -#include +#include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/parallel_result.h" +#include "pls/internal/scheduling/scheduler_memory.h" +#include "pls/algorithms/for_each.h" +using namespace pls::internal::scheduling; #include @@ -15,8 +17,8 @@ class matrix { 
std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i); } - void multiply(const matrix &a, const matrix &b) { - pls::for_each_range(0, SIZE, [&](int i) { + parallel_result multiply(const matrix &a, const matrix &b) { + return pls::algorithm::for_each_range(0, SIZE, [&](int i) { this->multiply_column(i, a, b); }); } @@ -44,6 +46,14 @@ void fill_with_data(matrix &a, matrix } } +static constexpr int NUM_ITERATIONS = 1000; +constexpr size_t NUM_THREADS = 3; + +constexpr size_t NUM_TASKS = 128; + +constexpr size_t NUM_CONTS = 128; +constexpr size_t MAX_CONT_SIZE = 512; + int main() { PROFILE_ENABLE matrix a; @@ -51,11 +61,29 @@ int main() { matrix result; fill_with_data(a, b); - pls::internal::helpers::run_mini_benchmark([&] { - result.multiply(a, b); - }, 8, 1000); + static_scheduler_memory static_scheduler_memory; - PROFILE_SAVE("test_profile.prof") + scheduler scheduler{static_scheduler_memory, NUM_THREADS}; + + auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < NUM_ITERATIONS; i++) { + scheduler.perform_work([&]() { + PROFILE_MAIN_THREAD; + return scheduler::par([&]() { + return result.multiply(a, b); + }, []() { + return parallel_result{0}; + }).then([](int, int) { + return parallel_result{0}; + }); + }); + } + auto end = std::chrono::steady_clock::now(); + std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() + << std::endl; } //int main() { diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp index 75f5daa..2753b8c 100644 --- a/app/benchmark_unbalanced/main.cpp +++ b/app/benchmark_unbalanced/main.cpp @@ -1,6 +1,7 @@ -#include -#include -#include +#include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/parallel_result.h" +#include "pls/internal/scheduling/scheduler_memory.h" +using namespace pls::internal::scheduling; #include "node.h" @@ -11,7 +12,7 @@ const int NORMAL_CHILDREN = 8; const int NUM_NODES = 71069; -int count_child_nodes(uts::node &node) { +parallel_result 
count_child_nodes(uts::node &node) { int child_count = 1; std::vector children = node.spawn_child_nodes(); @@ -36,7 +37,7 @@ int count_child_nodes(uts::node &node) { return child_count; } -int unbalanced_tree_search(int seed, int root_children, double q, int normal_children) { +parallel_result unbalanced_tree_search(int seed, int root_children, double q, int normal_children) { int result; auto lambda = [&] { @@ -50,11 +51,33 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi return result; } +constexpr size_t NUM_THREADS = 5; + +constexpr size_t NUM_TASKS = 128; + +constexpr size_t NUM_CONTS = 128; +constexpr size_t MAX_CONT_SIZE = 512; + +volatile int result; int main() { PROFILE_ENABLE - pls::internal::helpers::run_mini_benchmark([&] { - unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN); - }, 8, 2000); + static_scheduler_memory static_scheduler_memory; + + scheduler scheduler{static_scheduler_memory, NUM_THREADS}; + + scheduler.perform_work([&]() { + return scheduler::par([&]() { + return unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN); + }, []() { + return parallel_result{0}; + }).then([](int a, int) { + result = a; + return parallel_result{0}; + }); + }); PROFILE_SAVE("test_profile.prof") } diff --git a/app/playground/main.cpp b/app/playground/main.cpp index 4db3a7b..7c353ae 100644 --- a/app/playground/main.cpp +++ b/app/playground/main.cpp @@ -8,7 +8,7 @@ using namespace pls::internal; -constexpr size_t NUM_THREADS = 1; +constexpr size_t NUM_THREADS = 4; constexpr size_t NUM_TASKS = 128; static constexpr int NUM_ITERATIONS = 100; @@ -29,11 +29,8 @@ int fib_normal(int n) { } scheduling::parallel_result fib(int n) { - if (n == 0) { - return 0; - } - if (n == 1) { - return 1; + if (n <= 10) { + return fib_normal(n); } return scheduling::scheduler::par([=]() { @@ -47,6 +44,7 @@ scheduling::parallel_result fib(int n) { static volatile int result; int main() { + PROFILE_ENABLE; 
scheduling::static_scheduler_memory(end - start).count() @@ -66,16 +64,20 @@ int main() { for (int i = 0; i < NUM_ITERATIONS; i++) { scheduler.perform_work([]() { + PROFILE_MAIN_THREAD; return scheduling::scheduler::par([]() { return scheduling::parallel_result(0); }, []() { - return fib(30); + return fib(35); }).then([](int, int b) { result = b; + PROFILE_LOCK("DONE"); return scheduling::parallel_result{0}; }); }); + PROFILE_LOCK("DONE"); } + PROFILE_SAVE("test_profile.prof"); end = std::chrono::steady_clock::now(); std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() << std::endl; diff --git a/lib/pls/include/pls/algorithms/for_each.h b/lib/pls/include/pls/algorithms/for_each.h index 0cc11b1..54af7fa 100644 --- a/lib/pls/include/pls/algorithms/for_each.h +++ b/lib/pls/include/pls/algorithms/for_each.h @@ -2,6 +2,8 @@ #ifndef PLS_PARALLEL_FOR_H #define PLS_PARALLEL_FOR_H +#include "pls/internal/scheduling/parallel_result.h" + namespace pls { namespace algorithm { @@ -9,19 +11,26 @@ class fixed_strategy; class dynamic_strategy; template -void for_each_range(unsigned long first, - unsigned long last, - const Function &function, - ExecutionStrategy &execution_strategy); +pls::internal::scheduling::parallel_result for_each_range(unsigned long first, + unsigned long last, + const Function &function, + ExecutionStrategy &execution_strategy); template -void for_each_range(unsigned long first, unsigned long last, const Function &function); +pls::internal::scheduling::parallel_result for_each_range(unsigned long first, + unsigned long last, + const Function &function); template -void for_each(RandomIt first, RandomIt last, const Function &function, ExecutionStrategy execution_strategy); +pls::internal::scheduling::parallel_result for_each(RandomIt first, + RandomIt last, + const Function &function, + ExecutionStrategy execution_strategy); template -void for_each(RandomIt first, RandomIt last, const Function &function); 
+pls::internal::scheduling::parallel_result for_each(RandomIt first, + RandomIt last, + const Function &function); } } diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h index 058ae9f..2473f24 100644 --- a/lib/pls/include/pls/algorithms/for_each_impl.h +++ b/lib/pls/include/pls/algorithms/for_each_impl.h @@ -2,11 +2,8 @@ #ifndef PLS_PARALLEL_FOR_IMPL_H #define PLS_PARALLEL_FOR_IMPL_H -#include "pls/internal/scheduling/task.h" #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/thread_state.h" - -#include "pls/internal/helpers/unique_id.h" #include "pls/internal/helpers/range.h" namespace pls { @@ -14,7 +11,10 @@ namespace algorithm { namespace internal { template -void for_each(const RandomIt first, const RandomIt last, const Function function, const long min_elements) { +pls::internal::scheduling::parallel_result for_each(const RandomIt first, + const RandomIt last, + const Function function, + const long min_elements) { using namespace ::pls::internal::scheduling; const long num_elements = std::distance(first, last); @@ -23,29 +23,25 @@ void for_each(const RandomIt first, const RandomIt last, const Function function for (auto current = first; current != last; current++) { function(*current); } + + return parallel_result{0}; } else { // Cut in half recursively const long middle_index = num_elements / 2; - auto second_half_body = - [first, middle_index, last, &function, min_elements] { - internal::for_each(first + middle_index, - last, - function, - min_elements); - }; - using second_half_t = lambda_task_by_reference; - scheduler::spawn_child(std::move(second_half_body)); - - auto first_half_body = - [first, middle_index, last, &function, min_elements] { - internal::for_each(first, - first + middle_index, - function, - min_elements); - }; - using first_half_t = lambda_task_by_reference; - scheduler::spawn_child_and_wait(std::move(first_half_body)); + return scheduler::par([first, 
middle_index, last, &function, min_elements] { + return internal::for_each(first, + first + middle_index, + function, + min_elements); + }, [first, middle_index, last, &function, min_elements] { + return internal::for_each(first + middle_index, + last, + function, + min_elements); + }).then([](int, int) { + return parallel_result{0}; + }); } } @@ -56,7 +52,7 @@ class dynamic_strategy { explicit dynamic_strategy(const unsigned int tasks_per_thread = 4) : tasks_per_thread_{tasks_per_thread} {}; long calculate_min_elements(long num_elements) const { - const long num_threads = pls::internal::scheduling::thread_state::get()->scheduler_->num_threads(); + const long num_threads = pls::internal::scheduling::thread_state::get().scheduler_->num_threads(); return num_elements / (num_threads * tasks_per_thread_); } private: @@ -75,29 +71,34 @@ class fixed_strategy { }; template -void for_each(RandomIt first, RandomIt last, const Function &function, ExecutionStrategy execution_strategy) { +pls::internal::scheduling::parallel_result for_each(RandomIt first, + RandomIt last, + const Function &function, + ExecutionStrategy execution_strategy) { long num_elements = std::distance(first, last); - internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements)); + return internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements)); } template -void for_each(RandomIt first, RandomIt last, const Function &function) { - for_each(first, last, function, dynamic_strategy{4}); +pls::internal::scheduling::parallel_result for_each(RandomIt first, RandomIt last, const Function &function) { + return for_each(first, last, function, dynamic_strategy{4}); } template -void for_each_range(unsigned long first, - unsigned long last, - const Function &function, - ExecutionStrategy execution_strategy) { +pls::internal::scheduling::parallel_result for_each_range(unsigned long first, + unsigned long last, + const Function &function, + 
ExecutionStrategy execution_strategy) { auto range = pls::internal::helpers::range(first, last); - for_each(range.begin(), range.end(), function, execution_strategy); + return for_each(range.begin(), range.end(), function, execution_strategy); } template -void for_each_range(unsigned long first, unsigned long last, const Function &function) { +pls::internal::scheduling::parallel_result for_each_range(unsigned long first, + unsigned long last, + const Function &function) { auto range = pls::internal::helpers::range(first, last); - for_each(range.begin(), range.end(), function); + return for_each(range.begin(), range.end(), function); } } diff --git a/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h index 6f77d39..e54d9d7 100644 --- a/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h +++ b/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h @@ -247,9 +247,9 @@ class bounded_trading_deque { deque_entry *entries_; size_t num_entries_; - std::atomic top_{{0, 0}}; + alignas(base::system_details::CACHE_LINE_SIZE) std::atomic top_{{0, 0}}; + alignas(base::system_details::CACHE_LINE_SIZE) std::atomic bot_{0}; - std::atomic bot_{0}; stamped_integer bot_internal_{0, 0}; }; diff --git a/lib/pls/include/pls/internal/helpers/profiler.h b/lib/pls/include/pls/internal/helpers/profiler.h index 2902344..e0fe3ad 100644 --- a/lib/pls/include/pls/internal/helpers/profiler.h +++ b/lib/pls/include/pls/internal/helpers/profiler.h @@ -6,9 +6,10 @@ #include #include -#define PROFILE_WORK_BLOCK(msg) EASY_BLOCK(msg, profiler::colors::LightGreen) -#define PROFILE_FORK_JOIN_STEALING(msg) EASY_BLOCK(msg, profiler::colors::LightBlue) -#define PROFILE_STEALING(msg) EASY_BLOCK(msg, profiler::colors::Blue) +#define PROFILE_TASK(msg) EASY_BLOCK(msg, profiler::colors::LightBlue) +#define PROFILE_CONTINUATION(msg) EASY_BLOCK(msg, profiler::colors::LightBlue) +#define 
PROFILE_FAST_PATH(msg) EASY_BLOCK(msg, profiler::colors::Green) +#define PROFILE_STEALING(msg) EASY_BLOCK(msg, profiler::colors::Orange) #define PROFILE_LOCK(msg) EASY_BLOCK(msg, profiler::colors::Red) #define PROFILE_END_BLOCK EASY_END_BLOCK @@ -21,8 +22,9 @@ #else //ENABLE_EASY_PROFILER -#define PROFILE_WORK_BLOCK(msg) -#define PROFILE_FORK_JOIN_STEALING(msg) +#define PROFILE_TASK(msg) +#define PROFILE_CONTINUATION(msg) +#define PROFILE_FAST_PATH(msg) #define PROFILE_STEALING(msg) #define PROFILE_LOCK(msg) diff --git a/lib/pls/include/pls/internal/scheduling/cont.h b/lib/pls/include/pls/internal/scheduling/cont.h index dd05411..1b981e8 100644 --- a/lib/pls/include/pls/internal/scheduling/cont.h +++ b/lib/pls/include/pls/internal/scheduling/cont.h @@ -11,6 +11,8 @@ #include "pls/internal/base/alignment.h" #include "pls/internal/base/error_handling.h" +#include "pls/internal/helpers/profiler.h" + #include "parallel_result.h" #include "memory_block.h" @@ -119,6 +121,7 @@ class cont : public base_cont { task_{std::forward(task_2_args)..., this} {}; void execute() override { + PROFILE_CONTINUATION("execute_cont"); using result_type = decltype(function_((*left_result_).value(), (*right_result_).value())); result_runner::execute(*this); diff --git a/lib/pls/include/pls/internal/scheduling/memory_block.h b/lib/pls/include/pls/internal/scheduling/memory_block.h index 42ad2c0..f4d42c1 100644 --- a/lib/pls/include/pls/internal/scheduling/memory_block.h +++ b/lib/pls/include/pls/internal/scheduling/memory_block.h @@ -27,8 +27,7 @@ class memory_block { memory_buffer_{memory_buffer}, memory_buffer_size_{memory_buffer_size}, memory_buffer_used_{false}, - depth_{depth}, - owner_{0} {}; + depth_{depth} {}; template T *place_in_buffer(ARGS &&...args) { @@ -82,13 +81,6 @@ class memory_block { results_missing_.store(2); } - void set_owner(int owner) { - owner_ = owner; - } - int get_owner() { - return owner_; - } - private: // Linked list property of memory blocks (a complete list 
represents a threads currently owned memory). // Each block knows its chain start to allow stealing a whole chain in O(1) @@ -120,9 +112,6 @@ class memory_block { // Swapping parts of a memory chain will not reorder it, as always parts of // the same size are exchanged. const int depth_; - - // TODO: Remove, debug only - int owner_; }; } diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h index f28b844..8b2e8d7 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h @@ -7,6 +7,8 @@ #include "pls/internal/scheduling/parallel_result.h" #include "pls/internal/scheduling/task.h" +#include "pls/internal/helpers/profiler.h" + namespace pls { namespace internal { namespace scheduling { @@ -32,6 +34,7 @@ struct scheduler::starter { auto then(FCONT &&cont_function) -> decltype(cont_function(std::declval(), std::declval())) { + PROFILE_FAST_PATH("then"); using continuation_type = cont, return_type_1, return_type_2, FCONT>; using result_type = decltype(cont_function(std::declval(), std::declval())); @@ -49,7 +52,6 @@ struct scheduler::starter { const bool is_right_cont = my_state.right_spawn_; base_cont *parent_cont = my_state.parent_cont_; - current_memory_block->set_owner(my_state.get_id()); continuation_type *current_cont = current_memory_block->place_in_buffer(parent_cont, current_memory_block, is_right_cont, @@ -63,7 +65,7 @@ struct scheduler::starter { // Call first function on fast path my_state.right_spawn_ = false; return_type_1 result_1 = function_1_(); - if (cont_manager.falling_through()) { + if (!result_1.fast_path()) { // Get our replacement from the task stack and store it for later use when we are actually blocked. 
auto traded_memory = my_state.get_task_manager().try_pop_local(); current_cont->get_memory_block()->get_offered_chain().store(*traded_memory); @@ -87,7 +89,7 @@ struct scheduler::starter { } else { my_state.right_spawn_ = true; return_type_2 result_2 = function_2_(); - if (cont_manager.falling_through()) { + if (!result_2.fast_path()) { // Main scheduling loop is responsible for entering the result to the slow path... current_cont->store_left_result(std::move(result_1)); current_cont->get_memory_block()->get_results_missing().fetch_add(-1); @@ -109,7 +111,7 @@ struct scheduler::starter { my_state.right_spawn_ = is_right_cont; auto cont_result = cont_function(result_1.value(), result_2.value()); - if (cont_manager.falling_through()) { + if (!cont_result.fast_path()) { // Unwind stack... return result_type{}; } @@ -154,8 +156,6 @@ class scheduler::init_function_impl : public init_function { template void scheduler::perform_work(Function work_section) { - PROFILE_WORK_BLOCK("scheduler::perform_work") - // Prepare main root task init_function_impl starter_function{work_section}; main_thread_starter_function_ = &starter_function; diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h index 935c499..25c8063 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h @@ -18,19 +18,27 @@ class scheduler_memory { // By not having an initialization routine we can do our 'static and heap specialization' // without running into any ordering problems in the initialization sequence. - // We first worried about performance of this being virtual. - // However, we decided that only thread_state_for is used during the - // runtime and that only when stealing. As stealing is expensive anyways, - // this should not add too much overhead. 
+ protected: + thread_state **thread_states_array_{nullptr}; + public: virtual size_t max_threads() const = 0; virtual base::thread &thread_for(size_t id) = 0; - virtual thread_state &thread_state_for(size_t id) = 0; + thread_state &thread_state_for(size_t id) { + return *thread_states_array_[id]; + } }; template class static_scheduler_memory : public scheduler_memory { public: + static_scheduler_memory() : scheduler_memory{} { + for (size_t i = 0; i < MAX_THREADS; i++) { + thread_state_pointers_[i] = &thread_states_[i].get_thread_state(); + } + thread_states_array_ = thread_state_pointers_.data(); + } + size_t max_threads() const override { return MAX_THREADS; } @@ -38,16 +46,12 @@ class static_scheduler_memory : public scheduler_memory { base::thread &thread_for(size_t id) override { return threads_[id]; } - - thread_state &thread_state_for(size_t id) override { - return thread_states_[id].get_thread_state(); - } - private: using thread_state_type = thread_state_static; alignas(base::system_details::CACHE_LINE_SIZE) std::array threads_; alignas(base::system_details::CACHE_LINE_SIZE) std::array thread_states_; + alignas(base::system_details::CACHE_LINE_SIZE) std::array thread_state_pointers_; }; template @@ -55,14 +59,17 @@ class heap_scheduler_memory : public scheduler_memory { public: explicit heap_scheduler_memory(size_t max_threads) : max_threads_{max_threads}, thread_vector_{}, - thread_state_vector_{} { + thread_state_vector_{}, + thread_state_pointers_{} { thread_vector_.reserve(max_threads); thread_state_vector_.reserve(max_threads); for (size_t i = 0; i < max_threads; i++) { thread_vector_.emplace_back(); thread_state_vector_.emplace_back(); + thread_state_pointers_.emplace_back(&thread_state_vector_[i].get_thread_state()); } + thread_states_array_ = thread_state_pointers_.data(); } size_t max_threads() const override { @@ -72,11 +79,6 @@ class heap_scheduler_memory : public scheduler_memory { base::thread &thread_for(size_t id) override { return 
thread_vector_[id]; } - - thread_state &thread_state_for(size_t id) override { - return thread_state_vector_[id].object().get_thread_state(); - } - private: using thread_state_type = thread_state_static; // thread_state_type is aligned at the cache line and therefore overaligned (C++ 11 does not require @@ -88,6 +90,7 @@ class heap_scheduler_memory : public scheduler_memory { size_t max_threads_; std::vector thread_vector_; std::vector thread_state_vector_; + std::vector thread_state_pointers_; }; } diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h index 8b10e90..372a18e 100644 --- a/lib/pls/include/pls/internal/scheduling/task.h +++ b/lib/pls/include/pls/internal/scheduling/task.h @@ -4,6 +4,8 @@ #include "pls/internal/scheduling/cont.h" #include "pls/internal/scheduling/memory_block.h" +#include "pls/internal/helpers/profiler.h" + namespace pls { namespace internal { namespace scheduling { @@ -47,6 +49,7 @@ class task : public base_task { : base_task{cont}, function_{std::forward(function)} {} void execute_internal() override { + PROFILE_TASK("execute_internal") auto result = function_(); if (result.fast_path()) { cont_->store_right_result(std::move(result)); diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h index 961cf1b..5eeb73d 100644 --- a/lib/pls/include/pls/internal/scheduling/task_manager.h +++ b/lib/pls/include/pls/internal/scheduling/task_manager.h @@ -30,20 +30,17 @@ class task_manager { public: // Publishes a task on the stack, i.e. makes it visible for other threads to steal. void publish_task(base_task *task) { -// std::lock_guard lock{lock_}; task_deque_.push_bot(task->get_cont()->get_memory_block()); } // Try to pop a local task from this task managers stack. 
data_structures::optional try_pop_local() { -// std::lock_guard lock{lock_}; return task_deque_.pop_bot().traded_; } // Try to steal a task from a remote task_manager instance. The stolen task must be stored locally. // Returns a pair containing the actual task and if the steal was successful. base_task *steal_remote_task(cont_manager &stealing_cont_manager) { -// std::lock_guard lock{lock_}; auto peek = task_deque_.peek_top(); if (std::get<0>(peek)) { @@ -73,7 +70,6 @@ class task_manager { private: data_structures::bounded_trading_deque &task_deque_; - base::spin_lock lock_{}; }; template diff --git a/lib/pls/include/pls/internal/scheduling/thread_state_static.h b/lib/pls/include/pls/internal/scheduling/thread_state_static.h index 39b1815..4ac295a 100644 --- a/lib/pls/include/pls/internal/scheduling/thread_state_static.h +++ b/lib/pls/include/pls/internal/scheduling/thread_state_static.h @@ -5,6 +5,8 @@ #include "pls/internal/scheduling/task_manager.h" #include "pls/internal/scheduling/cont_manager.h" +#include "pls/internal/base/system_details.h" + #include "thread_state.h" namespace pls { @@ -12,7 +14,7 @@ namespace internal { namespace scheduling { template -struct thread_state_static { +struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state_static { public: thread_state_static() : static_task_manager_{}, @@ -21,9 +23,9 @@ struct thread_state_static { thread_state &get_thread_state() { return thread_state_; } private: - static_task_manager static_task_manager_; - static_cont_manager static_cont_manager_; - thread_state thread_state_; + alignas(base::system_details::CACHE_LINE_SIZE) static_task_manager static_task_manager_; + alignas(base::system_details::CACHE_LINE_SIZE) static_cont_manager static_cont_manager_; + alignas(base::system_details::CACHE_LINE_SIZE) thread_state thread_state_; }; } diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp index 3a3c9c0..960a216 100644 --- 
a/lib/pls/src/internal/scheduling/scheduler.cpp +++ b/lib/pls/src/internal/scheduling/scheduler.cpp @@ -1,8 +1,9 @@ #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/thread_state.h" -#include "pls/internal/scheduling/task.h" +#include "pls/internal/base/thread.h" #include "pls/internal/base/error_handling.h" +#include "pls/internal/helpers/profiler.h" namespace pls { namespace internal { @@ -74,17 +75,15 @@ void scheduler::work_thread_work_section() { // Steal Routine (will be continuously executed when there are no more fall through's). // TODO: move into separate function const size_t offset = my_state.random_() % num_threads; - const size_t max_tries = num_threads - 1; + const size_t max_tries = num_threads; for (size_t i = 0; i < max_tries; i++) { size_t target = (offset + i) % num_threads; - - // Skip our self for stealing - target = ((target == my_id) + target) % num_threads; - auto &target_state = my_state.get_scheduler().thread_state_for(target); PLS_ASSERT(my_cont_manager.is_clean(), "Only steal with clean chain!"); + PROFILE_STEALING("steal") auto *stolen_task = target_state.get_task_manager().steal_remote_task(my_cont_manager); + PROFILE_END_BLOCK; if (stolen_task != nullptr) { my_state.parent_cont_ = stolen_task->get_cont(); my_state.right_spawn_ = true; @@ -97,6 +96,9 @@ void scheduler::work_thread_work_section() { } } } +// if (!my_cont_manager.falling_through()) { +// base::this_thread::sleep(5); +// } } while (!work_section_done_); PLS_ASSERT(my_cont_manager.is_clean(), "Only finish work section with clean chain!"); diff --git a/media/e34ea267_fft_execution_pattern.png b/media/e34ea267_fft_execution_pattern.png new file mode 100644 index 0000000..108ab8d Binary files /dev/null and b/media/e34ea267_fft_execution_pattern.png differ diff --git a/media/e34ea267_thread_state_for.png b/media/e34ea267_thread_state_for.png new file mode 100644 index 0000000..8431bfc Binary files /dev/null and 
b/media/e34ea267_thread_state_for.png differ