From e6add9647dfc12909ba8faf0c8a471e7b5f5c3f9 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Thu, 6 Jun 2019 17:38:24 +0200 Subject: [PATCH] Document FFT performance problem and fine tune it's scheduling --- PERFORMANCE.md | 29 ++++++++++++++++++++++++++++- lib/pls/include/pls/algorithms/invoke_parallel_impl.h | 13 ++++--------- lib/pls/include/pls/algorithms/parallel_for_impl.h | 8 +++++--- lib/pls/include/pls/internal/scheduling/scheduler.h | 9 +++++++++ lib/pls/include/pls/internal/scheduling/scheduler_impl.h | 5 +++++ lib/pls/include/pls/internal/scheduling/task.h | 15 +++++++++++++++ lib/pls/src/internal/data_structures/locking_deque.cpp | 60 ------------------------------------------------------------ media/5044f0a1_fft_average.png | Bin 0 -> 192328 bytes 8 files changed, 66 insertions(+), 73 deletions(-) delete mode 100644 lib/pls/src/internal/data_structures/locking_deque.cpp create mode 100644 media/5044f0a1_fft_average.png diff --git a/PERFORMANCE.md b/PERFORMANCE.md index dbb786c..7112439 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -354,4 +354,31 @@ fix the source rather then 'circumventing' it with these extra tasks. performance, as contemption on the bus/cache is always bad) - +After some research we think that the issue is down to many threads +referencing the same atomic reference counter. We think so because +even cache aligning the shared refernce count does not fix the issue +when using the direct function call. Also, forcing a new method call +(going down in the call stack one function call) is not solving the +issue (thus making sure that it is not related with some caching issue +in the call itself). + +In conclusion there seems to be a hyperthreading issue with this +shared reference count. We keep this in mind if we eventually get +tasks with changing data memebers (as this problem could reappear there, +as then the ref_count actualy is in the same memory region as our +'user variables'). For now we leave the code like it is. + + +FFT Average with new call method: + + + +The performance of our new call method looks shockingly similar +to TBB with a slight, constant performance drop behind it. +This makes sense, as the basic principle (lock-free, classic work +stealing deque and the parallel call structure) are nearly the same. + +We will see if minor optimizations can even close this last gap. +Overall the performance at this point is good enough to move on +to implementing more functionality and to running tests on different +queues/stealing tactics etc. diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h index c96fb3f..ef634bd 100644 --- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h +++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h @@ -5,6 +5,7 @@ #include "pls/internal/scheduling/task.h" #include "pls/internal/scheduling/lambda_task.h" #include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/thread_state.h" namespace pls { namespace algorithm { @@ -17,10 +18,7 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2) { auto sub_task_2 = lambda_task_by_reference(function2); scheduler::spawn_child(sub_task_2); - scheduler::spawn_child(sub_task_1); - // TODO: Research the exact cause of this being faster -// function1(); // Execute first function 'inline' without spawning a sub_task object - scheduler::wait_for_all(); + scheduler::spawn_child_and_wait(sub_task_1); } template @@ -31,12 +29,9 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2, con auto sub_task_2 = lambda_task_by_reference(function2); auto sub_task_3 = lambda_task_by_reference(function3); - scheduler::spawn_child(sub_task_2); scheduler::spawn_child(sub_task_3); - scheduler::spawn_child(sub_task_1); - // TODO: Research the exact cause of this being faster -// function1(); // Execute first function 'inline' without spawning a sub_task object - scheduler::wait_for_all(); + scheduler::spawn_child(sub_task_2); + scheduler::spawn_child_and_wait(sub_task_1); } } diff --git a/lib/pls/include/pls/algorithms/parallel_for_impl.h b/lib/pls/include/pls/algorithms/parallel_for_impl.h index 786b875..b787b44 100644 --- a/lib/pls/include/pls/algorithms/parallel_for_impl.h +++ b/lib/pls/include/pls/algorithms/parallel_for_impl.h @@ -25,11 +25,13 @@ void parallel_for(RandomIt first, RandomIt last, const Function &function) { // Cut in half recursively long middle_index = num_elements / 2; - auto body = [=] { parallel_for(first + middle_index, last, function); }; - lambda_task_by_reference second_half_task(body); + auto body2 = [=] { parallel_for(first + middle_index, last, function); }; + lambda_task_by_reference second_half_task(body2); scheduler::spawn_child(second_half_task); - parallel_for(first, first + middle_index, function); + auto body1 = [=] { parallel_for(first, first + middle_index, function); }; + lambda_task_by_reference first_half_task(body1); + scheduler::spawn_child(first_half_task); scheduler::wait_for_all(); } } diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index c38ae64..54075fc 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -84,6 +84,15 @@ class scheduler { static void spawn_child(T &sub_task); /** + * Helper to spawn a child on the currently running task and waiting for it (skipping over the task-deque). + * + * @tparam T type of the new task + * @param sub_task the new task to be spawned + */ + template + static void spawn_child_and_wait(T &sub_task); + + /** * Helper to wait for all children of the currently executing task. */ static void wait_for_all(); diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h index 46265b7..98156dc 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h @@ -34,6 +34,11 @@ void scheduler::spawn_child(T &sub_task) { thread_state::get()->current_task_->spawn_child(sub_task); } +template +void scheduler::spawn_child_and_wait(T &sub_task) { + thread_state::get()->current_task_->spawn_child_and_wait(sub_task); +} + } } } diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h index 30e04b0..cdeb9f2 100644 --- a/lib/pls/include/pls/internal/scheduling/task.h +++ b/lib/pls/include/pls/internal/scheduling/task.h @@ -35,6 +35,8 @@ class task { template void spawn_child(T &&sub_task); + template + void spawn_child_and_wait(T &&sub_task); void wait_for_all(); private: @@ -58,6 +60,19 @@ void task::spawn_child(T &&sub_task) { thread_state::get()->deque_.push_tail(const_task); } +template +void task::spawn_child_and_wait(T &&sub_task) { + PROFILE_FORK_JOIN_STEALING("spawn_child") + static_assert(std::is_base_of::type>::value, "Only pass task subclasses!"); + + // Assign forced values (for stack and parent management) + sub_task.parent_ = nullptr; + sub_task.deque_state_ = thread_state::get()->deque_.save_state(); + sub_task.execute(); + + wait_for_all(); +} + } } } diff --git a/lib/pls/src/internal/data_structures/locking_deque.cpp b/lib/pls/src/internal/data_structures/locking_deque.cpp deleted file mode 100644 index 90971ce..0000000 --- a/lib/pls/src/internal/data_structures/locking_deque.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#include "pls/internal/data_structures/locking_deque.h" - -namespace pls { -namespace internal { -namespace data_structures { - -locking_deque_item *locking_deque_internal::pop_head_internal() { - std::lock_guard lock{lock_}; - - if (head_ == nullptr) { - return nullptr; - } - - locking_deque_item *result = head_; - head_ = head_->next_; - if (head_ == nullptr) { - tail_ = nullptr; - } else { - head_->prev_ = nullptr; - } - - return result; -} - -locking_deque_item *locking_deque_internal::pop_tail_internal() { - std::lock_guard lock{lock_}; - - if (tail_ == nullptr) { - return nullptr; - } - - locking_deque_item *result = tail_; - tail_ = tail_->prev_; - if (tail_ == nullptr) { - head_ = nullptr; - } else { - tail_->next_ = nullptr; - } - - return result; -} - -void locking_deque_internal::push_tail_internal(locking_deque_item *new_item) { - std::lock_guard lock{lock_}; - - if (tail_ != nullptr) { - tail_->next_ = new_item; - } else { - head_ = new_item; - } - new_item->prev_ = tail_; - new_item->next_ = nullptr; - tail_ = new_item; -} - -} -} -} diff --git a/media/5044f0a1_fft_average.png b/media/5044f0a1_fft_average.png new file mode 100644 index 0000000..deddf73 Binary files /dev/null and b/media/5044f0a1_fft_average.png differ -- libgit2 0.26.0