Document FFT performance problem and fine tune it's scheduling

e6add964 · FritzFlorian · 2263703b · e6add964 · e6add964 · e6add964
Commit e6add964 authored Jun 06, 2019 by FritzFlorian
8 changed files
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -354,4 +354,31 @@ fix the source rather then 'circumventing' it with these extra tasks.
 performance, as contemption on the bus/cache is always bad)
+After some research we think that the issue is down to many threads
+referencing the same atomic reference counter. We think so because
+even cache aligning the shared refernce count does not fix the issue
+when using the direct function call. Also, forcing a new method call
+(going down in the call stack one function call) is not solving the
+issue (thus making sure that it is not related with some caching issue
+in the call itself).
+In conclusion there seems to be a hyperthreading issue with this
+shared reference count. We keep this in mind if we eventually get
+tasks with changing data memebers (as this problem could reappear there,
+as then the ref_count actualy is in the same memory region as our
+'user variables'). For now we leave the code like it is.
+FFT Average with new call method:
+<img src="media/5044f0a1_fft_average.png" width="400"/>
+The performance of our new call method looks shockingly similar
+to TBB with a slight, constant performance drop behind it.
+This makes sense, as the basic principle (lock-free, classic work
+stealing deque and the parallel call structure) are nearly the same.
+We will see if minor optimizations can even close this last gap.
+Overall the performance at this point is good enough to move on
+to implementing more functionality and to running tests on different
+queues/stealing tactics etc.
--- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
+++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
@@ -5,6 +5,7 @@
 #include "pls/internal/scheduling/task.h"
 #include "pls/internal/scheduling/lambda_task.h"
 #include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/thread_state.h"
 namespace pls {
 namespace algorithm {
@@ -17,10 +18,7 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2) {
  auto sub_task_2 = lambda_task_by_reference<Function2>(function2);
  scheduler::spawn_child(sub_task_2);
-  scheduler::spawn_child(sub_task_1);
+  scheduler::spawn_child_and_wait(sub_task_1);
-  // TODO: Research the exact cause of this being faster
-//  function1(); // Execute first function 'inline' without spawning a sub_task object
-  scheduler::wait_for_all();
 }
 template<typename Function1, typename Function2, typename Function3>
@@ -31,12 +29,9 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2, con
  auto sub_task_2 = lambda_task_by_reference<Function2>(function2);
  auto sub_task_3 = lambda_task_by_reference<Function3>(function3);
-  scheduler::spawn_child(sub_task_2);
  scheduler::spawn_child(sub_task_3);
-  scheduler::spawn_child(sub_task_1);
+  scheduler::spawn_child(sub_task_2);
-  // TODO: Research the exact cause of this being faster
+  scheduler::spawn_child_and_wait(sub_task_1);
-//  function1(); // Execute first function 'inline' without spawning a sub_task object
-  scheduler::wait_for_all();
 }
 }

--- a/lib/pls/include/pls/algorithms/parallel_for_impl.h
+++ b/lib/pls/include/pls/algorithms/parallel_for_impl.h
@@ -25,11 +25,13 @@ void parallel_for(RandomIt first, RandomIt last, const Function &function) {
    // Cut in half recursively
    long middle_index = num_elements / 2;
-    auto body = [=] { parallel_for(first + middle_index, last, function); };
+    auto body2 = [=] { parallel_for(first + middle_index, last, function); };
-    lambda_task_by_reference<decltype(body)> second_half_task(body);
+    lambda_task_by_reference<decltype(body2)> second_half_task(body2);
    scheduler::spawn_child(second_half_task);
-    parallel_for(first, first + middle_index, function);
+    auto body1 = [=] { parallel_for(first, first + middle_index, function); };
+    lambda_task_by_reference<decltype(body1)> first_half_task(body1);
+    scheduler::spawn_child(first_half_task);
    scheduler::wait_for_all();
  }
 }

--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -84,6 +84,15 @@ class scheduler {
  static void spawn_child(T &sub_task);
  /**
+   * Helper to spawn a child on the currently running task and waiting for it (skipping over the task-deque).
+   *
+   * @tparam T type of the new task
+   * @param sub_task the new task to be spawned
+   */
+  template<typename T>
+  static void spawn_child_and_wait(T &sub_task);
+  /**
   * Helper to wait for all children of the currently executing task.
   */
  static void wait_for_all();

--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -34,6 +34,11 @@ void scheduler::spawn_child(T &sub_task) {
  thread_state::get()->current_task_->spawn_child(sub_task);
 }
+template<typename T>
+void scheduler::spawn_child_and_wait(T &sub_task) {
+  thread_state::get()->current_task_->spawn_child_and_wait(sub_task);
+}
 }
 }
 }

--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -35,6 +35,8 @@ class task {
  template<typename T>
  void spawn_child(T &&sub_task);
+  template<typename T>
+  void spawn_child_and_wait(T &&sub_task);
  void wait_for_all();
 private:
@@ -58,6 +60,19 @@ void task::spawn_child(T &&sub_task) {
  thread_state::get()->deque_.push_tail(const_task);
 }
+template<typename T>
+void task::spawn_child_and_wait(T &&sub_task) {
+  PROFILE_FORK_JOIN_STEALING("spawn_child")
+  static_assert(std::is_base_of<task, typename std::remove_reference<T>::type>::value, "Only pass task subclasses!");
+  // Assign forced values (for stack and parent management)
+  sub_task.parent_ = nullptr;
+  sub_task.deque_state_ = thread_state::get()->deque_.save_state();
+  sub_task.execute();
+  wait_for_all();
+}
 }
 }
 }

--- a/lib/pls/src/internal/data_structures/locking_deque.cpp
+++ b/lib/pls/src/internal/data_structures/locking_deque.cpp
-#include <mutex>
-#include "pls/internal/data_structures/locking_deque.h"
-namespace pls {
-namespace internal {
-namespace data_structures {
-locking_deque_item *locking_deque_internal::pop_head_internal() {
-  std::lock_guard<base::spin_lock> lock{lock_};
-  if (head_ == nullptr) {
-    return nullptr;
-  }
-  locking_deque_item *result = head_;
-  head_ = head_->next_;
-  if (head_ == nullptr) {
-    tail_ = nullptr;
-  } else {
-    head_->prev_ = nullptr;
-  }
-  return result;
-}
-locking_deque_item *locking_deque_internal::pop_tail_internal() {
-  std::lock_guard<base::spin_lock> lock{lock_};
-  if (tail_ == nullptr) {
-    return nullptr;
-  }
-  locking_deque_item *result = tail_;
-  tail_ = tail_->prev_;
-  if (tail_ == nullptr) {
-    head_ = nullptr;
-  } else {
-    tail_->next_ = nullptr;
-  }
-  return result;
-}
-void locking_deque_internal::push_tail_internal(locking_deque_item *new_item) {
-  std::lock_guard<base::spin_lock> lock{lock_};
-  if (tail_ != nullptr) {
-    tail_->next_ = new_item;
-  } else {
-    head_ = new_item;
-  }
-  new_item->prev_ = tail_;
-  new_item->next_ = nullptr;
-  tail_ = new_item;
-}
-}
-}
-}
--- a/media/5044f0a1_fft_average.png
+++ b/media/5044f0a1_fft_average.png