Commit 1b576824 by FritzFlorian

First 'crash free' version.

This version runs through our initial fft and fib tests. However, it is not tested further in any way. Additionally, we added a locking deque, potentially hurting performance and moving away from our initial goal.
parent c6dd2fc0
Pipeline #1341 failed with stages
in 40 seconds
......@@ -90,8 +90,8 @@ complex_vector prepare_input(int input_size) {
return data;
}
static constexpr int NUM_ITERATIONS = 1000;
constexpr size_t NUM_THREADS = 2;
static constexpr int NUM_ITERATIONS = 500;
constexpr size_t NUM_THREADS = 8;
constexpr size_t NUM_TASKS = 128;
constexpr size_t MAX_TASK_STACK_SIZE = 0;
......@@ -127,14 +127,14 @@ int main() {
std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
start = std::chrono::steady_clock::now();
for (int i = 0; i < NUM_ITERATIONS; i++) {
complex_vector input_1(initial_input);
fft_normal(input_1.begin(), INPUT_SIZE);
}
end = std::chrono::steady_clock::now();
std::cout << "Normal: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
// start = std::chrono::steady_clock::now();
// for (int i = 0; i < NUM_ITERATIONS; i++) {
// complex_vector input_1(initial_input);
// fft_normal(input_1.begin(), INPUT_SIZE);
// }
// end = std::chrono::steady_clock::now();
// std::cout << "Normal: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
// << std::endl;
return 0;
}
......@@ -58,7 +58,7 @@ add_library(pls STATIC
include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
include/pls/internal/scheduling/cont_manager.h
include/pls/internal/scheduling/cont.h
include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h include/pls/internal/scheduling/memory_block.h include/pls/internal/scheduling/thread_state_static.h)
include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h include/pls/internal/scheduling/memory_block.h include/pls/internal/scheduling/thread_state_static.h src/internal/base/error_handling.cpp)
# Add everything in `./include` to be in the include path of this project
target_include_directories(pls
......
......@@ -12,6 +12,9 @@
* (or its inclusion adds too much overhead).
*/
#define PLS_ERROR(msg) printf("%s\n", msg); exit(1);
#define PLS_ASSERT(cond, msg) if (!(cond)) { PLS_ERROR(msg) }
void pls_error(const char *msg);
#define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
#endif //PLS_ERROR_HANDLING_H
......@@ -43,9 +43,7 @@ class delayed_initialization {
template<typename ...ARGS>
void initialize(ARGS &&...args) {
if (initialized_) {
PLS_ASSERT(!initialized_, "Can only initialize delayed wrapper object once!");
}
PLS_ASSERT(!initialized_, "Can only initialize delayed wrapper object once!");
new((void *) memory_.data()) T(std::forward<ARGS>(args)...);
initialized_ = true;
......@@ -59,9 +57,8 @@ class delayed_initialization {
}
T &object() {
if (!initialized_) {
PLS_ASSERT(initialized_, "Can not use an uninitialized delayed wrapper object!")
}
PLS_ASSERT(initialized_, "Can not use an uninitialized delayed wrapper object!");
return *reinterpret_cast<T *>(memory_.data());
}
......
......@@ -9,6 +9,7 @@
#include "pls/internal/data_structures/stamped_integer.h"
#include "pls/internal/data_structures/delayed_initialization.h"
#include "pls/internal/base/alignment.h"
#include "pls/internal/base/error_handling.h"
#include "parallel_result.h"
#include "memory_block.h"
......@@ -28,7 +29,10 @@ class base_cont {
explicit base_cont(base_cont *parent, memory_block *memory_block, bool is_right_child)
: parent_{parent},
memory_block_{memory_block},
is_right_child_{is_right_child} {};
is_right_child_{is_right_child} {
PLS_ASSERT(parent_ == nullptr || parent_->memory_block_->get_depth() == memory_block_->get_depth() - 1,
"Must only build cont chains with matching depth!")
};
/**
* Execute the continuation itself.
......@@ -77,7 +81,8 @@ class cont : public base_cont {
using BASE_RES_TYPE = typename std::remove_cv<typename std::remove_reference<RES_TYPE>::type>::type;
static void execute(cont &cont) {
parallel_result<BASE_RES_TYPE> result{cont.function_((*cont.left_result_).value(), (*cont.right_result_).value())};
parallel_result<BASE_RES_TYPE>
result{cont.function_((*cont.left_result_).value(), (*cont.right_result_).value())};
if (result.fast_path() && cont.parent_ != nullptr) {
if (cont.is_right_child()) {
cont.parent_->store_right_result(std::move(result));
......
......@@ -47,11 +47,11 @@ class cont_manager {
}
void move_active_node(int depth) {
if (depth < 0) {
for (int i = 0; i < (depth * -1); i++) {
for (long i = 0; i < (depth * -1); i++) {
active_node_ = active_node_->get_prev();
}
} else {
for (int i = 0; i < depth; i++) {
for (long i = 0; i < depth; i++) {
active_node_ = active_node_->get_next();
}
}
......@@ -88,49 +88,10 @@ class cont_manager {
// Copy fall through status and reset it (for potentially nested execution paths).
auto *notified_cont = fall_through_cont_;
bool notifier_is_right_child = fall_through_child_right;
// std::cout << "Notifying Cont on core " << my_state.get_id() << " and depth "
// << notified_cont->get_memory_block()->get_depth() << std::endl;
fall_through_cont_ = nullptr;
fall_through_ = false;
// Special case for lock free implementation.
// When we finish a 'left strain' it can happen that the 'right strain'
// is currently being stolen. We need to be sure that this steal is finished
// as we need the thief's memory blocks in case we get blocked by it.
if (!notifier_is_right_child) {
// Check to execute right child directly...
auto &atomic_state = notified_cont->get_memory_block()->get_state();
auto old_state = atomic_state.load();
memory_block::stamped_state target_state{old_state.stamp + 1, memory_block::state::execute_local};
memory_block::stamped_state exchanged_state = atomic_state.exchange(target_state);
if (exchanged_state.value != memory_block::state::stolen) {
// We 'overruled' the stealing thread and execute the other task ourselves.
// We can be sure that only we were involved in executing the child tasks of the parent_continuation...
notified_cont->get_memory_block()->get_results_missing().fetch_add(-1);
my_state.parent_cont_ = notified_cont;
my_state.right_spawn_ = true;
notified_cont->execute_task();
if (falling_through()) {
// ... if the second strain was interrupted we fall through without scheduling the parent_continuation
// (the currently pending/interrupted strain will do this itself).
return;
} else {
// ... we could finish the second strain.
// Register the parent continuation for being notified.
// (This is not the most efficient, as we could simply execute it. However,
// this way of doing it spares us from duplicating a lot of code).
fall_through_and_notify_cont(notified_cont, true);
return;
}
}
// Right side is 'fully' stolen. We can continue to inform the parent like we would do normally.
}
// Keep the target chain before we execute, as this potentially frees the memory
auto *target_chain = notified_cont->get_memory_block()->get_offered_chain().load();
......@@ -145,12 +106,10 @@ class cont_manager {
// We do not own the thing we will execute.
// Own it by swapping the chain belonging to it in.
aquire_memory_chain(notified_cont->get_memory_block());
// std::cout << "Now in charge of memory chain on core " << my_state.get_id() << std::endl;
}
my_state.parent_cont_ = notified_cont->get_parent();
my_state.right_spawn_ = notified_cont->is_right_child();
active_node_ = notified_cont->get_memory_block();
// std::cout << "Execute cont on core " << my_state.get_id() << std::endl;
notified_cont->execute();
if (!falling_through() && notified_cont->get_parent() != nullptr) {
fall_through_and_notify_cont(notified_cont->get_parent(), notified_cont->is_right_child());
......@@ -166,7 +125,6 @@ class cont_manager {
// We own the thing we are not allowed to execute.
// Get rid of the ownership by using the offered chain.
aquire_memory_chain(target_chain);
// std::cout << "No longer in charge of chain above on core " << my_state.get_id() << std::endl;
}
move_active_node_to_start();
......
......@@ -19,7 +19,7 @@ class memory_block {
memory_block(char *memory_buffer,
size_t memory_buffer_size,
memory_block *prev,
unsigned int depth)
int depth)
: prev_{prev},
next_{nullptr},
offered_chain_{nullptr},
......@@ -32,10 +32,7 @@ class memory_block {
template<typename T, typename ...ARGS>
T *place_in_buffer(ARGS &&...args) {
if (memory_buffer_used_) {
pls::internal::base::this_thread::sleep(100000);
PLS_ASSERT(!memory_buffer_used_, "Must only allocate one continuation at once per node.");
}
PLS_ASSERT(!memory_buffer_used_, "Must only allocate one continuation at once per node.");
memory_buffer_used_ = true;
return new(memory_buffer_) T(std::forward<ARGS>(args)...);
......@@ -84,7 +81,7 @@ class memory_block {
return results_missing_;
}
unsigned int get_depth() const noexcept {
int get_depth() const noexcept {
return depth_;
}
......@@ -130,7 +127,7 @@ class memory_block {
// Each element stays at a fixed depth for the entire application run.
// Swapping parts of a memory chain will not reorder it, as always parts of
// the same size are exchanged.
const unsigned int depth_;
const int depth_;
};
}
......
......@@ -56,6 +56,9 @@ class task_manager {
// Returns a pair containing the actual task and if the steal was successful.
base_task *steal_remote_task(cont_manager &stealing_cont_manager) {
std::lock_guard<base::spin_lock> lock{lock_};
// TODO: See if we can somehow make this trade lock free (and still be correct)
auto stolen_task_handle = task_deque_.pop_top();
if (stolen_task_handle) {
base_task *stolen_task = (*stolen_task_handle).task_;
......@@ -64,50 +67,14 @@ class task_manager {
auto &atomic_state = stolen_task_memory->get_state();
auto &atomic_offered_chain = stolen_task_memory->get_offered_chain();
// std::cout << "Nearly stole on core " << thread_state::get().get_id() << " task with depth "
// << stolen_task_depth << std::endl;
// Move our chain forward for stealing...
// TODO: We ignore all we tried with lock free implementations here, just store the state how it is supposed to be
stealing_cont_manager.move_active_node(stolen_task_depth);
auto offered_chain = stealing_cont_manager.get_active_node();
stealing_cont_manager.move_active_node(1);
atomic_offered_chain.store(offered_chain);
atomic_state.store(memory_block::stolen);
if (offered_chain == (*stolen_task_handle).task_memory_block_) {
PLS_ASSERT(false, "How would we offer our own chain? We only offer when stealing!");
}
auto last_state = atomic_state.load();
if (last_state.value != memory_block::initialized) {
stealing_cont_manager.move_active_node(-stolen_task_depth);
return nullptr;
}
auto last_offered_chain = atomic_offered_chain.load();
memory_block::stamped_state loop_state = {last_state.stamp + 1, memory_block::stealing};
if (atomic_state.compare_exchange_strong(last_state, loop_state)) {
while (true) {
if (atomic_offered_chain.compare_exchange_strong(last_offered_chain, offered_chain)) {
break;
}
last_offered_chain = atomic_offered_chain.load();
last_state = atomic_state.load();
if (last_state != loop_state) {
stealing_cont_manager.move_active_node(-stolen_task_depth);
return nullptr;
}
}
if (atomic_state.compare_exchange_strong(loop_state, {loop_state.stamp + 1, memory_block::stolen})) {
// std::cout << "Steal!" << std::endl;
stealing_cont_manager.move_active_node(1);
return stolen_task;
} else {
return nullptr;
}
} else {
return nullptr;
}
return stolen_task;
} else {
return nullptr;
}
......
......@@ -44,7 +44,12 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
task_manager_{task_manager},
cont_manager_{cont_manager},
current_task_{nullptr},
random_{static_cast<unsigned long>(std::chrono::steady_clock::now().time_since_epoch().count())} {}
random_{static_cast<unsigned long>(std::chrono::steady_clock::now().time_since_epoch().count())} {};
// Clears the per-thread continuation bookkeeping (right-spawn flag and
// pending parent continuation) so this thread_state starts a work section
// with no state left over from a previous run.
// NOTE(review): called at the top of scheduler::work_thread_work_section()
// (see the scheduler.cpp hunk in this commit).
void reset() {
right_spawn_ = false;
parent_cont_ = nullptr;
}
/**
* Convenience helper to get the thread_state instance associated with this thread.
......
#include "pls/internal/base/error_handling.h"
// Out-of-line error reporter used by the PLS_ASSERT macro.
// Expands PLS_ERROR(msg), which prints msg followed by a newline and then
// terminates the process via exit(1) (see error_handling.h).
// Keeping this in a .cpp file avoids pulling printf/exit into every header
// that only needs the assertion macro's declaration.
void pls_error(const char *msg) {
PLS_ERROR(msg);
}
......@@ -54,6 +54,7 @@ void scheduler::work_thread_main_loop() {
void scheduler::work_thread_work_section() {
auto &my_state = thread_state::get();
my_state.reset();
auto &my_cont_manager = my_state.get_cont_manager();
auto const num_threads = my_state.get_scheduler().num_threads();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment