From 9fa9296a2e99c2eaf09831e8663c7f6d7c4996b1 Mon Sep 17 00:00:00 2001
From: FritzFlorian <flo.fritz@t-online.de>
Date: Sun, 28 Jun 2020 20:06:09 +0200
Subject: [PATCH] Final version ready for benchmarking.

---
 CMakeLists.txt                                                       |  3 +++
 app/benchmark_fft/CMakeLists.txt                                     |  2 ++
 app/benchmark_fib/CMakeLists.txt                                     |  2 ++
 app/benchmark_matrix/CMakeLists.txt                                  |  2 ++
 app/benchmark_matrix/main.cpp                                        | 11 +++++++++++
 app/benchmark_matrix_div_conquer/CMakeLists.txt                      |  2 ++
 app/benchmark_matrix_div_conquer/main.cpp                            | 10 +++-------
 app/benchmark_unbalanced/CMakeLists.txt                              |  2 ++
 app/benchmark_unbalanced/main.cpp                                    |  2 +-
 extern/benchmark_base/CMakeLists.txt                                 |  2 +-
 extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h    | 71 +++++++++++++++++++++++------------------------------------------------
 extern/benchmark_base/src/matrix_div_conquer.cpp                     | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_runner/benchmark_runner.h                           |  2 +-
 lib/pls/CMakeLists.txt                                               |  2 +-
 lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h          |  5 -----
 lib/pls/include/pls/algorithms/for_each.h                            |  8 ++++----
 lib/pls/include/pls/algorithms/for_each_impl.h                       | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
 lib/pls/include/pls/internal/base/error_handling.h                   |  5 +++++
 lib/pls/include/pls/internal/base/stack_allocator.h                  |  1 +
 lib/pls/include/pls/internal/scheduling/base_task.h                  |  9 +++++----
 lib/pls/include/pls/internal/scheduling/lock_free/task.h             |  4 +---
 lib/pls/include/pls/internal/scheduling/scheduler_impl.h             | 22 ++++++++++++++++++----
 lib/pls/include/pls/pls.h                                            |  3 +++
 lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp | 38 ++++++++++++++++++++++----------------
 lib/pls/src/internal/scheduling/lock_free/task.cpp                   | 32 ++++++++++++--------------------
 lib/pls/src/internal/scheduling/lock_free/task_manager.cpp           |  9 +++++----
 lib/pls/src/internal/scheduling/scheduler.cpp                        | 15 ++++++++-------
 27 files changed, 255 insertions(+), 149 deletions(-)
 create mode 100644 extern/benchmark_base/src/matrix_div_conquer.cpp
 delete mode 100644 lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 215aa65..12cb45e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,9 @@ ADD_CUSTOM_TARGET(install.pls
         -P ${CMAKE_BINARY_DIR}/cmake_install.cmake)
 ADD_DEPENDENCIES(install.pls context_switcher pls)
 
+# ... second custom target to only build the benchmarks.
+ADD_CUSTOM_TARGET(benchmark.pls)
+
 # Include examples
 add_subdirectory(app/playground)
 add_subdirectory(app/benchmark_fft)
diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
index cfef00b..c351434 100644
--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_executable(benchmark_fft_pls_v3 main.cpp)
 target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
+ADD_DEPENDENCIES(benchmark.pls benchmark_fft_pls_v3)
+
 if (EASY_PROFILER)
     target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_fib/CMakeLists.txt b/app/benchmark_fib/CMakeLists.txt
index 5233f4f..1369f79 100644
--- a/app/benchmark_fib/CMakeLists.txt
+++ b/app/benchmark_fib/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_executable(benchmark_fib_pls_v3 main.cpp)
 target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base)
+ADD_DEPENDENCIES(benchmark.pls benchmark_fib_pls_v3)
+
 if (EASY_PROFILER)
     target_link_libraries(benchmark_fib_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_matrix/CMakeLists.txt b/app/benchmark_matrix/CMakeLists.txt
index fe40f88..e2a6619 100644
--- a/app/benchmark_matrix/CMakeLists.txt
+++ b/app/benchmark_matrix/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_executable(benchmark_matrix_pls_v3 main.cpp)
 target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base)
+ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_pls_v3)
+
 if (EASY_PROFILER)
     target_link_libraries(benchmark_matrix_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index 4ff4075..aa4348b 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -37,6 +37,12 @@ int main(int argc, char **argv) {
   pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};
 
   if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");
+    runner.add_custom_stats_field("T_inf");
+#endif
+
     printf("Running isolated measurement...\n");
     runner.enable_memory_stats();
     runner.pre_allocate_stats();
@@ -45,6 +51,11 @@ int main(int argc, char **argv) {
       scheduler.perform_work([&]() {
         result.multiply(a, b);
       });
+    }, [&]() {}, [&]() {
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
     });
     runner.commit_results(true);
   } else {
diff --git a/app/benchmark_matrix_div_conquer/CMakeLists.txt b/app/benchmark_matrix_div_conquer/CMakeLists.txt
index 4d510c0..a8ed2ce 100644
--- a/app/benchmark_matrix_div_conquer/CMakeLists.txt
+++ b/app/benchmark_matrix_div_conquer/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp)
 target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base)
+ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_div_conquer_pls_v3)
+
 if (EASY_PROFILER)
     target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp
index 050090f..0175d0f 100644
--- a/app/benchmark_matrix_div_conquer/main.cpp
+++ b/app/benchmark_matrix_div_conquer/main.cpp
@@ -92,12 +92,10 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
   pls::spawn(
       [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
   );
-  pls::spawn(
+  pls::spawn_and_sync(
       [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
   );
 
-  pls::sync();
-
   // Combine results
   for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
     // The layout is not important here, ass all have the same order, so just sum element wise
@@ -108,7 +106,6 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
   }
 }
 
-constexpr int MAX_NUM_TASKS = 10;
 constexpr int MAX_STACK_SIZE = 4096 * 1;
 
 int main(int argc, char **argv) {
@@ -128,7 +125,6 @@ int main(int argc, char **argv) {
   // Fill data arrays as needed
   a.fill_default_data();
   b.fill_default_data();
-  matrix_div_conquer::fill_block_lookup(size);
 
   // Strain local data
   std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
@@ -138,7 +134,7 @@ int main(int argc, char **argv) {
   while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
     auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
     buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
-    for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
+    for (size_t thread_id = 0; thread_id < buffers_needed; thread_id++) {
       auto &depth_thread_buffers = depth_buffers.emplace_back();
       for (int i = 0; i < 8; i++) {
         size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
@@ -159,7 +155,7 @@ int main(int argc, char **argv) {
   string full_directory = settings.output_directory_ + "/PLS_v3/";
   benchmark_runner runner{full_directory, test_name};
 
-  pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};
+  pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE};
 
   if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
     printf("Running isolated measurement...\n");
diff --git a/app/benchmark_unbalanced/CMakeLists.txt b/app/benchmark_unbalanced/CMakeLists.txt
index 9c86805..26f4649 100644
--- a/app/benchmark_unbalanced/CMakeLists.txt
+++ b/app/benchmark_unbalanced/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_executable(benchmark_unbalanced_pls_v3 main.cpp)
 target_link_libraries(benchmark_unbalanced_pls_v3 benchmark_runner benchmark_base pls)
+ADD_DEPENDENCIES(benchmark.pls benchmark_unbalanced_pls_v3)
+
 if (EASY_PROFILER)
     target_link_libraries(benchmark_unbalanced_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp
index d4936ac..c482090 100644
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -31,7 +31,7 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
   return count_child_nodes(root);
 }
 
-constexpr int MAX_NUM_TASKS = 256;
+constexpr int MAX_NUM_TASKS = 180;
 constexpr int MAX_STACK_SIZE = 4096 * 1;
 
 int main(int argc, char **argv) {
diff --git a/extern/benchmark_base/CMakeLists.txt b/extern/benchmark_base/CMakeLists.txt
index da76867..03de16c 100644
--- a/extern/benchmark_base/CMakeLists.txt
+++ b/extern/benchmark_base/CMakeLists.txt
@@ -9,7 +9,7 @@ add_library(benchmark_base STATIC
         include/benchmark_base/unbalanced.h src/unbalanced.cpp
         include/benchmark_base/range.h
         include/benchmark_base/fib.h
-        include/benchmark_base/matrix_div_conquer.h)
+        include/benchmark_base/matrix_div_conquer.h src/matrix_div_conquer.cpp)
 
 target_include_directories(benchmark_base
         PUBLIC
diff --git a/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h
index 9002de1..f5e5221 100644
--- a/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h
+++ b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h
@@ -1,51 +1,27 @@
 #ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
 #define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
 
-#include <array>
+#include <vector>
+#include <cstdio>
+#include <cstdlib>
 
 namespace comparison_benchmarks {
 namespace base {
 namespace matrix_div_conquer {
 
-const int MATRIX_SIZE = 128;
 const int CUTOFF_SIZE = 8;
 
-const int NUM_ITERATIONS = 100;
-const int WARMUP_ITERATIONS = 10;
-
-// Helpers to directly index into blocked matrices
-const size_t MAX_SIZE = 128;
-std::array<std::array<size_t, MAX_SIZE>, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN
-void fill_block_lookup(size_t size = MAX_SIZE) {
-  if (size <= 1) {
-    BLOCK_LOOKUP[0][0] = 0;
-    return;
-  }
-
-  fill_block_lookup(size / 2);
-
-  size_t elements_per_quarter = (size / 2) * (size / 2);
-  for (size_t row = 0; row < size / 2; row++) {
-    for (size_t column = 0; column < size / 2; column++) {
-      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
-      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
-      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
-    }
-  }
-}
-
 class blocked_matrix_view {
  public:
-  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}
-
-  void fill_default_data() {
-    for (size_t row = 0; row < size_; row++) {
-      for (size_t column = 0; column < size_; column++) {
-        at(row, column) = row;
-      }
+  blocked_matrix_view(double *data, size_t size) : data_{data},
+                                                   size_{size} {
+    if (size > BLOCK_LOOKUPS_SIZE) {
+      init_block_lookup(size);
     }
   }
 
+  void fill_default_data();
+
   blocked_matrix_view quadrant_1_1() {
     size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
     return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2);
@@ -64,7 +40,7 @@ class blocked_matrix_view {
   }
 
   double &at(size_t row, size_t column) {
-    return data_[BLOCK_LOOKUP[row][column]];
+    return data_[BLOCK_LOOKUPS[block_lookup_at(row, column)]];
   }
 
   double *get_data() {
@@ -72,22 +48,21 @@ class blocked_matrix_view {
   }
 
  private:
-  double *data_;
-  size_t size_;
-};
+  double *const data_;
+  const size_t size_;
 
-void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
-  for (size_t i = 0; i < size; i++) {
-    for (size_t j = 0; j < size; j++) {
-      result.at(i, j) = 0;
-    }
-    for (size_t j = 0; j < size; j++) {
-      for (size_t k = 0; k < size; k++) {
-        result.at(i, j) += a.at(i, k) * b.at(k, j);
-      }
-    }
+  // Lookup indices for non divide-conquer block lookups
+  static std::vector<size_t> BLOCK_LOOKUPS;
+  static size_t BLOCK_LOOKUPS_SIZE;
+
+  static void fill_block_lookup(size_t size, std::vector<size_t> &BLOCK_LOOKUP);
+  static void init_block_lookup(size_t max_size);
+  static size_t block_lookup_at(size_t row, size_t column) {
+    return row * BLOCK_LOOKUPS_SIZE + column;
   }
-}
+};
+
+void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b);
 
 }
 }
diff --git a/extern/benchmark_base/src/matrix_div_conquer.cpp b/extern/benchmark_base/src/matrix_div_conquer.cpp
new file mode 100644
index 0000000..8155588
--- /dev/null
+++ b/extern/benchmark_base/src/matrix_div_conquer.cpp
@@ -0,0 +1,64 @@
+#include <cstdio>
+#include <cstdlib>
+#include "benchmark_base/matrix_div_conquer.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace matrix_div_conquer {
+
+void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
+  for (size_t i = 0; i < size; i++) {
+    for (size_t j = 0; j < size; j++) {
+      result.at(i, j) = 0;
+    }
+    for (size_t j = 0; j < size; j++) {
+      for (size_t k = 0; k < size; k++) {
+        result.at(i, j) += a.at(i, k) * b.at(k, j);
+      }
+    }
+  }
+}
+
+void blocked_matrix_view::fill_default_data() {
+  for (size_t row = 0; row < size_; row++) {
+    for (size_t column = 0; column < size_; column++) {
+      at(row, column) = row;
+    }
+  }
+}
+
+std::vector<size_t> blocked_matrix_view::BLOCK_LOOKUPS;
+size_t blocked_matrix_view::BLOCK_LOOKUPS_SIZE;
+
+void blocked_matrix_view::fill_block_lookup(size_t size, std::vector<size_t> &BLOCK_LOOKUP) {
+  if (size <= 1) {
+    BLOCK_LOOKUP[block_lookup_at(0, 0)] = 0;
+    return;
+  }
+
+  fill_block_lookup(size / 2, BLOCK_LOOKUP);
+
+  size_t elements_per_quarter = (size / 2) * (size / 2);
+  for (size_t row = 0; row < size / 2; row++) {
+    for (size_t column = 0; column < size / 2; column++) {
+      BLOCK_LOOKUP[block_lookup_at(row, size / 2 + column)] =
+          BLOCK_LOOKUP[block_lookup_at(row, column)] + elements_per_quarter;
+      BLOCK_LOOKUP[block_lookup_at(size / 2 + row, column)] =
+          BLOCK_LOOKUP[block_lookup_at(row, column)] + 2 * elements_per_quarter;
+      BLOCK_LOOKUP[block_lookup_at(size / 2 + row, size / 2 + column)] =
+          BLOCK_LOOKUP[block_lookup_at(row, column)] + 3 * elements_per_quarter;
+    }
+  }
+}
+
+void blocked_matrix_view::init_block_lookup(size_t max_size) {
+  if (BLOCK_LOOKUPS.size() < max_size) {
+    BLOCK_LOOKUPS = std::vector<size_t>(max_size * max_size);
+    BLOCK_LOOKUPS_SIZE = max_size;
+    fill_block_lookup(max_size, BLOCK_LOOKUPS);
+  }
+}
+
+}
+}
+}
diff --git a/extern/benchmark_runner/benchmark_runner.h b/extern/benchmark_runner/benchmark_runner.h
index 1a89990..36a8f55 100644
--- a/extern/benchmark_runner/benchmark_runner.h
+++ b/extern/benchmark_runner/benchmark_runner.h
@@ -299,7 +299,7 @@ class benchmark_runner {
       long wall_time_us = 0;
       wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l;
       wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l;
-      printf("Difference: %d\n", wall_time_us - times_[current_iteration]);
+      printf("Difference: %ld\n", wall_time_us - times_[current_iteration]);
       times_[current_iteration] = wall_time_us;
 
       if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) {
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index 8065e84..e127f98 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -49,7 +49,7 @@ add_library(pls STATIC
 
         include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp
         include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp
-        include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp include/pls/algorithms/divide_and_conquer_buffers.h)
+        include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp)
 
 # Dependencies for pls
 target_link_libraries(pls Threads::Threads)
diff --git a/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h b/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h
deleted file mode 100644
index 8786c30..0000000
--- a/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h
+++ /dev/null
@@ -1,5 +0,0 @@
-
-#ifndef PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
-#define PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
-
-#endif //PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
diff --git a/lib/pls/include/pls/algorithms/for_each.h b/lib/pls/include/pls/algorithms/for_each.h
index 7430ea5..9a1e300 100644
--- a/lib/pls/include/pls/algorithms/for_each.h
+++ b/lib/pls/include/pls/algorithms/for_each.h
@@ -7,14 +7,14 @@
 namespace pls::algorithm {
 
 template<typename Function, typename ExecutionStrategy>
-static void for_each_range(unsigned long first,
-                           unsigned long last,
+static void for_each_range(long first,
+                           long last,
                            const Function &function,
                            ExecutionStrategy &execution_strategy);
 
 template<typename Function>
-static void for_each_range(unsigned long first,
-                           unsigned long last,
+static void for_each_range(long first,
+                           long last,
                            const Function &function);
 
 template<typename RandomIt, typename Function, typename ExecutionStrategy>
diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h
index 722cb69..9ba4479 100644
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -10,10 +10,10 @@ namespace pls::algorithm {
 namespace internal {
 
 template<typename RandomIt, typename Function>
-static void for_each(const RandomIt first,
-                     const RandomIt last,
-                     const Function &function,
-                     const long min_elements) {
+static void for_each_iterator(const RandomIt first,
+                              const RandomIt last,
+                              const Function &function,
+                              const size_t min_elements) {
   using namespace ::pls::internal::scheduling;
 
   const long num_elements = std::distance(first, last);
@@ -27,16 +27,48 @@ static void for_each(const RandomIt first,
     const long middle_index = num_elements / 2;
 
     scheduler::spawn([first, middle_index, last, &function, min_elements] {
-      internal::for_each(first,
-                         first + middle_index,
-                         function,
-                         min_elements);
+      internal::for_each_iterator(first,
+                                  first + middle_index,
+                                  function,
+                                  min_elements);
+    });
+    scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] {
+      internal::for_each_iterator(first + middle_index,
+                                  last,
+                                  function,
+                                  min_elements);
+    });
+  }
+}
+
+template<typename Function>
+static void for_each_range(const long first,
+                           const long last,
+                           const Function &function,
+                           const size_t min_elements) {
+  using namespace ::pls::internal::scheduling;
+
+  const long num_elements = last - first;
+  if (num_elements <= min_elements) {
+    // calculate last elements in loop to avoid overhead
+    for (auto current = first; current != last; current++) {
+      function(current);
+    }
+  } else {
+    // Cut in half recursively
+    const long middle_index = num_elements / 2;
+
+    scheduler::spawn([first, middle_index, last, &function, min_elements] {
+      internal::for_each_range(first,
+                               first + middle_index,
+                               function,
+                               min_elements);
     });
     scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] {
-      internal::for_each(first + middle_index,
-                         last,
-                         function,
-                         min_elements);
+      internal::for_each_range(first + middle_index,
+                               last,
+                               function,
+                               min_elements);
     });
   }
 }
@@ -44,15 +76,14 @@ static void for_each(const RandomIt first,
 }
 
 template<typename RandomIt, typename Function, typename ExecutionStrategy>
-static void for_each(RandomIt
-                     first,
+static void for_each(RandomIt first,
                      RandomIt last,
                      const Function &function,
                      ExecutionStrategy
                      execution_strategy) {
   long num_elements = std::distance(first, last);
   return
-      internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements));
+      internal::for_each_iterator(first, last, function, execution_strategy.calculate_min_elements(num_elements));
 }
 
 template<typename RandomIt, typename Function>
@@ -61,20 +92,19 @@ static void for_each(RandomIt first, RandomIt last, const Function &function) {
 }
 
 template<typename Function, typename ExecutionStrategy>
-static void for_each_range(unsigned long first,
-                           unsigned long last,
+static void for_each_range(long first,
+                           long last,
                            const Function &function,
                            ExecutionStrategy execution_strategy) {
-  auto range = pls::internal::helpers::range(first, last);
-  return for_each(range.begin(), range.end(), function, execution_strategy);
+  long num_elements = last - first;
+  return internal::for_each_range(first, last, function, execution_strategy.calculate_min_elements(num_elements));
 }
 
 template<typename Function>
-static void for_each_range(unsigned long first,
-                           unsigned long last,
+static void for_each_range(long first,
+                           long last,
                            const Function &function) {
-  auto range = pls::internal::helpers::range(first, last);
-  return for_each(range.begin(), range.end(), function);
+  return for_each_range(first, last, function, dynamic_strategy{4});
 }
 
 }
diff --git a/lib/pls/include/pls/internal/base/error_handling.h b/lib/pls/include/pls/internal/base/error_handling.h
index 8704cc8..2a3e156 100644
--- a/lib/pls/include/pls/internal/base/error_handling.h
+++ b/lib/pls/include/pls/internal/base/error_handling.h
@@ -18,4 +18,9 @@ void pls_error(const char *msg);
 // TODO: Distinguish between debug/internal asserts and production asserts.
 #define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
 
+// Enable/Disable more expensive asserts.
+// On very small workloads also the 'normal' asserts can be disabled for more performance.
+//#define PLS_ASSERT_EXPENSIVE(cond, msg) if (!(cond)) { pls_error(msg); }
+#define PLS_ASSERT_EXPENSIVE(cond, msg)
+
 #endif //PLS_ERROR_HANDLING_H
diff --git a/lib/pls/include/pls/internal/base/stack_allocator.h b/lib/pls/include/pls/internal/base/stack_allocator.h
index ec99bf5..6296131 100644
--- a/lib/pls/include/pls/internal/base/stack_allocator.h
+++ b/lib/pls/include/pls/internal/base/stack_allocator.h
@@ -5,6 +5,7 @@
 #include <cstddef>
 
 namespace pls::internal::base {
+
 class stack_allocator {
  public:
   virtual char *allocate_stack(size_t size) = 0;
diff --git a/lib/pls/include/pls/internal/scheduling/base_task.h b/lib/pls/include/pls/internal/scheduling/base_task.h
index 231c402..3e744f0 100644
--- a/lib/pls/include/pls/internal/scheduling/base_task.h
+++ b/lib/pls/include/pls/internal/scheduling/base_task.h
@@ -56,12 +56,13 @@ struct base_task {
   }
 
   // General task information
-  unsigned depth_;
-  unsigned thread_id_;
+  const unsigned depth_;
+  const unsigned thread_id_;
 
   // Stack/continuation management
-  char *stack_memory_;
-  size_t stack_size_;
+  char * const stack_memory_;
+  const size_t stack_size_;
+
   context_switcher::continuation continuation_;
   bool is_synchronized_;
   bool is_serial_section_;
diff --git a/lib/pls/include/pls/internal/scheduling/lock_free/task.h b/lib/pls/include/pls/internal/scheduling/lock_free/task.h
index 723a067..967688d 100644
--- a/lib/pls/include/pls/internal/scheduling/lock_free/task.h
+++ b/lib/pls/include/pls/internal/scheduling/lock_free/task.h
@@ -30,11 +30,9 @@ struct task : public base_task {
   static task *find_task(unsigned id, unsigned depth);
 
  private:
-  std::atomic<int> num_resources_{};
-
   // STAMP = thread id of 'owning' thread before task was inserted into stack.
   // VALUE = next item in stack, indicated by thread ID.
-  std::atomic<data_structures::stamped_integer> resource_stack_next_{{0, 0}};
+  PLS_CACHE_ALIGN std::atomic<data_structures::stamped_integer> resource_stack_next_{{0, 0}};
 
   // STAMP = CAS stamp, half CAS length (16 or 32 Bit)
   // VALUE = Root of the actual stack, indicated by thread ID (16 or 32 Bit)
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index 3e1077b..8101c05 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -63,6 +63,20 @@ scheduler::scheduler(unsigned int num_threads,
       work_thread_main_loop();
     });
   }
+
+  // Make sure all threads are created and touched their stacks.
+  // Executing a work section ensures one wakeup/sleep cycle of all workers
+  // and explicitly forcing one task per worker forces them to initialize their stacks.
+  std::atomic<unsigned> num_spawned;
+  this->perform_work([&]() {
+    for (unsigned i = 0; i < num_threads; i++) {
+      spawn([&]() {
+        num_spawned++;
+        while (num_spawned < num_threads) std::this_thread::yield();
+      });
+    }
+    sync();
+  });
 }
 
 class scheduler::init_function {
@@ -195,7 +209,7 @@ void scheduler::spawn_internal(Function &&lambda) {
 #if PLS_SLEEP_WORKERS_ON_EMPTY
       // TODO: relax atomic operations on empty flag
       data_structures::stamped_integer queue_empty_flag = spawning_state.get_queue_empty_flag().load();
-      switch (queue_empty_flag.value) {
+      switch (queue_empty_flag.value_) {
         case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: {
           // The queue was not found empty, ignore it.
           break;
@@ -203,9 +217,9 @@ void scheduler::spawn_internal(Function &&lambda) {
         case EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY: {
           // Someone tries to mark us empty and might be re-stealing right now.
           data_structures::stamped_integer
-              queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
+              queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
           auto actual_empty_flag = spawning_state.get_queue_empty_flag().exchange(queue_non_empty_flag);
-          if (actual_empty_flag.value == EMPTY_QUEUE_STATE::QUEUE_EMPTY) {
+          if (actual_empty_flag.value_ == EMPTY_QUEUE_STATE::QUEUE_EMPTY) {
             spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake();
           }
           break;
@@ -213,7 +227,7 @@ void scheduler::spawn_internal(Function &&lambda) {
         case EMPTY_QUEUE_STATE::QUEUE_EMPTY: {
           // Someone already marked the queue empty, we must revert its action on the central queue.
           data_structures::stamped_integer
-              queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
+              queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
           spawning_state.get_queue_empty_flag().store(queue_non_empty_flag);
           spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake();
           break;
diff --git a/lib/pls/include/pls/pls.h b/lib/pls/include/pls/pls.h
index 9755229..7f5895c 100644
--- a/lib/pls/include/pls/pls.h
+++ b/lib/pls/include/pls/pls.h
@@ -10,6 +10,7 @@
 
 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/strain_local_resource.h"
+#include "pls/internal/base/stack_allocator.h"
 
 #include "pls/internal/helpers/range.h"
 #include "pls/internal/helpers/member_function.h"
@@ -18,6 +19,8 @@ namespace pls {
 
 // 'basic' for-join APIs
 using internal::scheduling::scheduler;
+using internal::base::heap_stack_allocator;
+using internal::base::mmap_stack_allocator;
 template<typename Function>
 static void spawn(Function &&function) {
   scheduler::spawn(std::forward<Function>(function));
diff --git a/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp b/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp
index f57c6b6..9bf2e4e 100644
--- a/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp
+++ b/lib/pls/src/internal/scheduling/lock_free/external_trading_deque.cpp
@@ -5,7 +5,7 @@
 namespace pls::internal::scheduling::lock_free {
 
 traded_cas_field external_trading_deque::peek_traded_object(task *target_task) {
-  traded_cas_field current_cas = target_task->external_trading_deque_cas_.load();
+  traded_cas_field current_cas = target_task->external_trading_deque_cas_.load(std::memory_order_relaxed);
   return current_cas;
 }
 
@@ -17,7 +17,9 @@ task *external_trading_deque::get_trade_object(task *target_task,
 
     traded_cas_field empty_cas = peeked_cas;
     empty_cas.make_empty();
-    if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas, empty_cas)) {
+    if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas,
+                                                                         empty_cas,
+                                                                         std::memory_order_acq_rel)) {
       task *result = task::find_task(result_id, target_task->depth_);
       return result;
     }
@@ -50,8 +52,8 @@ void external_trading_deque::reset_bot_and_top() {
   bot_internal_.value_ = 0;
   bot_internal_.stamp_++;
 
-  bot_.store(0);
-  top_.store({bot_internal_.stamp_, 0});
+  bot_.store(0, std::memory_order_release);
+  top_.store({bot_internal_.stamp_, 0}, std::memory_order_release);
 }
 
 task *external_trading_deque::pop_bot() {
@@ -83,11 +85,11 @@ task *external_trading_deque::pop_bot() {
 }
 
 external_trading_deque::peek_result external_trading_deque::peek_top() {
-  auto local_top = top_.load();
-  auto local_bot = bot_.load();
+  auto local_top = top_.load(std::memory_order_acquire);
+  auto local_bot = bot_.load(std::memory_order_acquire);
 
   if (local_top.value_ < local_bot) {
-    return peek_result{entries_[local_top.value_].traded_task_, local_top};
+    return peek_result{entries_[local_top.value_].traded_task_.load(std::memory_order_relaxed), local_top};
   } else {
     return peek_result{nullptr, local_top};
   }
@@ -95,7 +97,7 @@ external_trading_deque::peek_result external_trading_deque::peek_top() {
 
 task *external_trading_deque::pop_top(task *offered_task, peek_result peek_result) {
   stamped_integer expected_top = peek_result.top_pointer_;
-  auto local_bot = bot_.load();
+  auto local_bot = bot_.load(std::memory_order_acquire);
   if (expected_top.value_ >= local_bot) {
     return nullptr;
   }
@@ -103,8 +105,8 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
   auto &target_entry = entries_[expected_top.value_];
 
   // Read our potential result
-  task *result = target_entry.traded_task_.load();
-  unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load();
+  task *result = target_entry.traded_task_.load(std::memory_order_relaxed);
+  unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load(std::memory_order_relaxed);
 
   if (result == nullptr) {
     return nullptr;
@@ -112,7 +114,7 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
   if (forwarding_stamp != expected_top.stamp_) {
     // ...we failed because the top tag lags behind...try to fix it.
     // This means only updating the tag, as this location can still hold data we need.
-    top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_});
+    top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_}, std::memory_order_relaxed);
     return nullptr;
   }
 
@@ -123,16 +125,20 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
   traded_cas_field offered_field = expected_sync_cas_field;
   offered_field.fill_with_task(offered_task->thread_id_);
 
-  if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, offered_field)) {
+  if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
+                                                                  offered_field,
+                                                                  std::memory_order_acq_rel)) {
     // We got it, for sure move the top pointer forward.
-    top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1});
+    top_.compare_exchange_strong(expected_top,
+                                 {expected_top.stamp_ + 1, expected_top.value_ + 1},
+                                 std::memory_order_acq_rel);
     return result;
   } else {
-    // TODO: Re-Check this condition for forwarding the stamp!  Should only happen if another top-stealer took the
-    //       slot that we where interested in!
     if (expected_sync_cas_field.is_filled_with_object() && expected_sync_cas_field.get_stamp() == expected_top.stamp_
         && expected_sync_cas_field.get_trade_request_thread_id() == thread_id_) {
-      top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1});
+      top_.compare_exchange_strong(expected_top,
+                                   {expected_top.stamp_ + 1, expected_top.value_ + 1},
+                                   std::memory_order_relaxed);
     }
     return nullptr;
   }
diff --git a/lib/pls/src/internal/scheduling/lock_free/task.cpp b/lib/pls/src/internal/scheduling/lock_free/task.cpp
index 7100b46..d077693 100644
--- a/lib/pls/src/internal/scheduling/lock_free/task.cpp
+++ b/lib/pls/src/internal/scheduling/lock_free/task.cpp
@@ -15,14 +15,12 @@ void task::prepare_for_push(unsigned int pushing_thread_id) {
 }
 
 bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
-  num_resources_++;
-
   PLS_ASSERT(this->thread_id_ != spare_task_chain->thread_id_,
              "Makes no sense to push task onto itself, as it is not clean by definition.");
   PLS_ASSERT(this->depth_ == spare_task_chain->depth_,
              "Must only push tasks with correct depth.");
 
-  data_structures::stamped_integer current_root;
+  data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
   data_structures::stamped_integer target_root;
 
   data_structures::stamped_integer expected_next_field;
@@ -30,10 +28,8 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
   expected_next_field.stamp_ = pushing_thread_id + 1;
   expected_next_field.value_ = 0;
 
-  int iteration = 0;
   do {
-    iteration++;
-    current_root = this->resource_stack_root_.load();
+    // current_root implicitly re-loaded by CAS in loop
     target_root.stamp_ = current_root.stamp_ + 1;
     target_root.value_ = spare_task_chain->thread_id_ + 1;
 
@@ -50,53 +46,49 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
       target_next_field.value_ = current_root_task->thread_id_ + 1;
     }
 
-    if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field, target_next_field)) {
-      num_resources_--;
+    if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field,
+                                                                        target_next_field,
+                                                                        std::memory_order_relaxed)) {
       return false;
     } else {
       expected_next_field = target_next_field;
     }
 
-  } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root));
+  } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel));
 
   return true;
 }
 
 void task::reset_task_chain(task *expected_content) {
-  num_resources_--;
-
-  data_structures::stamped_integer current_root = this->resource_stack_root_.load();
+  data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
   PLS_ASSERT(current_root.value_ == expected_content->thread_id_ + 1,
              "Must only reset the task chain if we exactly know its state! (current_root.value_)");
 
   data_structures::stamped_integer target_root;
   target_root.stamp_ = current_root.stamp_ + 1;
-  bool success = this->resource_stack_root_.compare_exchange_strong(current_root, target_root);
-  PLS_ASSERT(success, "Must always succeed in resetting the chain, as we must be the sole one operating on it!");
+  this->resource_stack_root_.store(target_root, std::memory_order_relaxed);
 }
 
 task *task::pop_task_chain() {
-  data_structures::stamped_integer current_root;
+  data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
   data_structures::stamped_integer target_root;
   task *output_task;
   do {
-    current_root = this->resource_stack_root_.load();
+    // current_root implicitly re-loaded by CAS in loop
     if (current_root.value_ == 0) {
       // Empty...
       return nullptr;
     } else {
       // Found something, try to pop it
       auto *current_root_task = find_task(current_root.value_ - 1, this->depth_);
-      auto next_stack_cas = current_root_task->resource_stack_next_.load();
+      auto next_stack_cas = current_root_task->resource_stack_next_.load(std::memory_order_relaxed);
 
       target_root.stamp_ = current_root.stamp_ + 1;
       target_root.value_ = next_stack_cas.value_;
 
       output_task = current_root_task;
     }
-  } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root));
-
-  PLS_ASSERT(num_resources_.fetch_add(-1) > 0, "Must only return an task from the chain if there are items!");
+  } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel));
 
   output_task->resource_stack_next_.store({0, 0});
   return output_task;
diff --git a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
index e1e2da9..251f705 100644
--- a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
@@ -44,7 +44,7 @@ base_task *task_manager::pop_local_task() {
 
 std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state &stealing_state) {
   PLS_ASSERT(stealing_state.get_active_task()->depth_ == 0, "Must only steal with clean task chain.");
-  PLS_ASSERT(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain.");
+  PLS_ASSERT_EXPENSIVE(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain.");
 
   auto peek = deque_.peek_top();
   if (peek.top_task_) {
@@ -83,7 +83,6 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state
 
       return std::tuple{stolen_task, chain_after_stolen_task, true};
     } else {
-      // TODO: traded task resource_stack_next_ field is de-marked from being mine
       return std::tuple{nullptr, nullptr, false};
     }
   } else {
@@ -94,15 +93,17 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state
 base_task *task_manager::pop_clean_task_chain(base_task *base_task) {
   task *target_task = static_cast<task *>(base_task);
 
+  traded_cas_field peeked_task_cas_before, peeked_task_cas_after;
+  peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task);
   while (true) {
     // Try to get a clean resource chain to go back to the main stealing loop
-    auto peeked_task_cas_before = external_trading_deque::peek_traded_object(target_task);
+    peeked_task_cas_before = peeked_task_cas_after;
     task *pop_result = target_task->pop_task_chain();
     if (pop_result) {
       PLS_ASSERT(scheduler::check_task_chain_backward(*pop_result), "Must only pop proper task chains.");
       return pop_result; // Got something, so we are simply done here
     }
-    auto peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task);
+    peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task);
 
     if (peeked_task_cas_before != peeked_task_cas_after) {
       continue;
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index f2f5986..6596c6f 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -5,6 +5,7 @@
 #include "pls/internal/scheduling/strain_local_resource.h"
 #include "pls/internal/build_flavour.h"
 #include "pls/internal/base/error_handling.h"
+#include "pls/internal/base/futex_wrapper.h"
 
 #include <thread>
 
@@ -57,7 +58,7 @@ void scheduler::work_thread_work_section() {
 #if PLS_PROFILING_ENABLED
     my_state.get_scheduler().profiler_.stealing_start(my_state.get_thread_id());
 #endif
-    PLS_ASSERT(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain.");
+    PLS_ASSERT_EXPENSIVE(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain.");
 
     size_t target;
     do {
@@ -91,7 +92,7 @@ void scheduler::work_thread_work_section() {
       auto *stolen_resources = stolen_task->attached_resources_.load(std::memory_order_relaxed);
       strain_local_resource::acquire_locally(stolen_resources, my_state.get_thread_id());
 
-      PLS_ASSERT(check_task_chain_forward(*my_state.get_active_task()),
+      PLS_ASSERT_EXPENSIVE(check_task_chain_forward(*my_state.get_active_task()),
                  "We are sole owner of this chain, it has to be valid!");
 
       // Execute the stolen task by jumping to it's continuation.
@@ -117,12 +118,12 @@ void scheduler::work_thread_work_section() {
         my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), false);
 #endif
 #if PLS_SLEEP_WORKERS_ON_EMPTY
-        switch (target_queue_empty_flag.value) {
+        switch (target_queue_empty_flag.value_) {
           case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: {
             // We found the queue empty, but the flag says it should still be full.
             // We want to declare it empty, bet we need to re-check the queue in a sub-step to avoid races.
             data_structures::stamped_integer
-                maybe_empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY};
+                maybe_empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY};
             if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag,
                                                                             maybe_empty_flag)) {
               goto queue_empty_flag_retry_steal;
@@ -133,7 +134,7 @@ void scheduler::work_thread_work_section() {
             // We found the queue empty and it was already marked as maybe empty.
             // We can safely mark it empty and increment the central counter.
             data_structures::stamped_integer
-                empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY};
+                empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY};
             if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, empty_flag)) {
               // We marked it empty, now its our duty to modify the central counter
               my_state.get_scheduler().empty_queue_increase_counter();
@@ -216,8 +217,8 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat
                "Resources must only reside in the correct depth!");
     PLS_ASSERT(last_task != clean_chain,
                "We want to swap out the last task and its chain to use a clean one, thus they must differ.");
-    PLS_ASSERT(check_task_chain_backward(*clean_chain),
-               "Can only acquire clean chains for clean returns!");
+    PLS_ASSERT_EXPENSIVE(check_task_chain_backward(*clean_chain),
+                         "Can only acquire clean chains for clean returns!");
 
     // Acquire it/merge it with our task chain.
     this_task->prev_ = clean_chain;
--
libgit2 0.26.0