diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 80de92c..e890d9e 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -1,13 +1,16 @@
-#include <pls/pls.h>
-#include <pls/internal/helpers/profiler.h>
-#include <pls/internal/helpers/mini_benchmark.h>
+#include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/parallel_result.h"
+#include "pls/internal/scheduling/scheduler_memory.h"
+using namespace pls::internal::scheduling;
 
 #include <iostream>
 #include <complex>
 #include <vector>
+#include <atomic>
+#include <chrono>
+std::atomic<unsigned long> count;
 
 static constexpr int CUTOFF = 16;
-static constexpr int NUM_ITERATIONS = 1000;
 static constexpr int INPUT_SIZE = 8192;
 typedef std::vector<std::complex<double>> complex_vector;
 
@@ -39,22 +42,38 @@ void combine(complex_vector::iterator data, int n) {
   }
 }
 
-void fft(complex_vector::iterator data, int n) {
+void fft_normal(complex_vector::iterator data, int n) {  // Sequential recursive FFT over the n elements starting at data; never touches the scheduler.
   if (n < 2) {
+//    count++;  (disabled) would tally base-case visits in the global atomic `count` that main() prints
     return;
   }
 
   divide(data, n);
+  fft_normal(data, n / 2);  // first half, on the calling thread
+  fft_normal(data + n / 2, n / 2);  // second half, only started once the first has finished
+  combine(data, n);  // merge the two half-results; runs for every n >= 2
+}
+// Parallel FFT over the n elements at data: above CUTOFF both halves go to scheduler::par and are merged in its .then() continuation.
+parallel_result<short> fft(complex_vector::iterator data, int n) {
+  if (n < 2) {
+    return 0;
+  }
+  divide(data, n);
   if (n <= CUTOFF) {
-    fft(data, n / 2);
-    fft(data + n / 2, n / 2);
+    fft_normal(data, n / 2);  // small inputs: recurse sequentially, no scheduler involvement
+    fft_normal(data + n / 2, n / 2);
+    combine(data, n);  // merge here as well; the .then() below is never reached from this branch
+    return 0;
   } else {
-    pls::invoke(
-        [&] { fft(data, n / 2); },
-        [&] { fft(data + n / 2, n / 2); }
-    );
+    return scheduler::par([=]() {  // by-value captures; NOTE(review): presumably because the lambdas can outlive this frame - confirm
+      return fft(data, n / 2);
+    }, [=]() {
+      return fft(data + n / 2, n / 2);
+    }).then([=](int, int) {
+      combine(data, n);
+      return 0;
+    });
   }
-  combine(data, n);
 }
 
 complex_vector prepare_input(int input_size) {
@@ -73,14 +92,55 @@ complex_vector prepare_input(int input_size) {
   return data;
 }
 
+static constexpr int NUM_ITERATIONS = 1000;  // repetitions of each timed loop in main()
+constexpr size_t NUM_THREADS = 1;  // passed to both static_scheduler_memory and the scheduler constructor
+
+constexpr size_t NUM_TASKS = 64;  // 2nd template argument of static_scheduler_memory
+constexpr size_t MAX_TASK_STACK_SIZE = 0;  // 3rd template argument; NOTE(review): 0 presumably means tasks need no own stack - confirm
+
+constexpr size_t NUM_CONTS = 64;  // 4th template argument of static_scheduler_memory
+constexpr size_t MAX_CONT_SIZE = 192;  // 5th template argument; presumably the byte budget per continuation - confirm
+
 int main() {
-  PROFILE_ENABLE
   complex_vector initial_input = prepare_input(INPUT_SIZE);
 
-  pls::internal::helpers::run_mini_benchmark([&] {
-    complex_vector input = initial_input;
-    fft(input.begin(), input.size());
-  }, 7, 1000);
+  static_scheduler_memory<NUM_THREADS,  // backing storage handed to the scheduler below, sized by the constants above main()
+                          NUM_TASKS,
+                          MAX_TASK_STACK_SIZE,
+                          NUM_CONTS,
+                          MAX_CONT_SIZE> static_scheduler_memory;
+
+  scheduler scheduler{static_scheduler_memory, NUM_THREADS};  // the variable shadows the type name; `scheduler::par` below still names the class
+  // Timed run 1 ("Framework"): NUM_ITERATIONS transforms through the scheduler; the input copy is inside the timed region.
+  count.store(0);
+  auto start = std::chrono::steady_clock::now();
+  for (int i = 0; i < NUM_ITERATIONS; i++) {
+    complex_vector input_2(initial_input);  // fresh copy each iteration, since fft works on the data it is given
+    scheduler.perform_work([&]() {
+      return scheduler::par([&]() {
+        return fft(input_2.begin(), INPUT_SIZE);
+      }, []() {
+        return parallel_result<int>{0};  // second branch does no work; presumably present only because par takes two callables - confirm
+      }).then([](int, int) {
+        return 0;
+      });
+    });
+  }
+  auto end = std::chrono::steady_clock::now();
+  std::cout << "Count: " << count.load() << std::endl;  // the only increment visible in this diff is commented out in fft_normal
+  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+            << std::endl;
+  // Timed run 2 ("Normal"): the same number of transforms with the sequential fft_normal as the baseline.
+  count.store(0);
+  start = std::chrono::steady_clock::now();
+  for (int i = 0; i < NUM_ITERATIONS; i++) {
+    complex_vector input_1(initial_input);  // same per-iteration copy as above, so both timings include it
+    fft_normal(input_1.begin(), INPUT_SIZE);
+  }
+  end = std::chrono::steady_clock::now();
+  std::cout << "Count: " << count.load() << std::endl;
+  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+            << std::endl;
 
-  PROFILE_SAVE("test_profile.prof")
+  return 0;
 }
diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake
index 05c18cc..f2f66ec 100644
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # but inlining functions and SIMD/Vectorization is
     # only enabled by -O3, thus it's way faster in some
     # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
     set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index 12e7e97..9e9dd85 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -34,7 +34,7 @@ add_library(pls STATIC
         include/pls/internal/base/barrier.h src/internal/base/barrier.cpp
         include/pls/internal/base/system_details.h
         include/pls/internal/base/error_handling.h
-        include/pls/internal/base/alignment.h
+        include/pls/internal/base/alignment.h src/internal/base/alignment.cpp
 
         include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
         include/pls/internal/data_structures/aligned_stack_impl.h
@@ -58,7 +58,7 @@ add_library(pls STATIC
         include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
         include/pls/internal/scheduling/cont_manager.h
         include/pls/internal/scheduling/continuation.h
-        include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h src/internal/base/alignment.cpp src/internal/base/alignment.h)
+        include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
diff --git a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
index ccc6bfa..7a37f7b 100644
--- a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
+++ b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
@@ -36,11 +36,11 @@ class bounded_ws_deque {
   void push_bottom(T item) {
     item_array_[bottom_] = item;
     local_bottom_++;
-    bottom_.store(local_bottom_);
+    bottom_.store(local_bottom_, std::memory_order_release);  // release: the slot write above is ordered before the new bottom is published
   }
 
   bool is_empty() {
-    return top_.load().value < bottom_;
+    return top_.load().value >= bottom_.load();  // empty once top has caught up with bottom; the previous '<' was true exactly when items were present
   }
 
   optional<T> pop_top() {
@@ -66,11 +66,11 @@ class bounded_ws_deque {
     }
 
     local_bottom_--;
-    bottom_.store(local_bottom_);
+    bottom_.store(local_bottom_, std::memory_order_seq_cst);
 
     optional<T> result(item_array_[local_bottom_]);
 
-    stamped_integer old_top = top_.load();
+    stamped_integer old_top = top_.load(std::memory_order_acquire);
     if (local_bottom_ > old_top.value) {
       // Enough distance to just return the value
       return result;