diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
index 4c7f3fe..cfef00b 100644
--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -1,5 +1,5 @@
-add_executable(benchmark_fft_pls_v2 main.cpp)
-target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
+add_executable(benchmark_fft_pls_v3 main.cpp)
+target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
 if (EASY_PROFILER)
-    target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
+    target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 0a34b30..64f2915 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
 
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 4;
+constexpr int MAX_STACK_SIZE = 1024 * 64;
 
 static_scheduler_memory {
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 1;
+constexpr int MAX_STACK_SIZE = 1024 * 4;
 
 static_scheduler_memory a;
diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake
index 5d22958..57fcd59 100644
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # but inlining functions and SIMD/Vectorization is
     # only enabled by -O3, thus it's way faster in some
     # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
     set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
diff --git a/lib/context_switcher/include/context_switcher/context_switcher.h b/lib/context_switcher/include/context_switcher/context_switcher.h
index 6bb4886..3bcc795 100644
--- a/lib/context_switcher/include/context_switcher/context_switcher.h
+++ b/lib/context_switcher/include/context_switcher/context_switcher.h
@@ -111,6 +111,9 @@ void lambda_capture_callback(fcontext::transfer_t transfer) {
   lambda_capture->~T();
 
   continuation_t cont_pointer = cont.consume();
+  if (cont_pointer == nullptr) {
+    printf("Error!!!\n");
+  }
   fcontext::jump_fcontext(cont_pointer, (void *) 0);
 }
diff --git a/lib/context_switcher/include/context_switcher/continuation.h b/lib/context_switcher/include/context_switcher/continuation.h
index ff6c31c..3c196c0 100644
--- a/lib/context_switcher/include/context_switcher/continuation.h
+++ b/lib/context_switcher/include/context_switcher/continuation.h
@@ -50,10 +50,6 @@ struct continuation {
   }
 
   continuation_t consume() {
-    if (cont_pointer_ == nullptr) {
-      printf("Error!\n");
-    }
-
     auto tmp = cont_pointer_;
     cont_pointer_ = nullptr;
     return tmp;
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index e120f2e..f7f8232 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(pls STATIC
         include/pls/internal/scheduling/task_manager_impl.h
         include/pls/internal/scheduling/static_scheduler_memory.h
         include/pls/internal/scheduling/heap_scheduler_memory.h
-        src/internal/scheduling/task_manager.cpp)
+        src/internal/scheduling/task_manager.cpp src/internal/scheduling/thread_state.cpp)
 
 # Dependencies for pls
 target_link_libraries(pls Threads::Threads)
diff --git a/lib/pls/include/pls/internal/base/error_handling.h b/lib/pls/include/pls/internal/base/error_handling.h
index bdb6466..8704cc8 100644
--- a/lib/pls/include/pls/internal/base/error_handling.h
+++ b/lib/pls/include/pls/internal/base/error_handling.h
@@ -16,6 +16,6 @@ void pls_error(const char *msg);
 
 // TODO: Distinguish between debug/internal asserts and production asserts.
-#define PLS_ASSERT(cond, msg) // if (!(cond)) { pls_error(msg); }
+#define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
 
 #endif //PLS_ERROR_HANDLING_H
diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h
index 165ed3e..222d127 100644
--- a/lib/pls/include/pls/internal/base/system_details.h
+++ b/lib/pls/include/pls/internal/base/system_details.h
@@ -67,6 +67,24 @@ inline void relax_cpu() {
 #endif
 }
 
+/**
+ * Prevent inlining of functions. This is a compiler-specific setting and
+ * it is treated as an error if we cannot declare it properly.
+ * (Some functions in the codebase MUST be re-evaluated after fiber switches;
+ * preventing inlining keeps the compiler from caching their results.)
+ */
+#if defined(_MSC_VER)
+#define PLS_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__) && __GNUC__ > 3
+#if defined(__CUDACC__)
+#define PLS_NOINLINE __attribute__ ((noinline))
+#else
+#define PLS_NOINLINE __attribute__ ((__noinline__))
+#endif
+#else
+#error "PLS requires inline prevention for certain functions."
+#endif
+
 }
 }
 }
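A note on the PLS_NOINLINE rationale above: a function that resolves per-worker state must be re-evaluated after every fiber switch, because the fiber may resume on a different worker thread. The following minimal sketch is illustrative only and not part of this patch; worker_state, current_worker_state and fiber_yield are hypothetical stand-ins for thread_state, thread_state::get() and a context switch back to the scheduling loop.

    // Illustrative sketch: why a per-worker state lookup must stay out of line.
    #if !defined(PLS_NOINLINE)
    #define PLS_NOINLINE __attribute__ ((__noinline__))  // GCC/Clang branch of the macro above
    #endif

    #include <cstdio>

    struct worker_state { unsigned id; };

    // Hypothetical stand-in for thread_state::get(): reads a thread-local pointer.
    // Keeping it out of line (and opaque) forces the compiler to re-evaluate it
    // at every call site instead of caching the result in a register.
    PLS_NOINLINE worker_state &current_worker_state();

    // Hypothetical stand-in for a fiber context switch; the fiber may resume
    // on a different worker thread afterwards.
    void fiber_yield();

    void work() {
      unsigned before = current_worker_state().id;  // e.g. worker 0
      fiber_yield();                                // fiber migrates to another worker
      unsigned after = current_worker_state().id;   // must be re-read, may now differ
      std::printf("%u -> %u\n", before, after);
    }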
diff --git a/lib/pls/include/pls/internal/scheduling/external_trading_deque.h b/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
index 2f589c6..4fc6b1e 100644
--- a/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
+++ b/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
@@ -78,20 +78,20 @@ class external_trading_deque {
     auto expected_stamp = bot_internal_.stamp;
     auto &current_entry = entries_[bot_internal_.value];
 
+    // Publish the prepared task in the deque.
+    current_entry.forwarding_stamp_.store(expected_stamp, std::memory_order_relaxed);
+    current_entry.traded_task_.store(published_task, std::memory_order_relaxed);
+
     // Field that all threads synchronize on.
     // This happens not in the deque itself, but in the published task.
     traded_cas_field sync_cas_field;
     sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
-    published_task->external_trading_deque_cas_.store(sync_cas_field);
-
-    // Publish the prepared task in the deque.
-    current_entry.forwarding_stamp_.store(expected_stamp);
-    current_entry.traded_task_.store(published_task);
+    published_task->external_trading_deque_cas_.store(sync_cas_field, std::memory_order_release);
 
     // Advance the bot pointer. Linearization point for making the task public.
     bot_internal_.stamp++;
     bot_internal_.value++;
-    bot_.store(bot_internal_.value);
+    bot_.store(bot_internal_.value, std::memory_order_release);
   }
 
   void reset_bot_and_top() {
@@ -104,7 +104,7 @@ class external_trading_deque {
 
   void decrease_bot() {
     bot_internal_.value--;
-    bot_.store(bot_internal_.value);
+    bot_.store(bot_internal_.value, std::memory_order_relaxed);
   }
 
   /**
@@ -120,15 +120,17 @@ class external_trading_deque {
     decrease_bot();
 
     auto &current_entry = entries_[bot_internal_.value];
-    auto *popped_task = current_entry.traded_task_.load();
-    auto expected_stamp = current_entry.forwarding_stamp_.load();
+    auto *popped_task = current_entry.traded_task_.load(std::memory_order_relaxed);
+    auto expected_stamp = current_entry.forwarding_stamp_.load(std::memory_order_relaxed);
 
     // We know what value must be in the cas field if no other thread stole it.
     traded_cas_field expected_sync_cas_field;
     expected_sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
     traded_cas_field empty_cas_field;
 
-    if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, empty_cas_field)) {
+    if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
+                                                                         empty_cas_field,
+                                                                         std::memory_order_acq_rel)) {
      return optional{popped_task};
     } else {
       reset_bot_and_top();
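The ordering change above follows the usual publish pattern: the deque entry is filled with relaxed stores, and the subsequent release store (on the CAS field and on bot_) is what makes those writes visible to a thief that reads with acquire semantics. A minimal standalone sketch of that pairing, illustrative only and using plain std::atomic instead of the pls deque types:

    // Illustrative publish/steal ordering sketch (not part of the patch).
    #include <atomic>
    #include <cstdio>
    #include <thread>

    struct payload { int value = 0; };

    payload slot;                        // analogous to the deque entry
    std::atomic<bool> published{false};  // analogous to the release-store on bot_ / the CAS field

    void owner() {
      slot.value = 42;                                   // plain write, like the relaxed entry stores
      published.store(true, std::memory_order_release);  // release: everything above becomes visible
    }

    void thief() {
      while (!published.load(std::memory_order_acquire)) { /* spin */ }
      // The acquire load pairs with the release store, so the entry write is visible here.
      std::printf("%d\n", slot.value);
    }

    int main() {
      std::thread a(owner), b(thief);
      a.join();
      b.join();
    }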
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index 45abc65..9510505 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -27,10 +27,11 @@ class scheduler::init_function_impl : public init_function {
   void run() override {
     auto &root_task = thread_state::get().get_task_manager().get_active_task();
     root_task.run_as_task([&](context_switcher::continuation cont) {
-      thread_state::get().set_main_continuation(std::move(cont));
+      thread_state::get().main_continuation() = std::move(cont);
       function_();
       thread_state::get().get_scheduler().work_section_done_.store(true);
-      return std::move(thread_state::get().get_main_continuation());
+      PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");
+      return std::move(thread_state::get().main_continuation());
     });
   }
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h
index 9670d82..8b473a6 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager.h
@@ -52,7 +52,7 @@ class task_manager {
   void spawn_child(F &&lambda);
   void sync();
 
-  task* steal_task(task_manager &stealing_task_manager);
+  task *steal_task(task_manager &stealing_task_manager);
 
   bool try_clean_return(context_switcher::continuation &result_cont);
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
index 31d06d2..4c4d740 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
@@ -71,7 +71,7 @@ void task_manager::spawn_child(F &&lambda) {
 
       if (continuation.valid()) {
         // We jumped in here from the main loop, keep track!
-        thread_state::get().set_main_continuation(std::move(continuation));
+        thread_state::get().main_continuation() = std::move(continuation);
       }
     }
diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h
index 74fe5d9..329ff91 100644
--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -6,6 +6,8 @@
 #include
 #include
 
+#include "pls/internal/base/system_details.h"
+
 #include "context_switcher/continuation.h"
 
 namespace pls {
@@ -37,9 +39,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
    * Must only be called on threads that are associated with a thread_state,
    * this will most likely be threads created by the scheduler.
    *
+   * Each call is guaranteed to be a new lookup, i.e. it is not cached after fiber context switches.
+   *
    * @return The thread_state of this thread.
    */
-  static thread_state &get() { return *base::this_thread::state(); }
+  static thread_state &PLS_NOINLINE get();
 
   unsigned get_id() { return id_; }
   void set_id(unsigned id) {
@@ -54,11 +58,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
     return random_();
   }
 
-  void set_main_continuation(context_switcher::continuation &&continuation) {
-    main_loop_continuation_ = std::move(continuation);
-  }
-  context_switcher::continuation get_main_continuation() {
-    return std::move(main_loop_continuation_);
+  context_switcher::continuation &main_continuation() {
+    return main_loop_continuation_;
   }
 
   // Do not allow move/copy operations.
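The thread_state change above replaces the move-out getter with a reference accessor, so call sites can check validity before explicitly moving the continuation out (which is what the new PLS_ASSERTs rely on). A minimal sketch of the two idioms, illustrative only and using a simplified continuation-like type rather than the real context_switcher::continuation:

    // Illustrative sketch of the accessor change (not part of the patch).
    #include <cassert>
    #include <utility>

    struct cont {
      void *ptr = nullptr;
      cont() = default;
      explicit cont(void *p) : ptr(p) {}
      cont(cont &&other) noexcept : ptr(other.ptr) { other.ptr = nullptr; }
      cont &operator=(cont &&other) noexcept { ptr = other.ptr; other.ptr = nullptr; return *this; }
      bool valid() const { return ptr != nullptr; }
    };

    struct state {
      cont main_loop_continuation_;

      // Old style: returns by value and moves the member out immediately,
      // so the caller cannot inspect it before consuming it.
      cont get_main_continuation() { return std::move(main_loop_continuation_); }

      // New style: hands out a reference; storing, asserting and moving out
      // become explicit, separate steps at the call site.
      cont &main_continuation() { return main_loop_continuation_; }
    };

    int main() {
      state s;
      s.main_continuation() = cont{&s};           // store
      assert(s.main_continuation().valid());      // check without consuming
      cont c = std::move(s.main_continuation());  // consume explicitly
      assert(c.valid());
    }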
diff --git a/lib/pls/src/internal/scheduling/task_manager.cpp b/lib/pls/src/internal/scheduling/task_manager.cpp
index 2c73fe1..eeff837 100644
--- a/lib/pls/src/internal/scheduling/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/task_manager.cpp
@@ -1,5 +1,3 @@
-#include
-
 #include "pls/internal/scheduling/task_manager.h"
 
 #include "pls/internal/scheduling/task.h"
@@ -76,7 +74,6 @@ task *task_manager::steal_task(task_manager &stealing_task_manager) {
 }
 
 void task_manager::push_resource_on_task(task *target_task, task *spare_task_chain) {
-  PLS_ASSERT(check_task_chain_backward(spare_task_chain), "Must only push proper task chains.");
   PLS_ASSERT(target_task->thread_id_ != spare_task_chain->thread_id_,
              "Makes no sense to push task onto itself, as it is not clean by definition.");
   PLS_ASSERT(target_task->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth.");
@@ -90,11 +87,11 @@ void task_manager::push_resource_on_task(task *target_task, task *spare_task_cha
 
     if (current_root.value == 0) {
       // Empty, simply push in with no successor
-      spare_task_chain->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
+      spare_task_chain->resource_stack_next_.store(nullptr);
     } else {
       // Already an entry. Find it's corresponding task and set it as our successor.
       auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
-      spare_task_chain->resource_stack_next_.store(current_root_task, std::memory_order_relaxed);
+      spare_task_chain->resource_stack_next_.store(current_root_task);
     }
 
   } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
@@ -112,7 +109,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
     } else {
       // Found something, try to pop it
       auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
-      auto *next_stack_task = current_root_task->resource_stack_next_.load(std::memory_order_relaxed);
+      auto *next_stack_task = current_root_task->resource_stack_next_.load();
 
       target_root.stamp = current_root.stamp + 1;
       target_root.value = next_stack_task != nullptr ? next_stack_task->thread_id_ + 1 : 0;
@@ -122,7 +119,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
   } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
 
   PLS_ASSERT(check_task_chain_backward(output_task), "Must only pop proper task chains.");
-  output_task->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
+  output_task->resource_stack_next_.store(nullptr);
 
   return output_task;
 }
@@ -187,7 +184,8 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
     }
 
     // jump back to the continuation in main scheduling loop, time to steal some work
-    result_cont = thread_state::get().get_main_continuation();
+    result_cont = std::move(thread_state::get().main_continuation());
+    PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
     return true;
   } else {
     // Make sure that we are owner fo this full continuation/task chain.
@@ -198,13 +196,16 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
     active_task_ = last_task;
 
     result_cont = std::move(last_task->continuation_);
+    PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
     return false;
   }
 }
 
 bool task_manager::check_task_chain_forward(task *start_task) {
   while (start_task->next_ != nullptr) {
-    PLS_ASSERT(start_task->next_->prev_ == start_task, "Chain must have correct prev/next fields for linked list!");
+    if (start_task->next_->prev_ != start_task) {
+      return false;
+    }
     start_task = start_task->next_;
   }
   return true;
@@ -212,17 +213,16 @@ bool task_manager::check_task_chain_forward(task *start_task) {
 
 bool task_manager::check_task_chain_backward(task *start_task) {
   while (start_task->prev_ != nullptr) {
-    PLS_ASSERT(start_task->prev_->next_ == start_task, "Chain must have correct prev/next fields for linked list!");
+    if (start_task->prev_->next_ != start_task) {
+      return false;
+    }
     start_task = start_task->prev_;
   }
   return true;
 }
 
 bool task_manager::check_task_chain() {
-  check_task_chain_backward(active_task_);
-  check_task_chain_forward(active_task_);
-
-  return true;
+  return check_task_chain_backward(active_task_) && check_task_chain_forward(active_task_);
 }
 
 }
diff --git a/lib/pls/src/internal/scheduling/thread_state.cpp b/lib/pls/src/internal/scheduling/thread_state.cpp
new file mode 100644
index 0000000..fe70562
--- /dev/null
+++ b/lib/pls/src/internal/scheduling/thread_state.cpp
@@ -0,0 +1,12 @@
+#include "pls/internal/scheduling/thread_state.h"
+#include "pls/internal/base/thread.h"
+
+namespace pls {
+namespace internal {
+namespace scheduling {
+
+thread_state &thread_state::get() { return *base::this_thread::state(); }
+
+}
+}
+}
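With PLS_ASSERT active again (see error_handling.h above), the chain validators now report a result instead of asserting internally, so the caller decides where a violated invariant is fatal. A minimal standalone sketch of that pattern, illustrative only, with a simplified node type and a local assert-style macro standing in for the pls ones:

    // Illustrative sketch (not part of the patch): a validator that reports a broken
    // invariant via its return value, checked by an assert-style macro at the call
    // site, mirroring how check_task_chain_backward() is used inside PLS_ASSERT.
    #include <cstdio>
    #include <cstdlib>

    #define SKETCH_ASSERT(cond, msg) if (!(cond)) { std::fprintf(stderr, "%s\n", msg); std::abort(); }

    struct node {
      node *prev_ = nullptr;
      node *next_ = nullptr;
    };

    // Returns false instead of aborting, so it composes with asserts and with
    // regular control flow alike.
    bool chain_backward_ok(node *start) {
      while (start->prev_ != nullptr) {
        if (start->prev_->next_ != start) {
          return false;
        }
        start = start->prev_;
      }
      return true;
    }

    int main() {
      node a, b;
      a.next_ = &b;
      b.prev_ = &a;
      SKETCH_ASSERT(chain_backward_ok(&b), "Must only pop proper task chains.");
      std::printf("chain ok\n");
    }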