diff --git a/lib/pls/include/pls/internal/base/spin_lock.h b/lib/pls/include/pls/internal/base/spin_lock.h index cfd3143..c2b98c8 100644 --- a/lib/pls/include/pls/internal/base/spin_lock.h +++ b/lib/pls/include/pls/internal/base/spin_lock.h @@ -17,9 +17,7 @@ namespace pls { public: spin_lock(): flag_{ATOMIC_FLAG_INIT}, yield_at_tries_{1024} {}; - spin_lock(const spin_lock& other): flag_{ATOMIC_FLAG_INIT}, yield_at_tries_{other.yield_at_tries_} { - std::cout << "Spinlock Moved!" << std::endl; - } + spin_lock(const spin_lock& other): flag_{ATOMIC_FLAG_INIT}, yield_at_tries_{other.yield_at_tries_} {} void lock(); void unlock(); diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index 0726c06..164eaff 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -65,19 +65,28 @@ namespace pls { template<typename Function> void perform_work(Function work_section) { root_task<Function> master{work_section}; - root_worker_task<Function> worker{&master}; // Push root task on stacks - memory_->thread_state_for(0)->root_task_ = &master; - memory_->thread_state_for(0)->current_task_ = &master; + auto new_master = memory_->task_stack_for(0)->push(master); + memory_->thread_state_for(0)->root_task_ = new_master; + memory_->thread_state_for(0)->current_task_ = new_master; for (unsigned int i = 1; i < num_threads_; i++) { - memory_->thread_state_for(i)->root_task_ = &worker; - memory_->thread_state_for(i)->current_task_ = &worker; + root_worker_task<Function> worker{new_master}; + auto new_worker = memory_->task_stack_for(0)->push(worker); + memory_->thread_state_for(i)->root_task_ = new_worker; + memory_->thread_state_for(i)->current_task_ = new_worker; } // Perform and wait for work sync_barrier_.wait(); // Trigger threads to wake up sync_barrier_.wait(); // Wait for threads to finish + + // Clean up stack + memory_->task_stack_for(0)->pop(); + for (unsigned int i = 1; i < num_threads_; i++) { + 
root_worker_task<Function> worker{new_master}; + memory_->task_stack_for(0)->pop(); + } } // TODO: See if we should place this differently (only for performance reasons) @@ -86,24 +95,31 @@ namespace pls { static_assert(std::is_base_of<abstract_task, Task>::value, "Only pass abstract_task subclasses!"); auto my_state = base::this_thread::state(); - auto current_task = my_state->current_task_; + abstract_task* old_task; + abstract_task* new_task; // Init Task { std::lock_guard<base::spin_lock> lock{my_state->lock_}; - task.set_depth(depth >= 0 ? depth : current_task->depth() + 1); - my_state->current_task_ = &task; - current_task->set_child(&task); + old_task = my_state->current_task_; + new_task = my_state->task_stack_->push(task); + + new_task->set_depth(depth >= 0 ? depth : old_task->depth() + 1); + my_state->current_task_ = new_task; + old_task->set_child(new_task); } // Run Task - task.execute(); + new_task->execute(); // Teardown state back to before the task was executed { std::lock_guard<base::spin_lock> lock{my_state->lock_}; - current_task->set_child(nullptr); - my_state->current_task_ = current_task; + + old_task->set_child(nullptr); + my_state->current_task_ = old_task; + + my_state->task_stack_->pop(); } } diff --git a/lib/pls/include/pls/internal/scheduling/tbb_task.h b/lib/pls/include/pls/internal/scheduling/tbb_task.h index 70d34da..fd70430 100644 --- a/lib/pls/include/pls/internal/scheduling/tbb_task.h +++ b/lib/pls/include/pls/internal/scheduling/tbb_task.h @@ -74,13 +74,15 @@ namespace pls { abstract_task{0, 0}, root_task_{root_task}, deque_{}, - last_stolen_{nullptr} { + last_stolen_{nullptr} {}; + + void execute() override { + // Bind this instance to our OS thread my_stack_ = base::this_thread::state()->task_stack_; root_task_->tbb_task_ = this; root_task_->stack_state_ = my_stack_->save_state(); - }; - void execute() override { + // Execute it on our OS thread until its finished root_task_->execute(); } }; diff --git a/lib/pls/src/internal/base/spin_lock.cpp b/lib/pls/src/internal/base/spin_lock.cpp index 
4c61216..c99019b 100644 --- a/lib/pls/src/internal/base/spin_lock.cpp +++ b/lib/pls/src/internal/base/spin_lock.cpp @@ -8,18 +8,16 @@ namespace pls { // also act as a strict memory fence. void spin_lock::lock() { int tries = 0; - while (flag_.test_and_set(std::memory_order_acquire)) { + while (flag_.test_and_set(std::memory_order_seq_cst)) { tries++; if (tries % yield_at_tries_ == 0) { this_thread::yield(); } } - std::atomic_thread_fence(std::memory_order_seq_cst); } void spin_lock::unlock() { - flag_.clear(std::memory_order_release); - std::atomic_thread_fence(std::memory_order_seq_cst); + flag_.clear(std::memory_order_seq_cst); } } } diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp index 3cd709d..0694f6f 100644 --- a/lib/pls/src/internal/scheduling/abstract_task.cpp +++ b/lib/pls/src/internal/scheduling/abstract_task.cpp @@ -42,7 +42,8 @@ namespace pls { // Execute 'top level task steal' if possible // (only try deeper tasks to keep depth restricted stealing) while (current_task != nullptr) { - if (current_task->split_task(&target_state->lock_)) { + auto lock = &target_state->lock_; + if (current_task->split_task(lock)) { // internal steal was no success (we did a top level task steal) return false; } diff --git a/lib/pls/src/internal/scheduling/tbb_task.cpp b/lib/pls/src/internal/scheduling/tbb_task.cpp index 87c3cab..4708a62 100644 --- a/lib/pls/src/internal/scheduling/tbb_task.cpp +++ b/lib/pls/src/internal/scheduling/tbb_task.cpp @@ -1,4 +1,4 @@ -#include <pls/internal/scheduling/scheduler.h> +#include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/tbb_task.h" namespace pls { @@ -82,11 +82,12 @@ namespace pls { if (stolen_sub_task == nullptr) { return false; } + tbb_task task{stolen_sub_task}; + // In success case, unlock. // TODO: this locking is complicated and error prone. 
lock->unlock(); - tbb_task task{stolen_sub_task}; scheduler::execute_task(task, depth()); return true; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4c5fa7c..dbe5d58 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(tests main.cpp - base_tests.cpp) + base_tests.cpp scheduling_tests.cpp) target_link_libraries(tests catch2 pls) diff --git a/test/scheduling_tests.cpp b/test/scheduling_tests.cpp new file mode 100644 index 0000000..47ff5e2 --- /dev/null +++ b/test/scheduling_tests.cpp @@ -0,0 +1,79 @@ +#include <catch.hpp> + +#include <pls/pls.h> + +using namespace pls; + +class once_sub_task: public tbb_sub_task { + std::atomic<int>* counter_; + int children_; + +protected: + void execute_internal() override { + (*counter_)++; + for (int i = 0; i < children_; i++) { + spawn_child(once_sub_task(counter_, children_ - 1)); + } + } + +public: + explicit once_sub_task(std::atomic<int>* counter, int children): + tbb_sub_task(), + counter_{counter}, + children_{children} {} +}; + +class force_steal_sub_task: public tbb_sub_task { + std::atomic<int>* parent_counter_; + std::atomic<int>* overall_counter_; + +protected: + void execute_internal() override { + (*overall_counter_)--; + if (overall_counter_->load() > 0) { + std::atomic<int> counter{1}; + spawn_child(force_steal_sub_task(&counter, overall_counter_)); + while (counter.load() > 0) + ; // Spin... 
+ } + + (*parent_counter_)--; + } + +public: + explicit force_steal_sub_task(std::atomic<int>* parent_counter, std::atomic<int>* overall_counter): + tbb_sub_task(), + parent_counter_{parent_counter}, + overall_counter_{overall_counter} {} +}; + +TEST_CASE( "tbb task are scheduled correctly", "[internal/scheduling/tbb_task.h]") { + static static_scheduler_memory<8, 2 << 12> my_scheduler_memory; + + SECTION("tasks are executed exactly once") { + scheduler my_scheduler{&my_scheduler_memory, 2}; + int start_counter = 4; + int total_tasks = 1 + 4 + 4 * 3 + 4 * 3 * 2 + 4 * 3 * 2 * 1; + std::atomic<int> counter{0}; + + my_scheduler.perform_work([&] (){ + once_sub_task sub_task{&counter, start_counter}; + tbb_task task{&sub_task}; + scheduler::execute_task(task); + }); + + REQUIRE(counter.load() == total_tasks); + my_scheduler.terminate(true); + } + + SECTION("tasks can be stolen") { + scheduler my_scheduler{&my_scheduler_memory, 8}; + my_scheduler.perform_work([&] (){ + std::atomic<int> dummy_parent{1}, overall_counter{8}; + force_steal_sub_task sub_task{&dummy_parent, &overall_counter}; + tbb_task task{&sub_task}; + scheduler::execute_task(task); + }); + my_scheduler.terminate(true); + } +}