diff --git a/lib/pls/include/pls/internal/base/alignment.h b/lib/pls/include/pls/internal/base/alignment.h
index 777c72f..71aa7a0 100644
--- a/lib/pls/include/pls/internal/base/alignment.h
+++ b/lib/pls/include/pls/internal/base/alignment.h
@@ -13,9 +13,9 @@ namespace internal {
 namespace base {
 namespace alignment {
 
-system_details::pointer_t next_alignment(system_details::pointer_t size);
+constexpr system_details::pointer_t next_alignment(system_details::pointer_t size);
+constexpr system_details::pointer_t previous_alignment(system_details::pointer_t size);
 char *next_alignment(char *pointer);
-system_details::pointer_t previous_alignment(system_details::pointer_t size);
 
 /**
  * Forces alignment requirements on a type equal to a cache line size.
diff --git a/lib/pls/include/pls/internal/base/alignment_impl.h b/lib/pls/include/pls/internal/base/alignment_impl.h
index 2ec6a7b..a734a40 100644
--- a/lib/pls/include/pls/internal/base/alignment_impl.h
+++ b/lib/pls/include/pls/internal/base/alignment_impl.h
@@ -23,7 +23,7 @@ constexpr system_details::pointer_t previous_alignment(system_details::pointer_t
          size - (size % system_details::CACHE_LINE_SIZE);
 }
 
-constexpr char *next_alignment(char *pointer) {
+inline char *next_alignment(char *pointer) {  // 'inline': header-defined and no longer constexpr (ODR-safe).
   return reinterpret_cast<char *>(next_alignment(reinterpret_cast<system_details::pointer_t >(pointer)));
 }
 
diff --git a/lib/pls/include/pls/internal/scheduling/cont_manager.h b/lib/pls/include/pls/internal/scheduling/cont_manager.h
index 1290921..b7f33f2 100644
--- a/lib/pls/include/pls/internal/scheduling/cont_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/cont_manager.h
@@ -3,7 +3,7 @@
 #define PLS_CONT_MANAGER_H_
 
 #include <memory>
-#include <tuple>
+#include <utility>
 #include <array>
 
 #include "pls/internal/data_structures/aligned_stack.h"
@@ -42,11 +42,11 @@ class cont_manager {
                                            continuation_node *cont_chain_start,
                                            continuation_node *prev) {
     // Represents one cont node and its corresponding memory buffer (as one continuous block of memory).
-    using cont_node_memory_pair = std::tuple<continuation_node,
+    using cont_node_memory_pair = std::pair<continuation_node,
                                              std::array<char, MAX_CONT_SIZE - sizeof(continuation_node)>>;
-    char *tuple_memory = cont_storage.push_bytes<cont_node_memory_pair>();
-    char *cont_node_address = tuple_memory;
-    char *cont_node_memory_address = tuple_memory + sizeof(continuation_node);
+    char *pair_memory = cont_storage.push_bytes<cont_node_memory_pair>();
+    char *cont_node_address = pair_memory;
+    char *cont_node_memory_address = pair_memory + sizeof(continuation_node);
 
     return new(cont_node_address) continuation_node(cont_node_memory_address, cont_chain_start, prev);
   }
diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h
index 46abf95..f5c351c 100644
--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -2,8 +2,8 @@
 #ifndef PLS_TASK_H
 #define PLS_TASK_H
 
-#include "pls/internal/scheduling/task_manager.h"
 #include "pls/internal/scheduling/thread_state.h"
+#include "pls/internal/scheduling/continuation.h"
 
 namespace pls {
 namespace internal {
@@ -15,26 +15,11 @@ namespace scheduling {
  *
  * Override the execute_internal() method for your custom code.
  */
-class task {
+class base_task {
   friend class scheduler;
 
-  // TODO: Add ref to continuation
-  task_manager::task_manager_state task_manager_state_;
-
  protected:
-  explicit task();
-
-  /**
-   * Allow to allocate extra memory during run-time for this task.
-   * Memory will be pushed onto the stack (in aligned memory, thus avoid many small chunks).
-   *
-   * Memory is fully self managed. Calling e.g. de-constructors when not needing objects
-   * anymore is the users responsibility (memory is simply re-used after the life time of the task ends).
-   *
-   * @param size Number of bytes to be allocated
-   * @return The allocated memory region
-   */
-  void *allocate_memory(long size);
+  base_task() = default;
 
   /**
    * Overwrite this with the actual behaviour of concrete tasks.
@@ -42,7 +27,27 @@ class task {
   virtual void execute_internal() = 0;
 
  private:
-  void execute();
+  void execute() {  // Private: callable only from base_task itself and the friend class scheduler.
+    // TODO: Figure out slow path execution
+    execute_internal();  // Virtual call, runs the concrete task's override.
+  }
+};
+
+template<typename F, typename R1, typename R2, typename CF>
+class task : public base_task {  // Task that invokes a stored callable and passes its result to a continuation.
+ public:
+  template<typename FARG>
+  explicit task(FARG &&function, continuation<R1, R2, CF> *continuation)
+      : base_task{}, function_{std::forward<FARG>(function)}, continuation_{continuation} {}
+
+  void execute_internal() override {
+    continuation_->store_result_2(function_());  // Call the callable, hand its return value to the continuation.
+    // TODO: Properly notify continuation on slow path
+  }
+
+ private:
+  F function_;  // Callable invoked by execute_internal().
+  continuation<R1, R2, CF> *continuation_;  // Stored as passed in; never deleted by this class.
+};
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h
index 7a0aeb6..d31a2a8 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager.h
@@ -3,15 +3,26 @@
 #define PLS_TASK_MANAGER_H_
 
 #include <memory>
+#include <utility>
+#include <array>
+#include <atomic>
 
-#include "pls/internal/data_structures/aligned_stack.h"
+#include "pls/internal/scheduling/task.h"
+#include "pls/internal/data_structures/stamped_integer.h"
+#include "task.h"
 
 namespace pls {
 namespace internal {
 namespace scheduling {
 
-// TODO: Remove forward references
-class task;
+struct task_handle {  // Entry of task_manager's handle stack: a task pointer plus its atomically updated state.
+ public:
+  enum state { uninitialized, initialized, execute_local, stealing, execute_remote, finished };
+  using stamped_state = data_structures::stamped_integer;
+
+  std::atomic<stamped_state> stamped_state_{uninitialized};  // Starts out as 'uninitialized'.
+  base_task *task_;  // Not initialized here; task_manager::publish_task() assigns it.
+};
 
 /**
  * Handles management of tasks in the system. Each thread has a local task manager,
@@ -22,42 +33,65 @@ class task;
  * integrate the memory management into the stealing procedure.
  */
 class task_manager {
-  using task_manager_offset = data_structures::aligned_stack::stack_offset;
-
  public:
-  // Data each task needs to store to enable the 'return_task' functionality.
-  using task_manager_state = task_manager_offset;
-
-  // Construct a task onto the stack. Stores the previous offset in the newly constructed task.
-  template<class T, typename ...ARGS>
-  T *push_task(ARGS ...args);
   // Publishes a task on the stack, i.e. makes it visible for other threads to steal.
-  void publish_task(task *task);
-  // Return a no longer needed task to the stack. Must be the current most top task (will reset the stack pointer).
-  void return_task(task *task);
+  // Only a pointer to the task is stored; the task object itself stays where the caller created it.
+  void publish_task(base_task &task) {
+    task_handle_stack_[tail_internal_].task_ = &task;
+    task_handle_stack_[tail_internal_].stamped_state_.store({stamp_internal_++, task_handle::initialized},
+                                                            std::memory_order_relaxed);
+    tail_internal_++;
+    tail_.store(tail_internal_, std::memory_order_release); // Linearization point, handle is published here
+  }
 
   // Try to pop a local task from this task managers stack.
-  task *pop_local_task();
-  // Try to steal a task from a remote task_manager instance.
-  // The returned task pointer is valid during the lifetyme of the task.
-  // The returned task pointer must be returned to this task_manager instance.
-  // (This is because we can either decide to just steal a remote task pointer or to copy the whole task)
-  task *pop_remote_task(task_manager &other);
+  // Fast path only: 'un-publishes' the most recently published task.
+  // Returns true if no other thread got hold of the task in the meantime, false otherwise.
+  // NOTE(review): (stamp, state) argument order mirrors publish_task() - confirm against stamped_integer.
+  bool steal_local_task() {
+    tail_internal_--;
+    tail_.store(tail_internal_, std::memory_order_relaxed);
+
+    const task_handle::stamped_state local_state{stamp_internal_++, task_handle::execute_local};
+    // exchange() returns the state held *before* the swap; only that value tells us who owns the task.
+    const task_handle::stamped_state previous_state =
+        task_handle_stack_[tail_internal_].stamped_state_.exchange(local_state, std::memory_order_acq_rel);
+
+    if (previous_state.value == task_handle::execute_remote || previous_state.value == task_handle::finished) {
+      // Someone got the other task, return to 'non linear' execution path
+      // TODO: Properly handle slow path
+      return false;
+    }
+    // No one got the task so far, we are happy and continue our fast path
+    return true;
+  }
+
+  // Try to steal a task from a remote task_manager instance. The stolen task must be stored locally.
+  // Returns a pair containing the actual task and if the steal was successful.
+  // TODO: Re-implement after fast path is done
+//  std::pair<task, bool> steal_remote_task(task_manager &other);
+
-  explicit task_manager(data_structures::aligned_stack &task_stack) : task_stack_{task_stack} {}
+  explicit task_manager(task_handle *task_handle_stack) : task_handle_stack_{task_handle_stack},
+                                                          head_{{0}},
+                                                          tail_{0},
+                                                          tail_internal_{0},
+                                                          stamp_internal_{0} {}
 
  private:
-  data_structures::aligned_stack &task_stack_;
+  task_handle *task_handle_stack_;  // Handle array supplied by the owner; not freed by this class.
+  alignas(base::system_details::CACHE_LINE_SIZE) std::atomic<data_structures::stamped_integer> head_;
+  alignas(base::system_details::CACHE_LINE_SIZE) std::atomic<unsigned int> tail_;
+  alignas(base::system_details::CACHE_LINE_SIZE) unsigned int tail_internal_, stamp_internal_;
 };
 
 template<size_t NUM_TASKS, size_t MAX_STACK_SIZE>
 class static_task_manager {
  public:
-  static_task_manager() : static_task_stack_{}, task_manager_{static_task_stack_} {};
+  static_task_manager() : static_task_handle_stack_{}, task_manager_{static_task_handle_stack_.data()} {};
   task_manager &get_task_manager() { return task_manager_; }
 
  private:
-  data_structures::static_aligned_stack<MAX_STACK_SIZE> static_task_stack_;
+  std::array<task_handle, NUM_TASKS> static_task_handle_stack_;  // Storage passed to task_manager_ via data().
   task_manager task_manager_;
 };