diff --git a/CMakeLists.txt b/CMakeLists.txt
index bc1107e..d5bcb81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,8 @@
 cmake_minimum_required(VERSION 3.10)
 project(predictable_parallel_patterns
         VERSION 0.0.1
-        DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing")
+        DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing"
+        LANGUAGES CXX ASM)  # ASM is enabled so app/playground/custom_stack_callback.s can be assembled
 
 set(CMAKE_CXX_STANDARD 11)
 
diff --git a/app/playground/CMakeLists.txt b/app/playground/CMakeLists.txt
index 3ea2ce1..9482fb6 100644
--- a/app/playground/CMakeLists.txt
+++ b/app/playground/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_executable(playground main.cpp)
+add_executable(playground main.cpp custom_stack_callback.s)  # the .s file defines custom_stack_callback, which main.cpp declares and calls
 
 # Example for adding the library to your app (as a cmake project dependency)
-target_link_libraries(playground pls)
+target_link_libraries(playground)  # pls is no longer linked; the new main.cpp includes only standard library headers
diff --git a/app/playground/custom_stack_callback.s b/app/playground/custom_stack_callback.s
new file mode 100644
index 0000000..085350c
--- /dev/null
+++ b/app/playground/custom_stack_callback.s
@@ -0,0 +1,16 @@
+	.file	"custom_stack_callback.s"
+	.text
+	.global	custom_stack_callback  # exported so the C++ code in main.cpp can call it
+	.type	custom_stack_callback, @function
+
+.align 16
+custom_stack_callback:
+        # rdi = new stack address (passed as parameter)
+        # r12 = temporary holding the old stack pointer (callee saved, so it still holds the correct value when callback returns)
+        push %r12           # store the callee saved register as required
+        movq %rsp, %r12     # store current stack pointer
+        movq %rdi, %rsp     # update stack pointer to new user level stack (NOTE(review): rdi is used unmodified, so 16-byte alignment at the call below depends on the caller - confirm)
+        call callback       # enter next tasks (will not return if continuation is stolen)
+        movq %r12, %rsp     # restore to the old stack pointer
+        pop %r12            # restore the callee saved register as required
+    	ret                 # return to the caller using the return address on the original stack
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index d7d0c07..8022bfd 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -1,91 +1,75 @@
-#include <iostream>
+#include <cstdio>
+#include <csetjmp>
+#include <cstring>
 #include <chrono>
 
-#include "pls/internal/scheduling/scheduler.h"
-#include "pls/internal/scheduling/parallel_result.h"
-#include "pls/internal/scheduling/scheduler_memory.h"
-#include "pls/internal/data_structures/bounded_trading_deque.h"
+using namespace std;
 
-using namespace pls::internal;
+// Settings for stack and benchmark
+const unsigned int NUM_RUNS = 100000;  // iterations of each measurement loop
+const unsigned int STACK_SIZE = 512 * 1;  // size of custom_stack in bytes
+const unsigned char MAGIC_NUMBER = (unsigned char) 0xAB;  // fill byte main() uses to detect which stack bytes were overwritten
 
-constexpr size_t MAX_NUM_THREADS = 1;
+// Memory for custom stack and continuation semantics
+unsigned char custom_stack[STACK_SIZE] = {0};  // memory handed to custom_stack_callback() as the stack
+jmp_buf buffer;  // execution context saved by setjmp() in measure_continuation()
 
-constexpr size_t MAX_NUM_TASKS = 128;
-static constexpr int NUM_ITERATIONS = 10;
+// Example callback function and declaration of our assembly stack switching routine.
+extern "C" {  // C linkage: the assembly file refers to these functions by their unmangled names
+void custom_stack_callback(void *);  // defined in custom_stack_callback.s; calls callback() with the argument installed as stack pointer
 
-constexpr size_t MAX_NUM_CONTS = 128;
-constexpr size_t MAX_CONT_SIZE = 256;
+void __attribute__ ((noinline)) callback() {  // noinline so the direct calls in measure_function_call() stay real calls
+  static volatile int tmp;  // volatile so the store below is not optimized away
+  tmp = 0; // Force at least a single memory write
+}
+}
 
-int fib_normal(int n) {
-  if (n == 0) {
-    return 0;
-  }
-  if (n == 1) {
-    return 1;
+long __attribute__ ((noinline)) measure_function_call() {  // returns the total nanoseconds spent on NUM_RUNS direct calls of callback()
+  auto start_time = chrono::steady_clock::now();
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
+    callback();  // baseline: plain call on the current stack
   }
-
-  int result = fib_normal(n - 1) + fib_normal(n - 2);
-  return result;
+  auto end_time = chrono::steady_clock::now();
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
 }
 
-scheduling::parallel_result<int> fib(int n) {
-  pls::variable<int> i;
-  pls::array<int> a{10};
-  if (n == 0) {
-    return 0;
-  }
-  if (n == 1) {
-    return 1;
+long __attribute__ ((noinline)) measure_stack_switch() {  // returns the total nanoseconds spent on NUM_RUNS calls of callback() made on custom_stack
+  auto start_time = chrono::steady_clock::now();
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
+    custom_stack_callback(&custom_stack[STACK_SIZE - 16]);  // initial stack pointer is 16 bytes below the end of the array
   }
+  auto end_time = chrono::steady_clock::now();
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
+}
 
-  return scheduling::scheduler::par([=]() {
-    return fib(n - 1);
-  }, [=]() {
-    return fib(n - 2);
-  }).then([=](int a, int b) {
-    return scheduling::parallel_result<int>{a + b};
-  });
+long __attribute__ ((noinline)) measure_continuation() {  // returns the total nanoseconds spent on NUM_RUNS rounds of setjmp() plus a stack-switched callback()
+  auto start_time = chrono::steady_clock::now();
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
+    if (setjmp(buffer) == 0) {  // 0 means the context was just saved; no longjmp() to buffer is visible in this file
+      custom_stack_callback(&custom_stack[STACK_SIZE - 16]);  // same stack switch as in measure_stack_switch()
+    }
+  }
+  auto end_time = chrono::steady_clock::now();
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
 }
 
-static volatile int result;
 int main() {
-  PROFILE_ENABLE;
-  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
-                                      MAX_NUM_TASKS,
-                                      MAX_NUM_CONTS,
-                                      MAX_CONT_SIZE> static_scheduler_memory;
-
-  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
-
-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    result = fib_normal(35);
-  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
+  memset(custom_stack, MAGIC_NUMBER, STACK_SIZE);  // mark every stack byte so overwritten ones can be found after the runs
 
-  start = std::chrono::steady_clock::now();
+  auto time_cont = measure_continuation();  // each measurement returns total nanoseconds for NUM_RUNS iterations
+  auto time_stack = measure_stack_switch();
+  auto time_func = measure_function_call();
 
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    scheduler.perform_work([]() {
-      PROFILE_MAIN_THREAD;
-      return scheduling::scheduler::par([]() {
-        return scheduling::parallel_result<int>(0);
-      }, []() {
-        return fib(35);
-      }).then([](int, int b) {
-        result = b;
-        PROFILE_LOCK("DONE");
-        return scheduling::parallel_result<int>{0};
-      });
-    });
-    PROFILE_LOCK("DONE");
+  for (unsigned int i = 0; i < STACK_SIZE; i++) {  // scan upward from the lowest address of custom_stack
+    if (custom_stack[i] != MAGIC_NUMBER) {  // first byte that no longer holds the marker
+      printf("Used stack size about %u bytes.\n\n\n", (STACK_SIZE - i));  // measured from the array end, so it includes the 16 bytes above the initial stack pointer
+      break;
+    }
   }
-  PROFILE_SAVE("test_profile.prof");
 
-  end = std::chrono::steady_clock::now();
-  std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << std::endl;
+  printf("Function Call    : %10ld, %5.5f\n", time_func, ((float) time_func / NUM_RUNS));  // columns: total nanoseconds, average nanoseconds per run
+  printf("Stack Switching  : %10ld, %5.5f\n", time_stack, ((float) time_stack / NUM_RUNS));
+  printf("Full Continuation: %10ld, %5.5f\n", time_cont, ((float) time_cont / NUM_RUNS));
 
   return 0;
 }
diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake
index f2f66ec..5d22958 100644
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # but inlining functions and SIMD/Vectorization is
     # only enabled by -O3, thus it's way faster in some
     # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
     set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")