diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 64f2915..5a35e16 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
 
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 64;
+constexpr int MAX_STACK_SIZE = 1024 * 32;
 
 static_scheduler_memory<MAX_NUM_THREADS, MAX_NUM_TASKS, MAX_STACK_SIZE> global_scheduler_memory;
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ ... @@ class pls_matrix : public matrix::matrix {
  public:
   pls_matrix() : matrix::matrix() {}
 
-  void pls_multiply(const matrix::matrix &a, const matrix::matrix &b) {
-    pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
+  void multiply(const matrix::matrix &a, const matrix::matrix &b) override {
+    pls::algorithm::for_each_range(0, SIZE, [&](int i) {
       this->multiply_column(i, a, b);
     });
   }
@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix {
 
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 4;
+constexpr int MAX_STACK_SIZE = 1024 * 1;
 
 static_scheduler_memory<MAX_NUM_THREADS, MAX_NUM_TASKS, MAX_STACK_SIZE> global_scheduler_memory;
diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h
index fa2977c..c8696a8 100644
--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -34,6 +34,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
     depth_ = depth;
     thread_id_ = thread_id;
+
+    is_synchronized_ = false;
   }
 
   template<typename F>
@@ -44,9 +46,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
   // TODO: Proper access control and split it up into responsibilities
   // Stack/Continuation Management
   char *stack_memory_;
-  size_t stack_size_;
+  size_t stack_size_; // TODO: maybe remove it, not needed in here
   context_switcher::continuation continuation_;
+  bool is_synchronized_;
+
   // TODO: Clean up responsibilities
   // Work-Stealing
   std::atomic external_trading_deque_cas_{};
   std::atomic resource_stack_next_{};
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
index 4c4d740..ba11377 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
@@ -28,6 +28,7 @@ void task_manager::spawn_child(F &&lambda) {
     last_task->continuation_ = std::move(cont);
 
     // we are now executing the new task, allow others to steal the last task continuation.
+    spawned_task->is_synchronized_ = true;
     spawning_task_manager->active_task_ = spawned_task;
     spawning_task_manager->deque_.push_bot(last_task);
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index c414208..484337a 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -108,6 +108,7 @@ void scheduler::work_thread_work_section() {
       // Execute the stolen task by jumping to it's continuation.
       PLS_ASSERT(stolen_task->continuation_.valid(),
                  "A task that we can steal must have a valid continuation for us to start working.");
+      stolen_task->is_synchronized_ = false;
       context_switcher::switch_context(std::move(stolen_task->continuation_));
       // We will continue execution in this line when we finished the stolen work.
     }
diff --git a/lib/pls/src/internal/scheduling/task_manager.cpp b/lib/pls/src/internal/scheduling/task_manager.cpp
index eeff837..c0c7e12 100644
--- a/lib/pls/src/internal/scheduling/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/task_manager.cpp
@@ -128,23 +128,29 @@ void task_manager::sync() {
   auto *last_task = spawning_task_manager->active_task_;
   auto *spawned_task = spawning_task_manager->active_task_->next_;
 
-  auto continuation = spawned_task->run_as_task([=](context_switcher::continuation cont) {
-    last_task->continuation_ = std::move(cont);
-    spawning_task_manager->active_task_ = spawned_task;
-
-    context_switcher::continuation result_cont;
-    if (spawning_task_manager->try_clean_return(result_cont)) {
-      // We return back to the main scheduling loop
-      return result_cont;
-    } else {
-      // We finish up the last task
-      return result_cont;
-    }
-  });
-
-  PLS_ASSERT(!continuation.valid(),
-             "We only return to a sync point, never jump to it directly."
-             "This must therefore never return an unfinished fiber/continuation.");
+  if (last_task->is_synchronized_) {
+    return; // We are already the sole owner of last_task
+  } else {
+    auto continuation = spawned_task->run_as_task([=](context_switcher::continuation cont) {
+      last_task->continuation_ = std::move(cont);
+      spawning_task_manager->active_task_ = spawned_task;
+
+      context_switcher::continuation result_cont;
+      if (spawning_task_manager->try_clean_return(result_cont)) {
+        // We return back to the main scheduling loop
+        return result_cont;
+      } else {
+        // We finish up the last task
+        return result_cont;
+      }
+    });
+
+    PLS_ASSERT(!continuation.valid(),
+               "We only return to a sync point, never jump to it directly."
+               "This must therefore never return an unfinished fiber/continuation.");
+
+    return; // We cleanly synced to the last one finishing work on last_task
+  }
 }
@@ -195,6 +201,7 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont) {
 
   // We are the last one working on this task. Thus the sync must be finished, continue working.
   active_task_ = last_task;
+  last_task->is_synchronized_ = true;
 
   result_cont = std::move(last_task->continuation_);
   PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
   return false;
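A note on what the `is_synchronized_` flag buys, as far as the diff shows: PLS uses continuation stealing, so a spawned child always runs to completion on the spawning thread and only the parent's continuation can migrate to a thief. If that continuation was never stolen, every child is already finished by the time the parent reaches `sync()`, and the previous unconditional `run_as_task` context switch was wasted work. The flag tracks exactly this: it is set for the newly active task in `spawn_child` and again by the last finisher in `try_clean_return`, and cleared by a thief right before it resumes a stolen continuation. Below is a minimal, single-threaded sketch of that state machine; `toy_task` and the event functions are invented names modeling the writes the diff adds, not the PLS API.

```cpp
// Single-threaded toy model of the is_synchronized_ protocol.
// Names here are hypothetical; they mirror the transitions in the diff.
#include <cassert>
#include <cstdio>

struct toy_task {
  bool is_synchronized = false; // conservative default, as in task init
};

// spawn_child: the task becoming active starts out synchronized,
// since no thief has had a chance to take its continuation yet.
void on_spawn(toy_task &spawned) { spawned.is_synchronized = true; }

// work_thread_work_section: a thief clears the flag right before it
// resumes a stolen continuation, forcing the full sync protocol later.
void on_steal(toy_task &stolen) { stolen.is_synchronized = false; }

// try_clean_return: the last worker to finish re-establishes sole
// ownership before resuming the task.
void on_clean_return(toy_task &last) { last.is_synchronized = true; }

// sync(): the new fast path. A set flag means no thief can hold our
// continuation, so there is nothing to coordinate and no need for the
// expensive run_as_task context switch (the slow path).
bool sync_is_fast_path(const toy_task &t) { return t.is_synchronized; }

int main() {
  toy_task t;

  on_spawn(t);
  assert(sync_is_fast_path(t)); // never stolen => sync returns immediately

  on_steal(t);
  assert(!sync_is_fast_path(t)); // stolen => full slow-path sync required

  on_clean_return(t);
  assert(sync_is_fast_path(t)); // sole ownership re-established

  std::printf("is_synchronized_ transitions behave as expected\n");
  return 0;
}
```

Note that the flag stays a plain `bool` and the diff adds no atomics for it; presumably the existing deque/CAS operations on the stealing path already provide the thief/victim ordering it needs.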
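The benchmark tweaks are independent of the flag: they shrink the per-task stacks from 64 KiB to 32 KiB (FFT) and from 4 KiB to 1 KiB (matrix multiplication). Assuming `static_scheduler_memory<MAX_NUM_THREADS, MAX_NUM_TASKS, MAX_STACK_SIZE>` preallocates one stack per task slot per thread (an assumption the template parameters suggest but the diff does not show), a quick tally of the savings:

```cpp
// Hypothetical footprint check for the stack-size changes. Assumes the
// scheduler reserves MAX_NUM_THREADS * MAX_NUM_TASKS stacks of
// MAX_STACK_SIZE bytes each; the real static_scheduler_memory layout
// may differ.
#include <cstdio>

constexpr long long MAX_NUM_THREADS = 8;
constexpr long long MAX_NUM_TASKS = 32;

constexpr long long total_stack_bytes(long long stack_size) {
  return MAX_NUM_THREADS * MAX_NUM_TASKS * stack_size;
}

int main() {
  std::printf("fft:    %lld KiB -> %lld KiB\n",
              total_stack_bytes(1024 * 64) / 1024,
              total_stack_bytes(1024 * 32) / 1024);
  std::printf("matrix: %lld KiB -> %lld KiB\n",
              total_stack_bytes(1024 * 4) / 1024,
              total_stack_bytes(1024 * 1) / 1024);
}
```

Under that assumption the FFT benchmark's preallocated stack area halves from 16 MiB to 8 MiB, and the matrix benchmark's drops from 1 MiB to 256 KiB.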