diff --git a/PERFORMANCE.md b/PERFORMANCE.md index dbb786c..7112439 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -354,4 +354,31 @@ fix the source rather then 'circumventing' it with these extra tasks. performance, as contemption on the bus/cache is always bad) - +After some research we think that the issue is down to many threads +referencing the same atomic reference counter. We think so because +even cache aligning the shared reference count does not fix the issue +when using the direct function call. Also, forcing a new method call +(going down in the call stack one function call) is not solving the +issue (thus making sure that it is not related with some caching issue +in the call itself). + +In conclusion there seems to be a hyperthreading issue with this +shared reference count. We keep this in mind if we eventually get +tasks with changing data members (as this problem could reappear there, +as then the ref_count actually is in the same memory region as our +'user variables'). For now we leave the code like it is. + + +FFT Average with new call method: + + + +The performance of our new call method looks shockingly similar +to TBB with a slight, constant performance drop behind it. +This makes sense, as the basic principles (lock-free, classic work +stealing deque and the parallel call structure) are nearly the same. + +We will see if minor optimizations can even close this last gap. +Overall the performance at this point is good enough to move on +to implementing more functionality and to running tests on different +queues/stealing tactics etc. 
diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h index c96fb3f..ef634bd 100644 --- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h +++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h @@ -5,6 +5,7 @@ #include "pls/internal/scheduling/task.h" #include "pls/internal/scheduling/lambda_task.h" #include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/thread_state.h" namespace pls { namespace algorithm { @@ -17,10 +18,7 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2) { auto sub_task_2 = lambda_task_by_reference(function2); scheduler::spawn_child(sub_task_2); - scheduler::spawn_child(sub_task_1); - // TODO: Research the exact cause of this being faster -// function1(); // Execute first function 'inline' without spawning a sub_task object - scheduler::wait_for_all(); + scheduler::spawn_child_and_wait(sub_task_1); } template @@ -31,12 +29,9 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2, con auto sub_task_2 = lambda_task_by_reference(function2); auto sub_task_3 = lambda_task_by_reference(function3); - scheduler::spawn_child(sub_task_2); scheduler::spawn_child(sub_task_3); - scheduler::spawn_child(sub_task_1); - // TODO: Research the exact cause of this being faster -// function1(); // Execute first function 'inline' without spawning a sub_task object - scheduler::wait_for_all(); + scheduler::spawn_child(sub_task_2); + scheduler::spawn_child_and_wait(sub_task_1); } } diff --git a/lib/pls/include/pls/algorithms/parallel_for_impl.h b/lib/pls/include/pls/algorithms/parallel_for_impl.h index 786b875..b787b44 100644 --- a/lib/pls/include/pls/algorithms/parallel_for_impl.h +++ b/lib/pls/include/pls/algorithms/parallel_for_impl.h @@ -25,11 +25,13 @@ void parallel_for(RandomIt first, RandomIt last, const Function &function) { // Cut in half recursively long middle_index = num_elements / 2; - auto body = [=] 
{ parallel_for(first + middle_index, last, function); }; - lambda_task_by_reference second_half_task(body); + auto body2 = [=] { parallel_for(first + middle_index, last, function); }; + lambda_task_by_reference second_half_task(body2); scheduler::spawn_child(second_half_task); - parallel_for(first, first + middle_index, function); + auto body1 = [=] { parallel_for(first, first + middle_index, function); }; + lambda_task_by_reference first_half_task(body1); + scheduler::spawn_child(first_half_task); scheduler::wait_for_all(); } } diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index c38ae64..54075fc 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -84,6 +84,15 @@ class scheduler { static void spawn_child(T &sub_task); /** + * Helper to spawn a child on the currently running task and waiting for it (skipping over the task-deque). + * + * @tparam T type of the new task + * @param sub_task the new task to be spawned + */ + template + static void spawn_child_and_wait(T &sub_task); + + /** * Helper to wait for all children of the currently executing task. 
*/ static void wait_for_all(); diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h index 46265b7..98156dc 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h @@ -34,6 +34,11 @@ void scheduler::spawn_child(T &sub_task) { thread_state::get()->current_task_->spawn_child(sub_task); } +template +void scheduler::spawn_child_and_wait(T &sub_task) { + thread_state::get()->current_task_->spawn_child_and_wait(sub_task); +} + } } } diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h index 30e04b0..cdeb9f2 100644 --- a/lib/pls/include/pls/internal/scheduling/task.h +++ b/lib/pls/include/pls/internal/scheduling/task.h @@ -35,6 +35,8 @@ class task { template void spawn_child(T &&sub_task); + template + void spawn_child_and_wait(T &&sub_task); void wait_for_all(); private: @@ -58,6 +60,19 @@ void task::spawn_child(T &&sub_task) { thread_state::get()->deque_.push_tail(const_task); } +template +void task::spawn_child_and_wait(T &&sub_task) { + PROFILE_FORK_JOIN_STEALING("spawn_child") + static_assert(std::is_base_of::type>::value, "Only pass task subclasses!"); + + // Assign forced values (for stack and parent management) + sub_task.parent_ = nullptr; + sub_task.deque_state_ = thread_state::get()->deque_.save_state(); + sub_task.execute(); + + wait_for_all(); +} + } } } diff --git a/lib/pls/src/internal/data_structures/locking_deque.cpp b/lib/pls/src/internal/data_structures/locking_deque.cpp deleted file mode 100644 index 90971ce..0000000 --- a/lib/pls/src/internal/data_structures/locking_deque.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include - -#include "pls/internal/data_structures/locking_deque.h" - -namespace pls { -namespace internal { -namespace data_structures { - -locking_deque_item *locking_deque_internal::pop_head_internal() { - std::lock_guard lock{lock_}; - - if 
(head_ == nullptr) { - return nullptr; - } - - locking_deque_item *result = head_; - head_ = head_->next_; - if (head_ == nullptr) { - tail_ = nullptr; - } else { - head_->prev_ = nullptr; - } - - return result; -} - -locking_deque_item *locking_deque_internal::pop_tail_internal() { - std::lock_guard lock{lock_}; - - if (tail_ == nullptr) { - return nullptr; - } - - locking_deque_item *result = tail_; - tail_ = tail_->prev_; - if (tail_ == nullptr) { - head_ = nullptr; - } else { - tail_->next_ = nullptr; - } - - return result; -} - -void locking_deque_internal::push_tail_internal(locking_deque_item *new_item) { - std::lock_guard lock{lock_}; - - if (tail_ != nullptr) { - tail_->next_ = new_item; - } else { - head_ = new_item; - } - new_item->prev_ = tail_; - new_item->next_ = nullptr; - tail_ = new_item; -} - -} -} -} diff --git a/media/5044f0a1_fft_average.png b/media/5044f0a1_fft_average.png new file mode 100644 index 0000000..deddf73 Binary files /dev/null and b/media/5044f0a1_fft_average.png differ