From a4b03ffe6fd4f716170afbb8c5bc8e96229c4533 Mon Sep 17 00:00:00 2001
From: FritzFlorian
Date: Thu, 6 Jun 2019 15:27:24 +0200
Subject: [PATCH] Found general problem for FFT performance.

---
 PERFORMANCE.md                                        | 37 +++++++++++++++++++++++++++++++++++++
 lib/pls/include/pls/algorithms/invoke_parallel_impl.h | 10 ++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 45ee6e9..dbb786c 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -318,3 +318,40 @@ threads (threads without any actual work) and the threads actually
 performing work. Most likely there is a resource on the same cache line
 used that hinders the working threads, but we can not really figure out
 which one it is.
+
+### Commit be2cdbfe - Locking Deque
+
+Switching to a locking deque has not improved performance (it even
+hurt it slightly); we therefore think that the deque itself is not
+the part slowing down our execution.
+
+### Commit 5044f0a1 - Performance Bottleneck in FFT FIXED
+
+By moving from directly calling one of the parallel invocations
+
+```c++
+scheduler::spawn_child(sub_task_2);
+function1(); // Execute first function 'inline' without spawning a sub_task object
+```
+
+to spawning two tasks
+```c++
+scheduler::spawn_child(sub_task_2);
+scheduler::spawn_child(sub_task_1);
+```
+
+we were able to fix the bad performance of our framework in the
+FFT benchmark (where some worker threads spend a lot of time
+spinning/idling).
+
+We think this is due to some sort of cache misses/bus contention
+on the finishing counters. This would make sense, as the drop
+at the hyperthreading mark indicates problems with this part of the
+CPU pipeline (although it did not show clearly in our profiling runs).
+We will now try to find the exact spot where the problem originates and
+fix the source rather than 'circumventing' it with these extra tasks.
+(This should then, again, hopefully even boost the performance of
+all other workloads, as contention on the bus/cache is always bad.)
+
+
+
diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
index 4b337bd..c96fb3f 100644
--- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
+++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
@@ -13,10 +13,13 @@ template<typename Function1, typename Function2>
 void invoke_parallel(const Function1 &function1, const Function2 &function2) {
   using namespace ::pls::internal::scheduling;
 
+  auto sub_task_1 = lambda_task_by_reference(function1);
   auto sub_task_2 = lambda_task_by_reference(function2);
 
   scheduler::spawn_child(sub_task_2);
-  function1(); // Execute first function 'inline' without spawning a sub_task object
+  scheduler::spawn_child(sub_task_1);
+  // TODO: Research the exact cause of this being faster
+//  function1(); // Execute first function 'inline' without spawning a sub_task object
 
   scheduler::wait_for_all();
 }
@@ -24,12 +27,15 @@ template<typename Function1, typename Function2, typename Function3>
 void invoke_parallel(const Function1 &function1, const Function2 &function2, const Function3 &function3) {
   using namespace ::pls::internal::scheduling;
 
+  auto sub_task_1 = lambda_task_by_reference(function1);
   auto sub_task_2 = lambda_task_by_reference(function2);
   auto sub_task_3 = lambda_task_by_reference(function3);
 
   scheduler::spawn_child(sub_task_2);
   scheduler::spawn_child(sub_task_3);
-  function1(); // Execute first function 'inline' without spawning a sub_task object
+  scheduler::spawn_child(sub_task_1);
+  // TODO: Research the exact cause of this being faster
+//  function1(); // Execute first function 'inline' without spawning a sub_task object
 
   scheduler::wait_for_all();
 }
-- 
libgit2 0.26.0
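
If the suspected bus/cache contention on the finishing counters really is false sharing, the usual remedy is to give each hot counter its own cache line. The following is a minimal, self-contained sketch, not part of the patch and independent of the pls internals (the counter names, the 64-byte line size, and the timing harness are all assumptions for illustration), showing how `alignas` padding separates two atomic counters that would otherwise share a line:

```c++
// Illustrative sketch only (not pls code): demonstrates how padding shared
// counters onto separate cache lines removes false sharing between two
// threads that each increment "their own" counter.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <thread>

constexpr std::size_t kCacheLineSize = 64;  // assumed typical cache line size

// Both counters likely share one cache line: every increment by one thread
// invalidates that line in the other thread's cache.
struct packed_counters {
  std::atomic<long> a{0};
  std::atomic<long> b{0};
};

// Each counter gets its own cache line, so the two threads no longer
// invalidate each other's cached copy.
struct padded_counters {
  alignas(kCacheLineSize) std::atomic<long> a{0};
  alignas(kCacheLineSize) std::atomic<long> b{0};
};

// Two threads hammer the two counters independently; returns elapsed seconds.
template <typename Counters>
double hammer(Counters &counters, long iterations) {
  auto start = std::chrono::steady_clock::now();
  std::thread t1([&] {
    for (long i = 0; i < iterations; i++) {
      counters.a.fetch_add(1, std::memory_order_relaxed);
    }
  });
  std::thread t2([&] {
    for (long i = 0; i < iterations; i++) {
      counters.b.fetch_add(1, std::memory_order_relaxed);
    }
  });
  t1.join();
  t2.join();
  return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
}

int main() {
  constexpr long iterations = 50'000'000;
  packed_counters packed;
  padded_counters padded;

  // The padded variant is typically several times faster on multi-core CPUs.
  std::cout << "packed counters: " << hammer(packed, iterations) << " s\n";
  std::cout << "padded counters: " << hammer(padded, iterations) << " s\n";
}
```

If the finishing counters (or whatever shares their cache line) turn out to be the culprit, this kind of alignment inside the task data structures would address the root cause directly, instead of working around it with the extra spawned task.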