From a4b03ffe6fd4f716170afbb8c5bc8e96229c4533 Mon Sep 17 00:00:00 2001
From: FritzFlorian
Date: Thu, 6 Jun 2019 15:27:24 +0200
Subject: [PATCH] Found general problem for FFT performance.

---
 PERFORMANCE.md                                        | 37 +++++++++++++++++++++++++++++++++++++
 lib/pls/include/pls/algorithms/invoke_parallel_impl.h | 10 ++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 45ee6e9..dbb786c 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -318,3 +318,40 @@ threads (threads without any actual work) and the threads actually
 performing work. Most likely there is a resource on the same cache line
 used that hinders the working threads, but we can not really figure out
 which one it is.
+
+### Commit be2cdbfe - Locking Deque
+
+Switching to a locking deque has not improved performance (it even
+hurt it slightly); we therefore think that the deque itself is not
+the part slowing down our execution.
+
+### Commit 5044f0a1 - Performance Bottleneck in FFT FIXED
+
+By moving from directly calling one of the parallel invocations
+
+```c++
+scheduler::spawn_child(sub_task_2);
+function1(); // Execute first function 'inline' without spawning a sub_task object
+```
+
+to spawning two tasks
+```c++
+scheduler::spawn_child(sub_task_2);
+scheduler::spawn_child(sub_task_1);
+```
+
+we were able to fix the bad performance of our framework in the
+FFT benchmark (where some worker threads spend a lot of time
+spinning/idling).
+
+We think this is due to some sort of cache misses/bus contention
+on the finishing counters. This would make sense, as the drop
+at the hyperthreading mark indicates problems with this part of the
+CPU pipeline (although it did not show clearly in our profiling runs).
+We will now try to find the exact spot where the problem originates and
+fix the source rather than 'circumventing' it with these extra tasks.
+(This should then, again, hopefully even boost the performance of
+all other workloads, as contention on the bus/cache is always bad.)
+
+
+
diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
index 4b337bd..c96fb3f 100644
--- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
+++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
@@ -13,10 +13,13 @@ template<typename Function1, typename Function2>
 void invoke_parallel(const Function1 &function1, const Function2 &function2) {
   using namespace ::pls::internal::scheduling;
 
+  auto sub_task_1 = lambda_task_by_reference(function1);
   auto sub_task_2 = lambda_task_by_reference(function2);
 
   scheduler::spawn_child(sub_task_2);
-  function1(); // Execute first function 'inline' without spawning a sub_task object
+  scheduler::spawn_child(sub_task_1);
+  // TODO: Research the exact cause of this being faster
+//  function1(); // Execute first function 'inline' without spawning a sub_task object
 
   scheduler::wait_for_all();
 }
@@ -24,12 +27,15 @@ template<typename Function1, typename Function2, typename Function3>
 void invoke_parallel(const Function1 &function1, const Function2 &function2, const Function3 &function3) {
   using namespace ::pls::internal::scheduling;
 
+  auto sub_task_1 = lambda_task_by_reference(function1);
   auto sub_task_2 = lambda_task_by_reference(function2);
   auto sub_task_3 = lambda_task_by_reference(function3);
 
   scheduler::spawn_child(sub_task_2);
   scheduler::spawn_child(sub_task_3);
-  function1(); // Execute first function 'inline' without spawning a sub_task object
+  scheduler::spawn_child(sub_task_1);
+  // TODO: Research the exact cause of this being faster
+//  function1(); // Execute first function 'inline' without spawning a sub_task object
 
   scheduler::wait_for_all();
 }
-- 
libgit2 0.26.0
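
If the suspected bus/cache contention on the finishing counters really is false sharing, the usual remedy is to give each hot counter its own cache line. The following is a minimal, self-contained sketch, not part of the patch and independent of the pls internals (the counter names, the 64-byte line size, and the timing harness are all assumptions for illustration), showing how `alignas` padding separates two atomic counters that would otherwise share a line:

```c++
// Illustrative sketch only (not pls code): demonstrates how padding shared
// counters onto separate cache lines removes false sharing between two
// threads that each increment "their own" counter.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <iostream>
#include <thread>

constexpr std::size_t kCacheLineSize = 64;  // assumed typical cache line size

// Both counters likely share one cache line: every increment by one thread
// invalidates that line in the other thread's cache.
struct packed_counters {
  std::atomic<long> a{0};
  std::atomic<long> b{0};
};

// Each counter gets its own cache line, so the two threads no longer
// invalidate each other's cached copy.
struct padded_counters {
  alignas(kCacheLineSize) std::atomic<long> a{0};
  alignas(kCacheLineSize) std::atomic<long> b{0};
};

// Two threads hammer the two counters independently; returns elapsed seconds.
template <typename Counters>
double hammer(Counters &counters, long iterations) {
  auto start = std::chrono::steady_clock::now();
  std::thread t1([&] {
    for (long i = 0; i < iterations; i++) {
      counters.a.fetch_add(1, std::memory_order_relaxed);
    }
  });
  std::thread t2([&] {
    for (long i = 0; i < iterations; i++) {
      counters.b.fetch_add(1, std::memory_order_relaxed);
    }
  });
  t1.join();
  t2.join();
  return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
}

int main() {
  constexpr long iterations = 50'000'000;
  packed_counters packed;
  padded_counters padded;

  // The padded variant is typically several times faster on multi-core CPUs.
  std::cout << "packed counters: " << hammer(packed, iterations) << " s\n";
  std::cout << "padded counters: " << hammer(padded, iterations) << " s\n";
}
```

If the finishing counters (or whatever shares their cache line) turn out to be the culprit, this kind of alignment inside the task data structures would address the root cause directly, instead of working around it with the extra spawned task.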