From 64e2238c5d53cb00c4ff952fa0e72455840970bf Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Mon, 3 Jun 2019 13:42:27 +0200 Subject: [PATCH] Add further performance notes from vtune and 2D heat diffusion. --- PERFORMANCE.md | 39 +++++++++++++++++++++++++++++++++++++++ app/benchmark_matrix/main.cpp | 4 ++-- app/benchmark_unbalanced/main.cpp | 40 ++++++++++++++++++++-------------------- app/invoke_parallel/main.cpp | 2 +- lib/pls/src/internal/scheduling/abstract_task.cpp | 2 +- media/116cf4af_fft_vtune.png | Bin 0 -> 146759 bytes media/116cf4af_heat_average.png | Bin 0 -> 202428 bytes media/116cf4af_matrix_vtune.png | Bin 0 -> 153920 bytes 8 files changed, 63 insertions(+), 24 deletions(-) create mode 100644 media/116cf4af_fft_vtune.png create mode 100644 media/116cf4af_heat_average.png create mode 100644 media/116cf4af_matrix_vtune.png diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 7cc7c1e..ea9e983 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -242,3 +242,42 @@ case of FFT). We also want to note that all these measurements are not very controlled/scientific, but simply ran ot our notebook for fast iterations over different, potential issues with our scheduler. + + +### Commit 116cf4af - VTune Amplifier and MRSW top level lock + +When looking at why our code works quite well on problems with +mostly busy workers and not so well on code with spinning/waiting +workers (like in the FFT), we take a closer look at the FFT and +matrix multiplication in VTune. + +FFT: + + + +Matrix: + + + +The sections highlighted in red represent parts of the code spent +on spinning in the work-stealing loop. +We can see that as long as our workers are mainly busy/find work +in the stealing loop the overhead spent on spinning is minimal. +We can also see that in the FFT considerable amounts of time are +spent spinning. + +A general observation is the high CPI rates for our spinning code. 
+This makes sense, as we are currently working on locks that share +atomic variables in order to work, thus leading to cache misses. + +### Commit 116cf4af - 2D Heat Diffusion + +As a last test for our current state on performance we implemented the +2D heat diffusion benchmark using our framework (using fork-join based +parallel_for, 512 heat array size): + + + +We observe solid performance from our implementation. +(Again, not very scientific test environment, but good enough for +our general direction) diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp index c633820..bb21de6 100644 --- a/app/benchmark_matrix/main.cpp +++ b/app/benchmark_matrix/main.cpp @@ -71,7 +71,7 @@ int main() { // // scheduler.perform_work([&] { // PROFILE_MAIN_THREAD -// for (int i = 0; i < 10; i++) { +// for (int i = 0; i < 5000; i++) { // PROFILE_WORK_BLOCK("Top Level") // result.multiply(a, b); // } @@ -79,4 +79,4 @@ int main() { // // PROFILE_SAVE("test_profile.prof") //} - +// diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp index 27cdaf3..1860877 100644 --- a/app/benchmark_unbalanced/main.cpp +++ b/app/benchmark_unbalanced/main.cpp @@ -50,29 +50,29 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi return result; } +// +//int main() { +// PROFILE_ENABLE +// pls::internal::helpers::run_mini_benchmark([&] { +// unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN); +// }, 8, 4000); +// +// PROFILE_SAVE("test_profile.prof") +//} int main() { PROFILE_ENABLE - pls::internal::helpers::run_mini_benchmark([&] { - unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN); - }, 8, 4000); + pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18}; + pls::scheduler scheduler{&my_scheduler_memory, 8}; + + scheduler.perform_work([&] { + PROFILE_MAIN_THREAD + for (int i = 0; i < 50; i++) { + PROFILE_WORK_BLOCK("Top Level") + int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, 
NORMAL_CHILDREN); + std::cout << result << std::endl; + } + }); PROFILE_SAVE("test_profile.prof") } - -//int main() { -// PROFILE_ENABLE -// pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18}; -// pls::scheduler scheduler{&my_scheduler_memory, 8}; -// -// scheduler.perform_work([&] { -// PROFILE_MAIN_THREAD -// for (int i = 0; i < 10; i++) { -// PROFILE_WORK_BLOCK("Top Level") -// int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN); -// std::cout << result << std::endl; -// } -// }); -// -// PROFILE_SAVE("test_profile.prof") -//} diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp index 4382168..e469bad 100644 --- a/app/invoke_parallel/main.cpp +++ b/app/invoke_parallel/main.cpp @@ -91,7 +91,7 @@ int main() { PROFILE_MAIN_THREAD // Call looks just the same, only requirement is // the enclosure in the perform_work lambda. - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 1000; i++) { PROFILE_WORK_BLOCK("Top Level FFT") complex_vector input = initial_input; fft(input.begin(), input.size()); diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp index 85e2c36..0fb15d2 100644 --- a/lib/pls/src/internal/scheduling/abstract_task.cpp +++ b/lib/pls/src/internal/scheduling/abstract_task.cpp @@ -18,7 +18,7 @@ bool abstract_task::steal_work() { const size_t my_id = my_state->id_; const size_t offset = my_state->random_() % my_scheduler->num_threads(); - const size_t max_tries = my_scheduler->num_threads() - 1; // TODO: Tune this value + const size_t max_tries = my_scheduler->num_threads(); // TODO: Tune this value for (size_t i = 0; i < max_tries; i++) { size_t target = (offset + i) % my_scheduler->num_threads(); if (target == my_id) { diff --git a/media/116cf4af_fft_vtune.png b/media/116cf4af_fft_vtune.png new file mode 100644 index 0000000..01008cb Binary files /dev/null and b/media/116cf4af_fft_vtune.png differ diff --git 
a/media/116cf4af_heat_average.png b/media/116cf4af_heat_average.png new file mode 100644 index 0000000..24f159b Binary files /dev/null and b/media/116cf4af_heat_average.png differ diff --git a/media/116cf4af_matrix_vtune.png b/media/116cf4af_matrix_vtune.png new file mode 100644 index 0000000..8182121 Binary files /dev/null and b/media/116cf4af_matrix_vtune.png differ -- libgit2 0.26.0