From 64e2238c5d53cb00c4ff952fa0e72455840970bf Mon Sep 17 00:00:00 2001
From: FritzFlorian <flo.fritz@t-online.de>
Date: Mon, 3 Jun 2019 13:42:27 +0200
Subject: [PATCH] Add further performance notes from vtune and 2D heat diffusion.

---
 PERFORMANCE.md                                    | 39 +++++++++++++++++++++++++++++++++++++++
 app/benchmark_matrix/main.cpp                     |  4 ++--
 app/benchmark_unbalanced/main.cpp                 | 40 ++++++++++++++++++++--------------------
 app/invoke_parallel/main.cpp                      |  2 +-
 lib/pls/src/internal/scheduling/abstract_task.cpp |  2 +-
 media/116cf4af_fft_vtune.png                      | Bin 0 -> 146759 bytes
 media/116cf4af_heat_average.png                   | Bin 0 -> 202428 bytes
 media/116cf4af_matrix_vtune.png                   | Bin 0 -> 153920 bytes
 8 files changed, 63 insertions(+), 24 deletions(-)
 create mode 100644 media/116cf4af_fft_vtune.png
 create mode 100644 media/116cf4af_heat_average.png
 create mode 100644 media/116cf4af_matrix_vtune.png
diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 7cc7c1e..ea9e983 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -242,3 +242,42 @@ case of FFT).
 We also want to note that all these measurements are not very
 controlled/scientific, but simply ran ot our notebook for
 fast iterations over different, potential issues with our scheduler.
+
+
+### Commit 116cf4af - VTune Amplifier and MRSW top level lock
+
+When looking at why our code works quite well on problems with
+mostly busy workers and not so well on code with spinning/waiting
+workers (like in the FFT), we take a closer look at the FFT and
+matrix multiplication in VTune.
+
+FFT:
+
+<img src="media/116cf4af_fft_vtune.png" width="400"/>
+
+Matrix:
+
+<img src="media/116cf4af_matrix_vtune.png" width="400"/>
+
+The sections highlighted in red represent the time spent
+spinning in the work-stealing loop.
+We can see that as long as our workers are mainly busy/find work
+in the stealing loop the overhead spent on spinning is minimal.
+We can also see that in the FFT considerable amounts of time are
+spent spinning.
+
+A general observation is the high CPI rate of our spinning code.
+This makes sense, as we are currently working on locks that share
+atomic variables in order to work, thus leading to cache misses.
+
+### Commit 116cf4af - 2D Heat Diffusion
+
+As a last test for our current state on performance we implemented the
+2D heat diffusion benchmark using our framework (using fork-join based
+parallel_for, 512 heat array size):
+
+<img src="media/116cf4af_heat_average.png" width="400"/>
+
+We observe solid performance from our implementation.
+(Again, this is not a very scientific test environment, but it is
+good enough for our general direction.)
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index c633820..bb21de6 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -71,7 +71,7 @@ int main() {
 //
 //  scheduler.perform_work([&] {
 //    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10; i++) {
+//    for (int i = 0; i < 5000; i++) {
 //      PROFILE_WORK_BLOCK("Top Level")
 //      result.multiply(a, b);
 //    }
@@ -79,4 +79,4 @@ int main() {
 //
 //  PROFILE_SAVE("test_profile.prof")
 //}
-
+//
diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp
index 27cdaf3..1860877 100644
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -50,29 +50,29 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
 
   return result;
 }
+//
+//int main() {
+//  PROFILE_ENABLE
+//  pls::internal::helpers::run_mini_benchmark([&] {
+//    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+//  }, 8, 4000);
+//
+//  PROFILE_SAVE("test_profile.prof")
+//}
 
 int main() {
   PROFILE_ENABLE
-  pls::internal::helpers::run_mini_benchmark([&] {
-    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-  }, 8, 4000);
+  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
+  pls::scheduler scheduler{&my_scheduler_memory, 8};
+
+  scheduler.perform_work([&] {
+    PROFILE_MAIN_THREAD
+    for (int i = 0; i < 50; i++) {
+      PROFILE_WORK_BLOCK("Top Level")
+      int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+      std::cout << result << std::endl;
+    }
+  });
 
   PROFILE_SAVE("test_profile.prof")
 }
-
-//int main() {
-//  PROFILE_ENABLE
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
-//  pls::scheduler scheduler{&my_scheduler_memory, 8};
-//
-//  scheduler.perform_work([&] {
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10; i++) {
-//      PROFILE_WORK_BLOCK("Top Level")
-//      int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-//      std::cout << result << std::endl;
-//    }
-//  });
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp
index 4382168..e469bad 100644
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -91,7 +91,7 @@ int main() {
     PROFILE_MAIN_THREAD
     // Call looks just the same, only requirement is
     // the enclosure in the perform_work lambda.
-    for (int i = 0; i < 10; i++) {
+    for (int i = 0; i < 1000; i++) {
       PROFILE_WORK_BLOCK("Top Level FFT")
       complex_vector input = initial_input;
       fft(input.begin(), input.size());
diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp
index 85e2c36..0fb15d2 100644
--- a/lib/pls/src/internal/scheduling/abstract_task.cpp
+++ b/lib/pls/src/internal/scheduling/abstract_task.cpp
@@ -18,7 +18,7 @@ bool abstract_task::steal_work() {
 
   const size_t my_id = my_state->id_;
   const size_t offset = my_state->random_() % my_scheduler->num_threads();
-  const size_t max_tries = my_scheduler->num_threads() - 1; // TODO: Tune this value
+  const size_t max_tries = my_scheduler->num_threads(); // TODO: Tune this value
   for (size_t i = 0; i < max_tries; i++) {
     size_t target = (offset + i) % my_scheduler->num_threads();
     if (target == my_id) {
diff --git a/media/116cf4af_fft_vtune.png b/media/116cf4af_fft_vtune.png
new file mode 100644
index 0000000..01008cb
Binary files /dev/null and b/media/116cf4af_fft_vtune.png differ
diff --git a/media/116cf4af_heat_average.png b/media/116cf4af_heat_average.png
new file mode 100644
index 0000000..24f159b
Binary files /dev/null and b/media/116cf4af_heat_average.png differ
diff --git a/media/116cf4af_matrix_vtune.png b/media/116cf4af_matrix_vtune.png
new file mode 100644
index 0000000..8182121
Binary files /dev/null and b/media/116cf4af_matrix_vtune.png differ
--
libgit2 0.26.0