Commit 64e2238c by FritzFlorian

Add further performance notes from vtune and 2D heat diffusion.

parent 9ab06d6f
Pipeline #1233 passed with stages in 3 minutes 50 seconds
@@ -242,3 +242,42 @@ case of FFT).
We also want to note that these measurements are not very
controlled/scientific; they were simply run on our notebook to allow
fast iteration over different potential issues with our scheduler.
### Commit 116cf4af - VTune Amplifier and MRSW top level lock
To understand why our code works quite well on problems where workers
are mostly busy, but not so well when workers spend their time
spinning/waiting (as in the FFT), we took a closer look at the FFT and
the matrix multiplication in VTune.
FFT:
<img src="media/116cf4af_fft_vtune.png" width="400"/>
Matrix:
<img src="media/116cf4af_matrix_vtune.png" width="400"/>
The sections highlighted in red mark time spent spinning in the
work-stealing loop.
We can see that as long as our workers are mainly busy and find work
quickly in the stealing loop, the spinning overhead is minimal.
In the FFT, however, a considerable amount of time is
spent spinning.
A general observation is the high CPI rate of our spinning code.
This makes sense, as our locks currently operate on shared
atomic variables, which leads to cache misses.
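
To make the cache-line effect concrete, here is a minimal sketch of the pattern
(our own illustration, not scheduler code; all names are made up): several
threads spin on one shared atomic flag, and every failed test_and_set still
writes to the shared cache line, producing the cache-miss-heavy, high-CPI
spinning that VTune highlights in red above.

```cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Several threads spin on one shared atomic flag. Every failed
// test_and_set still writes to the shared cache line, so the line
// bounces between cores, which drives up cache misses and CPI.
std::atomic_flag lock_flag = ATOMIC_FLAG_INIT;
std::atomic<long> shared_counter{0};

void spinning_worker(int iterations) {
  for (int i = 0; i < iterations; i++) {
    while (lock_flag.test_and_set(std::memory_order_acquire)) {
      // busy spin: every retry contends on the same cache line
    }
    shared_counter.fetch_add(1, std::memory_order_relaxed); // tiny critical section
    lock_flag.clear(std::memory_order_release);
  }
}

int main() {
  std::vector<std::thread> threads;
  for (int t = 0; t < 8; t++) {
    threads.emplace_back(spinning_worker, 100000);
  }
  for (auto &thread : threads) {
    thread.join();
  }
  std::printf("%ld\n", shared_counter.load());
  return 0;
}
```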
### Commit 116cf4af - 2D Heat Diffusion
As a last test of our current performance we implemented the
2D heat diffusion benchmark using our framework (fork-join based
parallel_for, heat array size 512):
<img src="media/116cf4af_heat_average.png" width="400"/>
We observe solid performance from our implementation.
(Again, not a very scientific test environment, but good enough to
indicate our general direction.)
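
For reference, a minimal sequential sketch of the kernel behind this benchmark
(our own simplification, not the benchmark source): one explicit heat diffusion
step on a 512x512 grid. In the framework version, the outer row loop is the
part that the fork-join based parallel_for splits across workers.

```cpp
#include <cstddef>
#include <utility>
#include <vector>

// One 2D heat diffusion step on an N x N grid, written sequentially
// for clarity. The outer row loop is the candidate for parallel_for.
constexpr std::size_t N = 512;  // heat array size used in the benchmark
constexpr double alpha = 0.1;   // diffusion factor, stable for alpha <= 0.25

void diffusion_step(const std::vector<double> &current, std::vector<double> &next) {
  for (std::size_t y = 1; y < N - 1; y++) {    // <- split across workers via parallel_for
    for (std::size_t x = 1; x < N - 1; x++) {
      const double center = current[y * N + x];
      const double laplacian = current[(y - 1) * N + x] + current[(y + 1) * N + x]
                             + current[y * N + x - 1] + current[y * N + x + 1]
                             - 4.0 * center;
      next[y * N + x] = center + alpha * laplacian;
    }
  }
}

int main() {
  std::vector<double> a(N * N, 0.0), b(N * N, 0.0);
  a[(N / 2) * N + N / 2] = 100.0;              // single heat source in the middle
  for (int step = 0; step < 100; step++) {
    diffusion_step(a, b);
    std::swap(a, b);
  }
  return 0;
}
```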
@@ -71,7 +71,7 @@ int main() {
//
// scheduler.perform_work([&] {
// PROFILE_MAIN_THREAD
-// for (int i = 0; i < 10; i++) {
+// for (int i = 0; i < 5000; i++) {
// PROFILE_WORK_BLOCK("Top Level")
// result.multiply(a, b);
// }
@@ -79,4 +79,4 @@ int main() {
//
// PROFILE_SAVE("test_profile.prof")
//}
//
@@ -50,29 +50,29 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
return result;
}
//
//int main() {
// PROFILE_ENABLE
// pls::internal::helpers::run_mini_benchmark([&] {
// unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
// }, 8, 4000);
//
// PROFILE_SAVE("test_profile.prof")
//}
int main() {
  PROFILE_ENABLE
  pls::internal::helpers::run_mini_benchmark([&] {
    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
  }, 8, 4000);
  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
  pls::scheduler scheduler{&my_scheduler_memory, 8};
  scheduler.perform_work([&] {
    PROFILE_MAIN_THREAD
    for (int i = 0; i < 50; i++) {
      PROFILE_WORK_BLOCK("Top Level")
      int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
      std::cout << result << std::endl;
    }
  });
  PROFILE_SAVE("test_profile.prof")
}
//int main() {
// PROFILE_ENABLE
// pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
// pls::scheduler scheduler{&my_scheduler_memory, 8};
//
// scheduler.perform_work([&] {
// PROFILE_MAIN_THREAD
// for (int i = 0; i < 10; i++) {
// PROFILE_WORK_BLOCK("Top Level")
// int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
// std::cout << result << std::endl;
// }
// });
//
// PROFILE_SAVE("test_profile.prof")
//}
@@ -91,7 +91,7 @@ int main() {
PROFILE_MAIN_THREAD
// Call looks just the same, only requirement is
// the enclosure in the perform_work lambda.
-for (int i = 0; i < 10; i++) {
+for (int i = 0; i < 1000; i++) {
PROFILE_WORK_BLOCK("Top Level FFT")
complex_vector input = initial_input;
fft(input.begin(), input.size());
@@ -18,7 +18,7 @@ bool abstract_task::steal_work() {
const size_t my_id = my_state->id_;
const size_t offset = my_state->random_() % my_scheduler->num_threads();
-const size_t max_tries = my_scheduler->num_threads() - 1; // TODO: Tune this value
+const size_t max_tries = my_scheduler->num_threads(); // TODO: Tune this value
for (size_t i = 0; i < max_tries; i++) {
size_t target = (offset + i) % my_scheduler->num_threads();
if (target == my_id) {