From 8668cad280bc6908ff7b0d998ef444bbd41ab88c Mon Sep 17 00:00:00 2001
From: FritzFlorian <flo.fritz@t-online.de>
Date: Mon, 25 Nov 2019 10:03:54 +0100
Subject: [PATCH] WIP: Add first performance tests of single-threaded execution.

We changed some of the memory-ordering constraints on the atomics in the lock-free deque and still need to verify that this is OK. If so, the single-threaded performance looks very good.
---
 app/benchmark_fft/main.cpp                                      | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
 cmake/SetupOptimizationLevel.cmake                              |  2 +-
 lib/pls/CMakeLists.txt                                          |  4 ++--
 lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h |  8 ++++----
 4 files changed, 85 insertions(+), 25 deletions(-)
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 80de92c..e890d9e 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -1,13 +1,16 @@
-#include <pls/pls.h>
-#include <pls/internal/helpers/profiler.h>
-#include <pls/internal/helpers/mini_benchmark.h>
+#include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/parallel_result.h"
+#include "pls/internal/scheduling/scheduler_memory.h"
+using namespace pls::internal::scheduling;
 
 #include <iostream>
 #include <complex>
 #include <vector>
+#include <atomic>
+
+std::atomic<unsigned long> count;
 
 static constexpr int CUTOFF = 16;
-static constexpr int NUM_ITERATIONS = 1000;
 static constexpr int INPUT_SIZE = 8192;
 typedef std::vector<std::complex<double>> complex_vector;
 
@@ -39,22 +42,38 @@ void combine(complex_vector::iterator data, int n) {
   }
 }
 
-void fft(complex_vector::iterator data, int n) {
+void fft_normal(complex_vector::iterator data, int n) {
   if (n < 2) {
+//    count++;
     return;
   }
 
   divide(data, n);
+  fft_normal(data, n / 2);
+  fft_normal(data + n / 2, n / 2);
+  combine(data, n);
+}
+
+parallel_result<short> fft(complex_vector::iterator data, int n) {
+  if (n < 2) {
+    return 0;
+  }
+
+  divide(data, n);
   if (n <= CUTOFF) {
-    fft(data, n / 2);
-    fft(data + n / 2, n / 2);
+    fft_normal(data, n / 2);
+    fft_normal(data + n / 2, n / 2);
+    return 0;
   } else {
-    pls::invoke(
-        [&] { fft(data, n / 2); },
-        [&] { fft(data + n / 2, n / 2); }
-    );
+    return scheduler::par([=]() {
+      return fft(data, n / 2);
+    }, [=]() {
+      return fft(data + n / 2, n / 2);
+    }).then([=](int, int) {
+      combine(data, n);
+      return 0;
+    });
   }
-  combine(data, n);
 }
 
 complex_vector prepare_input(int input_size) {
@@ -73,14 +92,55 @@ complex_vector prepare_input(int input_size) {
   return data;
 }
 
+static constexpr int NUM_ITERATIONS = 1000;
+constexpr size_t NUM_THREADS = 1;
+
+constexpr size_t NUM_TASKS = 64;
+constexpr size_t MAX_TASK_STACK_SIZE = 0;
+
+constexpr size_t NUM_CONTS = 64;
+constexpr size_t MAX_CONT_SIZE = 192;
+
 int main() {
-  PROFILE_ENABLE
   complex_vector initial_input = prepare_input(INPUT_SIZE);
 
-  pls::internal::helpers::run_mini_benchmark([&] {
-    complex_vector input = initial_input;
-    fft(input.begin(), input.size());
-  }, 7, 1000);
+  static_scheduler_memory<NUM_THREADS,
+                          NUM_TASKS,
+                          MAX_TASK_STACK_SIZE,
+                          NUM_CONTS,
+                          MAX_CONT_SIZE> static_scheduler_memory;
+
+  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+
+  count.store(0);
+  auto start = std::chrono::steady_clock::now();
+  for (int i = 0; i < NUM_ITERATIONS; i++) {
+    complex_vector input_2(initial_input);
+    scheduler.perform_work([&]() {
+      return scheduler::par([&]() {
+        return fft(input_2.begin(), INPUT_SIZE);
+      }, []() {
+        return parallel_result<int>{0};
+      }).then([](int, int) {
+        return 0;
+      });
+    });
+  }
+  auto end = std::chrono::steady_clock::now();
+  std::cout << "Count: " << count.load() << std::endl;
+  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+            << std::endl;
+
+  count.store(0);
+  start = std::chrono::steady_clock::now();
+  for (int i = 0; i < NUM_ITERATIONS; i++) {
+    complex_vector input_1(initial_input);
+    fft_normal(input_1.begin(), INPUT_SIZE);
+  }
+  end = std::chrono::steady_clock::now();
+  std::cout << "Count: " << count.load() << std::endl;
+  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+            << std::endl;
 
-  PROFILE_SAVE("test_profile.prof")
+  return 0;
 }
diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake
index 05c18cc..f2f66ec 100644
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # but inlining functions and SIMD/Vectorization is
     # only enabled by -O3, thus it's way faster in some
     # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
     set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index 12e7e97..9e9dd85 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -34,7 +34,7 @@ add_library(pls STATIC
         include/pls/internal/base/barrier.h src/internal/base/barrier.cpp
         include/pls/internal/base/system_details.h
         include/pls/internal/base/error_handling.h
-        include/pls/internal/base/alignment.h
+        include/pls/internal/base/alignment.h src/internal/base/alignment.cpp
 
         include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
         include/pls/internal/data_structures/aligned_stack_impl.h
@@ -58,7 +58,7 @@ add_library(pls STATIC
         include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
         include/pls/internal/scheduling/cont_manager.h
         include/pls/internal/scheduling/continuation.h
-        include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h src/internal/base/alignment.cpp src/internal/base/alignment.h)
+        include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
diff --git a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
index ccc6bfa..7a37f7b 100644
--- a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
+++ b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h
@@ -36,11 +36,11 @@ class bounded_ws_deque {
   void push_bottom(T item) {
     item_array_[bottom_] = item;
     local_bottom_++;
-    bottom_.store(local_bottom_);
+    bottom_.store(local_bottom_, std::memory_order_release);
   }
 
   bool is_empty() {
-    return top_.load().value < bottom_;
+    return top_.load().value < bottom_.load();
   }
 
   optional<T> pop_top() {
@@ -66,11 +66,11 @@ class bounded_ws_deque {
     }
 
     local_bottom_--;
-    bottom_.store(local_bottom_);
+    bottom_.store(local_bottom_, std::memory_order_seq_cst);
 
     optional<T> result(item_array_[local_bottom_]);
 
-    stamped_integer old_top = top_.load();
+    stamped_integer old_top = top_.load(std::memory_order_acquire);
     if (local_bottom_ > old_top.value) {
       // Enough distance to just return the value
       return result;
--
libgit2 0.26.0