Add profiling information to the benchmarks and add the matrix_then_fft benchmark.

7a8f320b · FritzFlorian · 08bc7855 · 7a8f320b · 7a8f320b · 7a8f320b
Commit 7a8f320b authored Jul 22, 2020 by FritzFlorian
8 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ add_subdirectory(app/benchmark_matrix)
 add_subdirectory(app/benchmark_matrix_div_conquer)
 add_subdirectory(app/benchmark_fib)
 add_subdirectory(app/context_switch)
+add_subdirectory(app/benchmark_matrix_then_fft)

 # Add optional tests
 option(PACKAGE_TESTS "Build the tests" ON)

--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -43,6 +43,11 @@ int main(int argc, char **argv) {
  pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};

  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");
+    runner.add_custom_stats_field("T_inf");
+#endif
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();
@@ -53,6 +58,11 @@ int main(int argc, char **argv) {
      });
    }, [&]() {
      fft::fill_input(data); // Reset data before each run
+    }, [&]() {
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
    });
    runner.commit_results(true);
  } else {

--- a/app/benchmark_fib/main.cpp
+++ b/app/benchmark_fib/main.cpp
@@ -34,6 +34,11 @@ int main(int argc, char **argv) {

  volatile int res;
  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");
+    runner.add_custom_stats_field("T_inf");
+#endif
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();
@@ -42,6 +47,11 @@ int main(int argc, char **argv) {
      scheduler.perform_work([&]() {
        res = pls_fib(settings.size_);
      });
+    }, [&]() {}, [&]() {
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
    });
    runner.commit_results(true);
  } else {

--- a/app/benchmark_matrix_div_conquer/main.cpp
+++ b/app/benchmark_matrix_div_conquer/main.cpp
@@ -125,6 +125,7 @@ int main(int argc, char **argv) {
  // Fill data arrays as needed
  a.fill_default_data();
  b.fill_default_data();
+  result.fill_default_data();

  // Strain local data
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
@@ -158,6 +159,11 @@ int main(int argc, char **argv) {
  pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE};

  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");
+    runner.add_custom_stats_field("T_inf");
+#endif
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();
@@ -166,6 +172,11 @@ int main(int argc, char **argv) {
      scheduler.perform_work([&]() {
        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
      });
+    }, [&]() {}, [&]() {
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
    });
    runner.commit_results(true);
  } else {

--- a/app/benchmark_matrix_then_fft/CMakeLists.txt
+++ b/app/benchmark_matrix_then_fft/CMakeLists.txt
+add_executable(benchmark_matrix_then_fft_pls_v3 main.cpp)  # Benchmark executable built from this directory's main.cpp.
+target_link_libraries(benchmark_matrix_then_fft_pls_v3 pls benchmark_runner benchmark_base)  # Scheduler library, measurement runner and shared benchmark helpers.
+ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_then_fft_pls_v3)  # Make the benchmark.pls target (declared outside this file) depend on this executable.
+
+if (EASY_PROFILER)  # Link easy_profiler only when the EASY_PROFILER variable evaluates to true.
+    target_link_libraries(benchmark_matrix_then_fft_pls_v3 easy_profiler)
+endif ()
--- a/app/benchmark_matrix_then_fft/main.cpp
+++ b/app/benchmark_matrix_then_fft/main.cpp
+#include "pls/pls.h"
+
+#include "benchmark_runner.h"
+#include "benchmark_base/matrix.h"
+#include "benchmark_base/fft.h"
+
+using namespace comparison_benchmarks::base;
+
+void pls_conquer(fft::complex_vector::iterator data, fft::complex_vector::iterator swap_array, int n) {  // Divide / recurse on both halves / combine over the n elements starting at data.
+  if (n < 2) {  // Base case: fewer than two elements, nothing to do.
+    return;
+  }
+
+  fft::divide(data, swap_array, n);  // NOTE(review): swap_array looks like scratch space for the divide step — confirm in benchmark_base/fft.h.
+  if (n <= fft::RECURSIVE_CUTOFF) {  // At or below the cutoff: handle both halves through fft::conquer without spawning tasks.
+    fft::conquer(data, swap_array, n / 2);
+    fft::conquer(data + n / 2, swap_array + n / 2, n / 2);
+  } else {
+    pls::spawn([data, n, swap_array]() {  // First half as a spawned task; iterators and n are captured by value.
+      pls_conquer(data, swap_array, n / 2);
+    });
+    pls::spawn_and_sync([data, n, swap_array]() {  // Second half; presumably also waits for the first spawned half — confirm against the pls API.
+      pls_conquer(data + n / 2, swap_array + n / 2, n / 2);
+    });
+  }
+
+  fft::combine(data, n);  // Combine step over the full range of n elements.
+}
+
+constexpr int MAX_NUM_TASKS = 16;
+constexpr int MAX_STACK_SIZE = 4096 * 1;
+
+int main(int argc, char **argv) {  // Benchmark driver: a serial matrix multiplication followed by a parallel FFT, run in isolated or periodic mode.
+  auto settings = benchmark_runner::parse_parameters(argc, argv);
+  size_t matrix_size = settings.size_;  // NOTE(review): never read below; the matrices are sized from settings.size_ directly.
+  size_t fft_size = 8192;  // The FFT length is fixed here; only the matrix size comes from the settings.
+
+  string test_name = to_string(settings.num_threads_) + ".csv";  // Output file is named after the thread count.
+  string full_directory = settings.output_directory_ + "/PLS_v3/";
+  benchmark_runner runner{full_directory, test_name};
+
+  pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};
+
+  // Data Containers
+  fft::complex_vector fft_data(fft_size);
+  fft::complex_vector fft_swap_array(fft_size);
+  fft::fill_input(fft_data);
+  matrix::matrix<double> matrix_a{settings.size_};  // NOTE(review): matrix_a / matrix_b are not explicitly filled in this function — confirm the constructor initializes their contents.
+  matrix::matrix<double> matrix_b{settings.size_};
+  matrix::matrix<double> matrix_result{settings.size_};
+
+  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {  // Isolated mode: memory stats plus, when profiling is compiled in, T_1 / T_inf per run.
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");  // Custom stat fields that the third run_iterations callback below stores into.
+    runner.add_custom_stats_field("T_inf");
+#endif
+    printf("Running isolated measurement...\n");
+    runner.enable_memory_stats();
+    runner.pre_allocate_stats();
+
+    runner.run_iterations(settings.iterations_, [&]() {  // First callback: the workload itself.
+      // Serial Matrix Multiplication
+      matrix_result.multiply(matrix_a, matrix_b);
+      // Parallel FFT
+      scheduler.perform_work([&]() {
+        pls_conquer(fft_data.begin(), fft_swap_array.begin(), fft_size);  // fft_size (size_t) is converted to pls_conquer's int parameter.
+      });
+    }, [&]() {  // Second callback: restores the FFT input.
+      fft::fill_input(fft_data); // Reset data before each run
+    }, [&]() {  // Third callback: presumably invoked after each run — confirm in benchmark_runner.h.
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
+    });
+    runner.commit_results(true);
+  } else {  // Any other measurement type: periodic runs with wall-time stats.
+    printf("Running periodic measurement...\n");
+    runner.enable_wall_time_stats();
+    runner.pre_allocate_stats();
+
+    runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {  // Single callback: workload and input reset together.
+      // Serial Matrix Multiplication
+      matrix_result.multiply(matrix_a, matrix_b);
+      // Parallel FFT
+      scheduler.perform_work([&]() {
+        pls_conquer(fft_data.begin(), fft_swap_array.begin(), fft_size);
+      });
+      // Reset data before each run
+      fft::fill_input(fft_data);  // Unlike the isolated branch, the reset happens inside the same callback as the work.
+    });
+    runner.commit_results(true);
+  }
+
+  return 0;
+}
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -44,6 +44,11 @@ int main(int argc, char **argv) {
  pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};

  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
+#if PLS_PROFILING_ENABLED
+    scheduler.get_profiler().disable_memory_measure();
+    runner.add_custom_stats_field("T_1");
+    runner.add_custom_stats_field("T_inf");
+#endif
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();
@@ -55,6 +60,11 @@ int main(int argc, char **argv) {
                               unbalanced::Q,
                               unbalanced::NORMAL_CHILDREN);
      });
+    }, [&]() {}, [&]() {
+#if PLS_PROFILING_ENABLED
+      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
+      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
+#endif
    });
    runner.commit_results(true);
  } else {

--- a/extern/benchmark_runner/benchmark_runner.h
+++ b/extern/benchmark_runner/benchmark_runner.h
@@ -42,7 +42,10 @@ class benchmark_runner {
  unsigned long wall_time_pre_run_;
  unsigned long wall_time_post_run_;

-  map<string, vector<long>> custom_stats_;
+  const string WALL_TIME_ITERATION_START = "wall_time_iteration_start_us";
+  const string WALL_TIME_ITERATION_END = "wall_time_iteration_end_us";
+
+  map<string, vector<unsigned long>> custom_stats_;

  void print_statistics() {
    long time_sum = std::accumulate(times_.begin(), times_.end(), 0l);
@@ -119,9 +122,11 @@ class benchmark_runner {
    wall_time_enabled_ = true;
    add_custom_stats_field(WALL_TIME_PRE_RUN);
    add_custom_stats_field(WALL_TIME_POST_RUN);
+    add_custom_stats_field(WALL_TIME_ITERATION_START);
+    add_custom_stats_field(WALL_TIME_ITERATION_END);
  }

-  void pre_allocate_stats(size_t num = 100000) {
+  void pre_allocate_stats(size_t num = 500000) {
    times_.reserve(num);
    memset(times_.data(), 'a', num * sizeof(long));
    for (auto &iter : custom_stats_) {
@@ -187,8 +192,7 @@ class benchmark_runner {
    }
    if (wall_time_enabled_) {
      wall_time_pre_run_ =
-          std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::time_point_cast<std::chrono::microseconds>(
-              std::chrono::system_clock::now()).time_since_epoch()).count();
+          chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count();
    }

    last_start_time_ = chrono::steady_clock::now();
@@ -213,8 +217,8 @@ class benchmark_runner {
    }
    if (wall_time_enabled_) {
      wall_time_post_run_ =
-          std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::time_point_cast<std::chrono::microseconds>(
-              std::chrono::system_clock::now()).time_since_epoch()).count();
+          chrono::duration_cast<chrono::microseconds>(chrono::system_clock::now().time_since_epoch()).count();
+
      custom_stats_[WALL_TIME_PRE_RUN][iteration_index] = wall_time_pre_run_;
      custom_stats_[WALL_TIME_POST_RUN][iteration_index] = wall_time_post_run_;
    }
@@ -271,6 +275,7 @@ class benchmark_runner {
    add_to_timespec(iteration_start, period_seconds, period_nanoseconds);

    size_t current_iteration = 0;
+    size_t deadline_misses = 0;
    while (current_iteration < count) {
      // Sleep until the next iteration
      long sleep_error = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &iteration_start, nullptr);
@@ -295,19 +300,28 @@ class benchmark_runner {
      deadline_end = iteration_start;
      add_to_timespec(deadline_end, deadline_seconds, deadline_nanoseconds);

+      // Keep stats...
+      if (wall_time_enabled_) {
+        custom_stats_[WALL_TIME_ITERATION_START][current_iteration] =
+            iteration_start.tv_sec * 1000 * 1000 + iteration_start.tv_nsec / 1000;
+        custom_stats_[WALL_TIME_ITERATION_END][current_iteration] =
+            iteration_end.tv_sec * 1000 * 1000 + iteration_end.tv_nsec / 1000;
+      }
+
      // Store 'actual' wall time instead of iteration time (we want to include sleeping here!)
-      long wall_time_us = 0;
+      unsigned long wall_time_us = 0;
      wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l;
-      wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l;
-      printf("Difference: %ld\n", wall_time_us - times_[current_iteration]);
+      long nano_second_difference = ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l;
+      wall_time_us += nano_second_difference;
      times_[current_iteration] = wall_time_us;

      if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) {
-        printf("Deadline Miss!\n"); // TODO: Remove
+        deadline_misses++;
      }

      // Skip iterations if their start time is later than the current time (skipping)
-      while (finish_time.tv_sec >= iteration_end.tv_sec && finish_time.tv_nsec > iteration_end.tv_nsec) {
+      while (finish_time.tv_sec > iteration_end.tv_sec
+          || (finish_time.tv_sec == iteration_end.tv_sec && finish_time.tv_nsec > iteration_end.tv_nsec)) {
        iteration_start = iteration_end;

        iteration_end = iteration_start;
@@ -322,6 +336,8 @@ class benchmark_runner {
      current_iteration++;
      iteration_start = iteration_end;
    }
+
+    printf("%ld deadline misses!\n", deadline_misses);
  }

  void commit_results(bool print_stats) {