Add two 'standardized' benchmarks.

79ac0243 · FritzFlorian · 2f539691 · 79ac0243 · 79ac0243 · 79ac0243
Commit 79ac0243 authored Dec 20, 2019 by FritzFlorian
26 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,9 @@ list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Each library has an own CMakeLists.txt that should make it avaliabale as a library target,
 # thus allowing one to include it as any cmake dependency later on.
 add_subdirectory(extern/catch2)
+add_subdirectory(extern/picosha2)
+add_subdirectory(extern/benchmark_base)
+add_subdirectory(extern/benchmark_runner)

 # Include all internal subprojects (library, examples, testing).
 add_subdirectory(lib/pls)

--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
-add_executable(benchmark_fft main.cpp)
-target_link_libraries(benchmark_fft pls)
-if(EASY_PROFILER)
-    target_link_libraries(benchmark_fft easy_profiler)
-endif()
+add_executable(benchmark_fft_pls_v2 main.cpp)
+target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
+if (EASY_PROFILER)
+    target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
+endif ()
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -2,142 +2,91 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/internal/helpers/profiler.h"
+
 using namespace pls::internal::scheduling;

 #include <iostream>
 #include <complex>
 #include <vector>
-#include <atomic>
-
-static constexpr int CUTOFF = 16;
-static constexpr int INPUT_SIZE = 16384;
-typedef std::vector<std::complex<double>> complex_vector;
-
-void divide(complex_vector::iterator data, int n) {
-  complex_vector tmp_odd_elements(n / 2);
-  for (int i = 0; i < n / 2; i++) {
-    tmp_odd_elements[i] = data[i * 2 + 1];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i] = data[i * 2];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i + n / 2] = tmp_odd_elements[i];
-  }
-}

-void combine(complex_vector::iterator data, int n) {
-  for (int i = 0; i < n / 2; i++) {
-    std::complex<double> even = data[i];
-    std::complex<double> odd = data[i + n / 2];
-
-    // w is the "twiddle-factor".
-    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
-    // so it won't impact the performance comparison.
-    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
-
-    data[i] = even + w * odd;
-    data[i + n / 2] = even - w * odd;
-  }
-}
-
-void fft_normal(complex_vector::iterator data, int n) {
-  if (n < 2) {
-    return;
-  }
+#include "benchmark_runner.h"
+#include "benchmark_base/fft.h"

-  divide(data, n);
-  fft_normal(data, n / 2);
-  fft_normal(data + n / 2, n / 2);
-  combine(data, n);
-}
+using namespace comparison_benchmarks::base;

-parallel_result<short> fft(complex_vector::iterator data, int n) {
+parallel_result<short> conquer(fft::complex_vector::iterator data, int n) {
  if (n < 2) {
    return parallel_result<short>{0};
  }

-  divide(data, n);
-  if (n <= CUTOFF) {
-    fft_normal(data, n / 2);
-    fft_normal(data + n / 2, n / 2);
-    combine(data, n);
+  fft::divide(data, n);
+  if (n <= fft::RECURSIVE_CUTOFF) {
+    fft::conquer(data, n / 2);
+    fft::conquer(data + n / 2, n / 2);
+    fft::combine(data, n);
    return parallel_result<short>{0};
  } else {
    return scheduler::par([=]() {
-      return fft(data, n / 2);
+      return conquer(data, n / 2);
    }, [=]() {
-      return fft(data + n / 2, n / 2);
+      return conquer(data + n / 2, n / 2);
    }).then([=](int, int) {
-      combine(data, n);
+      fft::combine(data, n);
      return parallel_result<short>{0};
    });
  }
 }

-complex_vector prepare_input(int input_size) {
-  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
-  complex_vector data(input_size);
-
-  // Set our input data to match a time series of the known_frequencies.
-  // When applying fft to this time-series we should find these frequencies.
-  for (int i = 0; i < input_size; i++) {
-    data[i] = std::complex<double>(0.0, 0.0);
-    for (auto frequencie : known_frequencies) {
-      data[i] += sin(2 * M_PI * frequencie * i / input_size);
-    }
-  }
-
-  return data;
-}
+constexpr int MAX_NUM_THREADS = 8;
+constexpr int MAX_NUM_TASKS = 64;
+constexpr int MAX_NUM_CONTS = 64;
+constexpr int MAX_CONT_SIZE = 256;

-static constexpr int NUM_ITERATIONS = 500;
-constexpr size_t NUM_THREADS = 2;
+int main(int argc, char **argv) {
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);

-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};

-constexpr size_t NUM_CONTS = 128;
-constexpr size_t MAX_CONT_SIZE = 512;
+  fft::complex_vector data = fft::generate_input();

-int main() {
-  PROFILE_ENABLE;
-  complex_vector initial_input = prepare_input(INPUT_SIZE);
-
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;

-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};

-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    complex_vector input_2(initial_input);
+  for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
      return scheduler::par([&]() {
-        return fft(input_2.begin(), INPUT_SIZE);
+        return conquer(data.begin(), fft::SIZE);
      }, []() {
-        return parallel_result<int>{0};
-      }).then([](int, int) {
+        return parallel_result<short>{0};
+      }).then([&](short, short) {
        return parallel_result<int>{0};
      });
    });
-    PROFILE_LOCK("DONE");
  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-  PROFILE_SAVE("test_profile.prof");
-
-  start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    complex_vector input_1(initial_input);
-    fft_normal(input_1.begin(), INPUT_SIZE);
+
+  for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
+    scheduler.perform_work([&]() {
+      runner.start_iteration();
+
+      return scheduler::par([&]() {
+        return conquer(data.begin(), fft::SIZE);
+      }, []() {
+        return parallel_result<short>{0};
+      }).then([&](short, short) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
  }
-  end = std::chrono::steady_clock::now();
-  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
+  runner.commit_results(true);

  return 0;
 }
--- a/app/benchmark_matrix/CMakeLists.txt
+++ b/app/benchmark_matrix/CMakeLists.txt
-add_executable(benchmark_matrix main.cpp)
-target_link_libraries(benchmark_matrix pls)
+add_executable(benchmark_matrix_pls_v2 main.cpp)
+target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base)
 if (EASY_PROFILER)
-    target_link_libraries(benchmark_matrix easy_profiler)
+    target_link_libraries(benchmark_matrix_pls_v2 easy_profiler)
 endif ()
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -2,112 +2,78 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/algorithms/for_each.h"
+
 using namespace pls::internal::scheduling;

-#include <chrono>
+#include "benchmark_runner.h"
+#include "benchmark_base/matrix.h"

-const int MATRIX_SIZE = 128;
+using namespace comparison_benchmarks::base;

 template<typename T, int SIZE>
-class matrix {
+class pls_matrix : public matrix::matrix<T, SIZE> {
 public:
-  T data[SIZE][SIZE];
-
-  explicit matrix(T i = 1) {
-    std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i);
-  }
+  pls_matrix() : matrix::matrix<T, SIZE>() {}

-  parallel_result<int> multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
-    return pls::algorithm::for_each_range(0, SIZE, [&](int i) {
+  parallel_result<int> pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
+    return pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
      this->multiply_column(i, a, b);
    });
  }
-
- private:
-  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
-    for (int j = 0; j < SIZE; ++j) {
-      data[i][j] = 0;
-    }
-    for (int k = 0; k < SIZE; ++k) {
-      for (int j = 0; j < SIZE; ++j) {
-        data[i][j] += a.data[i][k] * b.data[k][j];
-      }
-    }
-  }
 };

-void fill_with_data(matrix<double, MATRIX_SIZE> &a, matrix<double, MATRIX_SIZE> &b) {
-  // Fill in some data...
-  for (int i = 0; i < MATRIX_SIZE; i++) {
-    for (int j = 0; j < MATRIX_SIZE; j++) {
-      a.data[i][j] = i;
-      b.data[i][j] = j;
-    }
-  }
-}
+constexpr size_t MAX_NUM_THREADS = 8;
+constexpr size_t MAX_NUM_TASKS = 32;
+constexpr size_t MAX_NUM_CONTS = 32;
+constexpr size_t MAX_CONT_SIZE = 512;

-static constexpr int NUM_ITERATIONS = 1000;
-constexpr size_t NUM_THREADS = 3;
+int main(int argc, char **argv) {
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);

-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};

-constexpr size_t NUM_CONTS = 128;
-constexpr size_t MAX_CONT_SIZE = 512;
+  pls_matrix<double, matrix::MATRIX_SIZE> a;
+  pls_matrix<double, matrix::MATRIX_SIZE> b;
+  pls_matrix<double, matrix::MATRIX_SIZE> result;

-int main() {
-  PROFILE_ENABLE
-  matrix<double, MATRIX_SIZE> a;
-  matrix<double, MATRIX_SIZE> b;
-  matrix<double, MATRIX_SIZE> result;
-  fill_with_data(a, b);
-
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;

-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
+
+  for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {

-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
      return scheduler::par([&]() {
-        return result.multiply(a, b);
+        return result.pls_multiply(a, b);
      }, []() {
        return parallel_result<int>{0};
-      }).then([](int, int) {
+      }).then([&](int, int) {
        return parallel_result<int>{0};
      });
    });
  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-}

-//int main() {
-//  PROFILE_ENABLE
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
-//  pls::scheduler scheduler{&my_scheduler_memory, 4};
-//
-//  matrix<double, MATRIX_SIZE> a;
-//  matrix<double, MATRIX_SIZE> b;
-//  matrix<double, MATRIX_SIZE> result;
-//  fill_with_data(a, b);
-//
-//  scheduler.perform_work([&] {
-//    auto start_time = std::chrono::high_resolution_clock::now();
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10000; i++) {
-//      PROFILE_WORK_BLOCK("Top Level")
-//      result.multiply(a, b);
-//    }
-//    auto end_time = std::chrono::high_resolution_clock::now();
-//    long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-//    std::cout << "Runtime: " << time << "us" << std::endl;
-//  });
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
+  for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
+    scheduler.perform_work([&]() {
+      runner.start_iteration();

+      return scheduler::par([&]() {
+        return result.pls_multiply(a, b);
+      }, []() {
+        return parallel_result<int>{0};
+      }).then([&](int, int) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
+  }
+  runner.commit_results(true);
+
+}
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -51,22 +51,22 @@ parallel_result<int> unbalanced_tree_search(int seed, int root_children, double 
  return result;
 }

-constexpr size_t NUM_THREADS = 5;
+constexpr size_t MAX_NUM_THREADS = 5;

-constexpr size_t NUM_TASKS = 128;
+constexpr size_t MAX_NUM_TASKS = 128;

-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 512;

 volatile int result;
 int main() {
  PROFILE_ENABLE
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;

-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};

  scheduler.perform_work([&]() {
    return scheduler::par([&]() {

--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -8,12 +8,12 @@

 using namespace pls::internal;

-constexpr size_t NUM_THREADS = 4;
+constexpr size_t MAX_NUM_THREADS = 1;

-constexpr size_t NUM_TASKS = 128;
-static constexpr int NUM_ITERATIONS = 100;
+constexpr size_t MAX_NUM_TASKS = 128;
+static constexpr int NUM_ITERATIONS = 10;

-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 256;

 int fib_normal(int n) {
@@ -29,8 +29,13 @@ int fib_normal(int n) {
 }

 scheduling::parallel_result<int> fib(int n) {
-  if (n <= 10) {
-    return fib_normal(n);
+  pls::variable<int> i;
+  pls::array<int> a{10};
+  if (n == 0) {
+    return 0;
+  }
+  if (n == 1) {
+    return 1;
  }

  return scheduling::scheduler::par([=]() {
@@ -45,12 +50,12 @@ scheduling::parallel_result<int> fib(int n) {
 static volatile int result;
 int main() {
  PROFILE_ENABLE;
-  scheduling::static_scheduler_memory<NUM_THREADS,
-                                      NUM_TASKS,
-                                      NUM_CONTS,
+  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
+                                      MAX_NUM_TASKS,
+                                      MAX_NUM_CONTS,
                                      MAX_CONT_SIZE> static_scheduler_memory;

-  scheduling::scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};

  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < NUM_ITERATIONS; i++) {

--- a/extern/benchmark_base/CMakeLists.txt
+++ b/extern/benchmark_base/CMakeLists.txt
+# Configuration and common algorithm pieces for benchmarks
+configure_file(src/sample_images.cpp.in sample_images.cpp)
+
+add_library(benchmark_base STATIC
+        ${CMAKE_CURRENT_BINARY_DIR}/sample_images.cpp
+        src/fft.cpp include/benchmark_base/fft.h
+        include/benchmark_base/heat.h
+        include/benchmark_base/matrix.h
+        include/benchmark_base/unbalanced.h src/unbalanced.cpp
+        include/benchmark_base/range.h)
+
+target_include_directories(benchmark_base
+        PUBLIC
+        $<INSTALL_INTERFACE:include>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+        )
+
+target_link_libraries(benchmark_base picosha2)
--- a/extern/benchmark_base/include/benchmark_base/.gitkeep
+++ b/extern/benchmark_base/include/benchmark_base/.gitkeep
--- a/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
+++ b/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/extern/benchmark_base/include/benchmark_base/fft.h
+++ b/extern/benchmark_base/include/benchmark_base/fft.h
+#ifndef COMPARISON_BENCHMARKS_BASE_FFT_H
+#define COMPARISON_BENCHMARKS_BASE_FFT_H
+
+#include <complex>
+#include <string>
+#include <vector>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+
+const int SIZE = 8192;
+const int NUM_ITERATIONS = 1000;
+const int NUM_WARMUP_ITERATIONS = 100;
+
+const int RECURSIVE_CUTOFF = 32;
+typedef std::vector<std::complex<double>> complex_vector;
+
+complex_vector generate_input();
+
+void divide(complex_vector::iterator data, int n);
+void conquer(complex_vector::iterator data, int n);
+void combine(complex_vector::iterator data, int n);
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_FFT_H
--- a/extern/benchmark_base/include/benchmark_base/heat.h
+++ b/extern/benchmark_base/include/benchmark_base/heat.h
+
+#ifndef COMPARISON_BENCHMARKS_BASE_HEAT_H
+#define COMPARISON_BENCHMARKS_BASE_HEAT_H
+
+#include <array>
+#include <iostream>
+#include <memory>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace heat {
+
+const int DIFFUSION_SIZE = 256;
+const int DIFFUSION_STEPS = 256;
+
+const int NUM_ITERATIONS = 100;
+const int WARMUP_ITERATIONS = 20;
+
+/**
+ * Serial reference implementation of a 2D heat diffusion (stencil) simulation.
+ *
+ * The grid holds SIZE x SIZE interior cells surrounded by a one-cell border. Only the
+ * interior cells are updated by run_simulation(). Two grids are kept and swapped after
+ * every time step, so a step only reads values produced by the previous step.
+ *
+ * The class owns both grids through raw pointers. It is therefore non-copyable
+ * (a copy would delete the same grids twice). The destructor is virtual because
+ * run_simulation() is virtual and instances may be destroyed through a base pointer.
+ */
+template<typename T, int SIZE>
+class heat_diffusion {
+  // Center portion is SIZExSIZE, borders are fixed temperature values
+  using matrix = std::array<std::array<T, SIZE + 2>, SIZE + 2>;
+
+ protected:
+  // Sane default values for the simulation (from the paper).
+  // This is not about perfect simulation results but the speedup of the workload.
+  double c = 0.1;
+  double d_s = 1.0 / (SIZE + 1);
+  double d_t = (d_s * d_s) / (4 * c);
+
+ public:
+  // Grid read during the current time step / grid written during the current time step.
+  matrix *current_data;
+  matrix *next_data;
+
+  // Allocates both grids and initializes them via reset_data().
+  explicit heat_diffusion() : current_data{new matrix}, next_data{new matrix} {
+    reset_data();
+  }
+
+  virtual ~heat_diffusion() {
+    delete current_data;
+    delete next_data;
+  }
+
+  // Owning raw pointers: copying would lead to a double delete.
+  heat_diffusion(const heat_diffusion &) = delete;
+  heat_diffusion &operator=(const heat_diffusion &) = delete;
+
+  /**
+   * Runs n time steps. Each step updates every interior cell and then swaps the grids.
+   *
+   * @param n Number of time steps to simulate.
+   */
+  virtual void run_simulation(int n) {
+    for (int i = 0; i < n; i++) {
+      for (int row = 1; row <= SIZE; row++) {
+        for (int column = 1; column <= SIZE; column++) {
+          update_element(row, column);
+        }
+      }
+
+      // Synchronization point needed to coordinate the calculation!
+      swap_data_arrays();
+    }
+  }
+
+ protected:
+  // Computes the next value of one interior cell from the cell itself and its four
+  // direct neighbours in current_data and stores it in next_data.
+  void update_element(int row, int column) {
+    (*next_data)[row][column] = (*current_data)[row][column] + ((c * d_t) / (d_s * d_s)) *
+        ((*current_data)[row + 1][column] + (*current_data)[row - 1][column]
+            - 4 * (*current_data)[row][column]
+            + (*current_data)[row][column + 1] + (*current_data)[row][column - 1]);
+  }
+
+  // Makes the freshly written grid the current one (pointer swap, no copying).
+  void swap_data_arrays() {
+    matrix *tmp = current_data;
+    current_data = next_data;
+    next_data = tmp;
+  }
+
+  // Sets both grids to their initial state: everything 0.0 except the hot border rows.
+  void reset_data() {
+    for (int row = 0; row < SIZE + 2; row++) {
+      for (int column = 0; column < SIZE + 2; column++) {
+        (*current_data)[row][column] = 0.0;
+        (*next_data)[row][column] = 0.0;
+
+        // The top and bottom border rows are a fixed, hot temperature
+        // (the left and right border columns stay at 0.0).
+        if (row == 0 || row == SIZE + 1) {
+          (*current_data)[row][column] = 1.0;
+          (*next_data)[row][column] = 1.0;
+        }
+      }
+    }
+  }
+};
+
+/**
+ * Prints a coarse ASCII rendering of the current grid (border included).
+ * Every cell becomes one character chosen by its temperature:
+ * ' ' (< 0.1), '-' (< 0.2), '=' (< 0.5) and '#' otherwise. Cells are separated by tabs.
+ *
+ * @param strm Stream to write to.
+ * @param simulation Simulation whose current_data grid is printed.
+ * @return strm, to allow chaining.
+ */
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const heat_diffusion<T, SIZE> &simulation) {
+  // current_data is a pointer to the grid, so it has to be dereferenced before indexing.
+  // Indexing the pointer itself would address whole grids instead of rows.
+  const auto &grid = *simulation.current_data;
+
+  for (int i = 0; i < SIZE + 2; i++) {
+    for (int j = 0; j < SIZE + 2; j++) {
+      const auto temperature = grid[i][j];
+
+      // 'color' our output according to temperature
+      char out;
+      if (temperature < 0.1) {
+        out = ' ';
+      } else if (temperature < 0.2) {
+        out = '-';
+      } else if (temperature < 0.5) {
+        out = '=';
+      } else {
+        out = '#';
+      }
+
+      strm << out << "\t";
+    }
+    strm << std::endl;
+  }
+
+  return strm;
+}
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_HEAT_H
--- a/extern/benchmark_base/include/benchmark_base/matrix.h
+++ b/extern/benchmark_base/include/benchmark_base/matrix.h
+
+#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_H
+#define COMPARISON_BENCHMARKS_BASE_MATRIX_H
+
+#include <algorithm>
+#include <iostream>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace matrix {
+
+const int MATRIX_SIZE = 128;
+
+const int NUM_ITERATIONS = 5000;
+const int WARMUP_ITERATIONS = 1000;
+
+/**
+ * Square SIZE x SIZE matrix with a serial reference multiplication.
+ *
+ * multiply() is virtual so that subclasses can replace it. The destructor is virtual
+ * as well, so that instances can safely be destroyed through a base pointer.
+ */
+template<typename T, int SIZE>
+class matrix {
+ public:
+  T data[SIZE][SIZE];
+
+  // Fills every row i with the value i.
+  // The bounds use the template parameter SIZE (not the namespace constant MATRIX_SIZE),
+  // so that instantiations with any SIZE stay inside the data array.
+  explicit matrix() {
+    for (int i = 0; i < SIZE; i++) {
+      for (int j = 0; j < SIZE; j++) {
+        data[i][j] = i;
+      }
+    }
+  }
+
+  virtual ~matrix() = default;
+
+  /**
+   * Stores the product a * b in this matrix.
+   *
+   * @param a Left operand.
+   * @param b Right operand.
+   */
+  virtual void multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
+    for (int i = 0; i < SIZE; i++) {
+      multiply_column(i, a, b);
+    }
+  }
+
+ protected:
+  // Computes data[i][j] for every j, i.e. (despite the name) row i of the result.
+  // Different values of i write disjoint parts of data.
+  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
+    for (int j = 0; j < SIZE; ++j) {
+      data[i][j] = 0;
+    }
+    for (int k = 0; k < SIZE; ++k) {
+      for (int j = 0; j < SIZE; ++j) {
+        T a_data = a.data[i][k];
+        T b_data = b.data[k][j];
+        data[i][j] += a_data * b_data;
+      }
+    }
+  }
+};
+
+// Writes the matrix row by row: every element is followed by a tab,
+// every row is terminated (and flushed) with std::endl.
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const matrix<T, SIZE> &matrix) {
+  for (const auto &row : matrix.data) {
+    for (const auto &element : row) {
+      strm << element << "\t";
+    }
+    strm << std::endl;
+  }
+  return strm;
+}
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_MATRIX_H
--- a/extern/benchmark_base/include/benchmark_base/range.h
+++ b/extern/benchmark_base/include/benchmark_base/range.h
--- a/extern/benchmark_base/include/benchmark_base/unbalanced.h
+++ b/extern/benchmark_base/include/benchmark_base/unbalanced.h
+
+#ifndef COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+#define COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+
+#include <cstdint>
+#include <array>
+#include <vector>
+
+#include "picosha2.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+
+const int SEED = 42;
+const int ROOT_CHILDREN = 140;
+const double Q = 0.124875;
+const int NORMAL_CHILDREN = 8;
+
+const int NUM_NODES = 71069;
+
+const int NUM_ITERATIONS = 50;
+const int WARMUP_ITERATIONS = 5;
+
+using node_state = std::array<uint8_t, 20>;
+
+/**
+ * Node of an unbalanced binomial tree (https://www.cs.unc.edu/~olivier/LCPC06.pdf).
+ * To build up the tree, recursively call spawn_child_node on each node until leaves are reached.
+ * The tree is not built up directly in memory, but rather by the recursive calls.
+ *
+ * A non-root node has either b children (when its state-derived random value is below q)
+ * or no children at all. The root node uses root_children as its child count when that
+ * value is positive.
+ */
+class node {
+  // The state is used to allow a deterministic tree construction using sha256 hashes.
+  node_state state_;
+
+  // Number of children of the current node (set once by init_num_children)
+  int num_children_;
+
+  // Set this to a positive number for the root node to start the tree with a specific size
+  int root_children_;
+
+  // General branching factors: probability threshold q_ and child count b_ of inner nodes
+  double q_;
+  int b_;
+
+  // Private constructor for children (root_children_ = -1 marks them as non-root)
+  node(node_state state, double q, int b) : state_{state},
+                                            num_children_{0},
+                                            root_children_{-1},
+                                            q_{q},
+                                            b_{b} { init_num_children(); }
+
+  std::array<uint8_t, 20> generate_child_state(uint32_t index);
+  double get_state_random();
+  void init_num_children() {
+    double state_random = get_state_random();
+    if (root_children_ > 0) {
+      num_children_ = root_children_; // Root always spawns children
+    } else if (state_random < q_) {
+      num_children_ = b_;
+    } else {
+      num_children_ = 0;
+    }
+  }
+
+ public:
+  node(uint32_t seed, int root_children, double q, int b)
+      : state_({{}}), num_children_{0}, root_children_{root_children}, q_{q}, b_{b} {
+
+    for (int i = 0; i < 16; i++) {
+      state_[i] = 0;
+    }
+    state_[16] = static_cast<uint8_t>(0xFFu & (seed >> 24u)); // seed is stored big-endian in the last four bytes
+    state_[17] = static_cast<uint8_t>(0xFFu & (seed >> 16u));
+    state_[18] = static_cast<uint8_t>(0xFFu & (seed >> 8u));
+    state_[19] = static_cast<uint8_t>(0xFFu & (seed >> 0u));
+
+    picosha2::hash256_one_by_one hasher;
+    hasher.process(state_.begin(), state_.end());
+    hasher.finish();
+    hasher.get_hash_bytes(state_.begin(), state_.end()); // NOTE(review): state_ holds 20 bytes; presumably picosha2 stops at the end iterator - confirm
+
+    init_num_children();
+  }
+
+  int get_num_children() const { return num_children_; }
+
+  node spawn_child_node(int index) { // child inherits q_ and b_; its state is derived from this state and index
+    return {generate_child_state(index), q_, b_};
+  }
+};
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
--- a/extern/benchmark_base/src/.gitkeep
+++ b/extern/benchmark_base/src/.gitkeep
--- a/extern/benchmark_base/src/fft.cpp
+++ b/extern/benchmark_base/src/fft.cpp
+#include "benchmark_base/fft.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+
+// Builds the benchmark input: SIZE samples of a signal that is the sum of sine waves
+// with a few fixed frequencies. Only the real parts are set, imaginary parts stay 0.
+// Applying the fft to this time series should reveal exactly these frequencies.
+complex_vector generate_input() {
+  const std::vector<double> frequencies{2, 11, 52, 88, 256};
+
+  fft::complex_vector samples(SIZE);
+  for (int index = 0; index < SIZE; ++index) {
+    std::complex<double> sample{0.0, 0.0};
+    for (const double frequency : frequencies) {
+      sample += sin(2 * M_PI * frequency * index / SIZE);
+    }
+    samples[index] = sample;
+  }
+
+  return samples;
+}
+
+// Reorders the n elements starting at data so that the elements with even index come
+// first (in order), followed by the elements with odd index (in order).
+void divide(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+  complex_vector odd_elements(half);
+
+  // Compacting the even elements only overwrites positions that were already read.
+  for (int k = 0; k < half; ++k) {
+    odd_elements[k] = data[2 * k + 1];
+    data[k] = data[2 * k];
+  }
+
+  for (int k = 0; k < half; ++k) {
+    data[half + k] = odd_elements[k];
+  }
+}
+
+// Butterfly step: merges the transformed even half (first n / 2 elements) and the
+// transformed odd half (last n / 2 elements) into the transform of all n elements.
+void combine(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+
+  for (int k = 0; k < half; ++k) {
+    const std::complex<double> even = data[k];
+    const std::complex<double> odd = data[half + k];
+
+    // The "twiddle-factor" is recomputed on purpose instead of being cached: the same
+    // 'base' algorithm runs parallel/serial, so it won't impact the performance comparison.
+    const std::complex<double> twiddle = exp(std::complex<double>(0, -2. * M_PI * k / n));
+    const std::complex<double> rotated_odd = twiddle * odd;
+
+    data[k] = even + rotated_odd;
+    data[half + k] = even - rotated_odd;
+  }
+}
+
+// Serial, recursive in-place fft of the n elements starting at data.
+// Ranges with fewer than two elements are left untouched.
+void conquer(complex_vector::iterator data, int n) {
+  if (n >= 2) {
+    const int half = n / 2;
+
+    divide(data, n);
+    conquer(data, half);
+    conquer(data + half, half);
+    combine(data, n);
+  }
+}
+
+}
+}
+}
--- a/extern/benchmark_base/src/sample_images.cpp.in
+++ b/extern/benchmark_base/src/sample_images.cpp.in
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+
+namespace comparison_benchmarks {
+    namespace base {
+        // Returns the paths "<dir>/sample_images/<i>.jpg" for i = 0..18, where <dir> is
+        // the placeholder below that CMake's configure_file() substitutes.
+        vector<string> get_sample_image_paths() {
+            const int image_count = 19;
+            const string directory = "@CMAKE_CURRENT_SOURCE_DIR@/sample_images/";
+
+            vector<string> paths;
+            paths.reserve(image_count);
+            for (int image = 0; image < image_count; ++image) {
+                paths.push_back(directory + to_string(image) + ".jpg");
+            }
+
+            return paths;
+        }
+    }
+}
--- a/extern/benchmark_base/src/unbalanced.cpp
+++ b/extern/benchmark_base/src/unbalanced.cpp
+#include "benchmark_base/unbalanced.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+
+node_state node::generate_child_state(uint32_t index) { // Derives a child state by hashing this node's state followed by the four bytes of index.
+  node_state result;
+
+  picosha2::hash256_one_by_one hasher;
+  hasher.process(state_.begin(), state_.end());
+  auto index_begin = reinterpret_cast<uint8_t *>(&index); // NOTE(review): bytes are fed in host byte order, so results presumably differ between little- and big-endian machines - confirm if that matters
+  hasher.process(index_begin, index_begin + 4);
+  hasher.finish();
+  hasher.get_hash_bytes(result.begin(), result.end()); // NOTE(review): result holds 20 bytes; presumably picosha2 stops at the end iterator - confirm
+
+  return result;
+}
+
+// Maps the last four state bytes (read as a big-endian integer with the sign bit
+// cleared) to a pseudo-random value in the range [0, 1].
+double node::get_state_random() {
+  uint32_t bits = 0;
+  for (int position = 16; position < 20; ++position) {
+    bits = (bits << 8u) | static_cast<uint32_t>(state_[position]);
+  }
+
+  // Mask out negative values
+  const int32_t non_negative = static_cast<int32_t>(bits & 0x7fffffff);
+
+  return static_cast<double>(non_negative) / static_cast<double>(INT32_MAX);
+}
+
+}
+}
+}
--- a/extern/benchmark_runner/CMakeLists.txt
+++ b/extern/benchmark_runner/CMakeLists.txt
+add_library(benchmark_runner INTERFACE)
+target_include_directories(benchmark_runner INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
--- a/extern/benchmark_runner/benchmark_runner.h
+++ b/extern/benchmark_runner/benchmark_runner.h
+
+#ifndef BENCHMARK_RUNNER_H
+#define BENCHMARK_RUNNER_H
+
+#include <string>
+#include <cstdlib>
+#include <vector>
+#include <chrono>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <bits/stdc++.h>
+
+using namespace std;
+
+/**
+ * Collects the runtimes of benchmark iterations and appends them to a CSV file.
+ *
+ * Typical usage: read_args(), construct the runner, wrap every measured run in
+ * start_iteration()/end_iteration() (or use run_iterations()), then commit_results().
+ * Errors (missing/invalid arguments, directory or file problems) are reported on
+ * stdout and terminate the process with exit code 1.
+ */
+class benchmark_runner {
+ private:
+  std::string csv_path_;
+  std::string csv_name_;
+
+  std::chrono::steady_clock::time_point last_start_time_;
+  std::vector<long> times_;
+
+  // Prints the average of all recorded runtimes (integer division, microseconds).
+  void print_statistics() {
+    if (times_.empty()) {
+      // Nothing recorded: avoid a division by zero.
+      std::cout << "Average Runtime (us): no iterations recorded" << std::endl;
+      return;
+    }
+
+    const long time_sum = std::accumulate(times_.begin(), times_.end(), 0l);
+    std::cout << "Average Runtime (us): " << (time_sum / static_cast<long>(times_.size())) << std::endl;
+  }
+
+  // Returns true if the file can be opened for reading.
+  inline bool file_exists(const std::string &name) {
+    std::ifstream f(name);
+    return f.good();
+  }
+
+  // Wraps raw in single quotes for a POSIX shell; embedded single quotes are escaped.
+  // This keeps paths containing spaces or shell meta characters intact.
+  static std::string shell_quote(const std::string &raw) {
+    std::string quoted{"'"};
+    for (const char c : raw) {
+      if (c == '\'') {
+        quoted += "'\\''";
+      } else {
+        quoted += c;
+      }
+    }
+    quoted += "'";
+    return quoted;
+  }
+
+ public:
+  /**
+   * Creates the output directory (including missing parents) via `mkdir -p`.
+   *
+   * @param csv_path Output directory; it is concatenated directly with csv_name, so it
+   *                 should end with a path separator.
+   * @param csv_name Name of the CSV file inside csv_path.
+   */
+  benchmark_runner(std::string csv_path, std::string csv_name) : csv_path_{std::move(csv_path)},
+                                                                 csv_name_{std::move(csv_name)},
+                                                                 times_{} {
+    // '--' ends option parsing, so paths starting with '-' are not taken as options.
+    const std::string command = "mkdir -p -- " + shell_quote(csv_path_);
+    const int res = std::system(command.c_str());
+    if (res) {
+      std::cout << "Error while creating directory!" << std::endl;
+      std::exit(1);
+    }
+  }
+
+  /**
+   * Reads `<output_directory> <num_threads>` from the command line.
+   * Exits with code 1 if an argument is missing or num_threads is not a positive number.
+   *
+   * @param argc, argv Arguments as passed to main.
+   * @param num_threads Receives the parsed thread count (argv[2]).
+   * @param path Receives the output directory (argv[1]).
+   */
+  static void read_args(int argc, char **argv, int &num_threads, std::string &path) {
+    if (argc < 3) {
+      std::cout << "Must Specifiy concurrency and output directory! (usage: `benchmark <output_directory> <num_threads>`)"
+                << std::endl;
+      std::exit(1);
+    }
+
+    path = argv[1];
+    num_threads = std::atoi(argv[2]);
+    if (num_threads <= 0) {
+      // atoi yields 0 for non-numeric input; a benchmark cannot run without threads.
+      std::cout << "Number of threads must be a positive integer!" << std::endl;
+      std::exit(1);
+    }
+  }
+
+  // Remembers the current time as the start of the iteration being measured.
+  void start_iteration() {
+    last_start_time_ = std::chrono::steady_clock::now();
+  }
+
+  // Records the time elapsed since the last start_iteration() call in microseconds.
+  void end_iteration() {
+    const auto end_time = std::chrono::steady_clock::now();
+    const long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - last_start_time_).count();
+    times_.emplace_back(time);
+  }
+
+  /**
+   * Runs f warmup_count times without measuring, then count times with measuring.
+   *
+   * @param count Number of measured runs.
+   * @param f Workload to execute.
+   * @param warmup_count Number of unmeasured runs executed first.
+   */
+  void run_iterations(int count, std::function<void(void)> f, int warmup_count) {
+    for (int i = 0; i < warmup_count; i++) {
+      f();
+    }
+
+    for (int i = 0; i < count; i++) {
+      start_iteration();
+      f();
+      end_iteration();
+    }
+  }
+
+  /**
+   * Appends all recorded runtimes to the CSV file and clears them afterwards.
+   * The "runtime_us" header is only written if the file did not exist before.
+   *
+   * @param print_stats If true, the average runtime is printed to stdout first.
+   */
+  void commit_results(bool print_stats) {
+    if (print_stats) {
+      print_statistics();
+    }
+
+    const std::string full_filename = csv_path_ + csv_name_;
+    const bool write_header = !file_exists(full_filename);
+
+    { // Scope for output file
+      std::ofstream o(full_filename, std::fstream::out | std::fstream::app);
+      if (!o) {
+        // Do not drop the measurements silently.
+        std::cout << "Error while opening result file!" << std::endl;
+        std::exit(1);
+      }
+
+      if (write_header) {
+        o << "runtime_us" << '\n';
+      }
+      for (const auto time : times_) {
+        o << time << '\n';
+      }
+    } // End Scope for output file (flushes and closes the file)
+
+    times_.clear();
+  }
+};
+
+#endif //BENCHMARK_RUNNER_H
--- a/extern/picosha2/CMakeLists.txt
+++ b/extern/picosha2/CMakeLists.txt
+add_library(picosha2 INTERFACE)
+target_include_directories(picosha2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
\ No newline at end of file
--- a/extern/picosha2/LICENSE
+++ b/extern/picosha2/LICENSE
+MIT License
+
+Copyright (c) 2017 okdshin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- a/extern/picosha2/picosha2.h
+++ b/extern/picosha2/picosha2.h
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -29,12 +29,12 @@ pls::internal::scheduling::parallel_result<int> for_each(const RandomIt first,
    // Cut in half recursively
    const long middle_index = num_elements / 2;

-    return scheduler::par([first, middle_index, last, &function, min_elements] {
+    return scheduler::par([first, middle_index, last, function, min_elements] {
      return internal::for_each(first,
                                first + middle_index,
                                function,
                                min_elements);
-    }, [first, middle_index, last, &function, min_elements] {
+    }, [first, middle_index, last, function, min_elements] {
      return internal::for_each(first + middle_index,
                                last,
                                function,

--- a/lib/pls/include/pls/internal/helpers/range.h
+++ b/lib/pls/include/pls/internal/helpers/range.h
@@ -112,7 +112,7 @@ struct basic_range {
        : r(rhs.r), index(rhs.index) {}

    const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}  // dereferences p_range (must not be null) and copies the range into the iterator

    const_iterator_impl &operator=(const const_iterator_impl &rhs) {
      r = rhs.r;
@@ -121,7 +121,7 @@ struct basic_range {
    }

    bool operator==(const const_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;  // equal when the held ranges compare equal and the positions match
    }

    bool operator!=(const const_iterator_impl &rhs) const {
@@ -145,7 +145,7 @@ struct basic_range {
    }

    value_type operator*() const {
-      return r->m_first_element + r->m_step * index;
+      return r.m_first_element + r.m_step * index;  // value at the current position: first element plus index steps
    }

    // operator->
@@ -212,11 +212,11 @@ struct basic_range {

    value_type operator[](difference_type offset) const {
      size_type new_index = index + offset;
-      return r->m_first_element + r->m_step * new_index;
+      return r.m_first_element + r.m_step * new_index;  // value at position index + offset; the iterator itself is not advanced
    }

   private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
    size_type index;
  };

@@ -236,7 +236,7 @@ struct basic_range {
        : r(rhs.r), index(rhs.index) {}

    const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}  // dereferences p_range (must not be null) and copies the range into the iterator

    const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
      r = rhs.r;
@@ -245,7 +245,7 @@ struct basic_range {
    }

    bool operator==(const const_reverse_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;  // equal when the held ranges compare equal and the positions match
    }

    bool operator!=(const const_reverse_iterator_impl &rhs) const {
@@ -270,8 +270,8 @@ struct basic_range {

    value_type operator*() const {
      size_type reverse_index
-          = (r->m_element_count - 1) - index;
-      return r->m_first_element + r->m_step * reverse_index;
+          = (r.m_element_count - 1) - index;  // mirror the position: index 0 maps to the last element of the range
+      return r.m_first_element + r.m_step * reverse_index;
    }

    // operator->
@@ -338,12 +338,12 @@ struct basic_range {

    value_type operator[](difference_type offset) const {
      size_type new_reverse_index
-          = (r->m_element_count - 1) - (index + offset);
-      return r->m_first_element + r->m_step * new_reverse_index;
+          = (r.m_element_count - 1) - (index + offset);  // mirrored position of index + offset, counted from the last element
+      return r.m_first_element + r.m_step * new_reverse_index;
    }

   private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
    size_type index;
  };