Add two 'standardized' benchmarks.

79ac0243 · FritzFlorian · 2f539691 · 79ac0243 · 79ac0243 · 79ac0243
Commit 79ac0243 authored Dec 20, 2019 by FritzFlorian
26 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,9 @@ list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Each library has an own CMakeLists.txt that should make it avaliabale as a library target,
 # thus allowing one to include it as any cmake dependency later on.
 add_subdirectory(extern/catch2)
+add_subdirectory(extern/picosha2)
+add_subdirectory(extern/benchmark_base)
+add_subdirectory(extern/benchmark_runner)
 # Include all internal subprojects (library, examples, testing).
 add_subdirectory(lib/pls)

--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
-add_executable(benchmark_fft main.cpp)
+add_executable(benchmark_fft_pls_v2 main.cpp)
-target_link_libraries(benchmark_fft pls)
+target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
-if(EASY_PROFILER)
+if (EASY_PROFILER)
-    target_link_libraries(benchmark_fft easy_profiler)
+    target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
-endif()
+endif ()
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -2,142 +2,91 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/internal/helpers/profiler.h"
 using namespace pls::internal::scheduling;
 #include <iostream>
 #include <complex>
 #include <vector>
-#include <atomic>
-static constexpr int CUTOFF = 16;
-static constexpr int INPUT_SIZE = 16384;
-typedef std::vector<std::complex<double>> complex_vector;
-void divide(complex_vector::iterator data, int n) {
-  complex_vector tmp_odd_elements(n / 2);
-  for (int i = 0; i < n / 2; i++) {
-    tmp_odd_elements[i] = data[i * 2 + 1];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i] = data[i * 2];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i + n / 2] = tmp_odd_elements[i];
-  }
-}
-void combine(complex_vector::iterator data, int n) {
+#include "benchmark_runner.h"
-  for (int i = 0; i < n / 2; i++) {
+#include "benchmark_base/fft.h"
-    std::complex<double> even = data[i];
-    std::complex<double> odd = data[i + n / 2];
-    // w is the "twiddle-factor".
-    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
-    // so it won't impact the performance comparison.
-    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
-    data[i] = even + w * odd;
-    data[i + n / 2] = even - w * odd;
-  }
-}
-void fft_normal(complex_vector::iterator data, int n) {
-  if (n < 2) {
-    return;
-  }
-  divide(data, n);
+using namespace comparison_benchmarks::base;
-  fft_normal(data, n / 2);
-  fft_normal(data + n / 2, n / 2);
-  combine(data, n);
-}
-parallel_result<short> fft(complex_vector::iterator data, int n) {
+parallel_result<short> conquer(fft::complex_vector::iterator data, int n) {
  if (n < 2) {
    return parallel_result<short>{0};
  }
-  divide(data, n);
+  fft::divide(data, n);
-  if (n <= CUTOFF) {
+  if (n <= fft::RECURSIVE_CUTOFF) {
-    fft_normal(data, n / 2);
+    fft::conquer(data, n / 2);
-    fft_normal(data + n / 2, n / 2);
+    fft::conquer(data + n / 2, n / 2);
-    combine(data, n);
+    fft::combine(data, n);
    return parallel_result<short>{0};
  } else {
    return scheduler::par([=]() {
-      return fft(data, n / 2);
+      return conquer(data, n / 2);
    }, [=]() {
-      return fft(data + n / 2, n / 2);
+      return conquer(data + n / 2, n / 2);
    }).then([=](int, int) {
-      combine(data, n);
+      fft::combine(data, n);
      return parallel_result<short>{0};
    });
  }
 }
-complex_vector prepare_input(int input_size) {
+constexpr int MAX_NUM_THREADS = 8;
-  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+constexpr int MAX_NUM_TASKS = 64;
-  complex_vector data(input_size);
+constexpr int MAX_NUM_CONTS = 64;
+constexpr int MAX_CONT_SIZE = 256;
-  // Set our input data to match a time series of the known_frequencies.
-  // When applying fft to this time-series we should find these frequencies.
-  for (int i = 0; i < input_size; i++) {
-    data[i] = std::complex<double>(0.0, 0.0);
-    for (auto frequencie : known_frequencies) {
-      data[i] += sin(2 * M_PI * frequencie * i / input_size);
-    }
-  }
-  return data;
-}
-static constexpr int NUM_ITERATIONS = 500;
+int main(int argc, char **argv) {
-constexpr size_t NUM_THREADS = 2;
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);
-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};
-constexpr size_t NUM_CONTS = 128;
+  fft::complex_vector data = fft::generate_input();
-constexpr size_t MAX_CONT_SIZE = 512;
-int main() {
+  static_scheduler_memory<MAX_NUM_THREADS,
-  PROFILE_ENABLE;
+                          MAX_NUM_TASKS,
-  complex_vector initial_input = prepare_input(INPUT_SIZE);
+                          MAX_NUM_CONTS,
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
-  auto start = std::chrono::steady_clock::now();
+  for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    complex_vector input_2(initial_input);
    scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
      return scheduler::par([&]() {
-        return fft(input_2.begin(), INPUT_SIZE);
+        return conquer(data.begin(), fft::SIZE);
      }, []() {
-        return parallel_result<int>{0};
+        return parallel_result<short>{0};
-      }).then([](int, int) {
+      }).then([&](short, short) {
        return parallel_result<int>{0};
      });
    });
-    PROFILE_LOCK("DONE");
  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
+  for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
-            << std::endl;
+    scheduler.perform_work([&]() {
-  PROFILE_SAVE("test_profile.prof");
+      runner.start_iteration();
-  start = std::chrono::steady_clock::now();
+      return scheduler::par([&]() {
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
+        return conquer(data.begin(), fft::SIZE);
-    complex_vector input_1(initial_input);
+      }, []() {
-    fft_normal(input_1.begin(), INPUT_SIZE);
+        return parallel_result<short>{0};
+      }).then([&](short, short) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
  }
-  end = std::chrono::steady_clock::now();
+  runner.commit_results(true);
-  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
  return 0;
 }
--- a/app/benchmark_matrix/CMakeLists.txt
+++ b/app/benchmark_matrix/CMakeLists.txt
-add_executable(benchmark_matrix main.cpp)
+add_executable(benchmark_matrix_pls_v2 main.cpp)
-target_link_libraries(benchmark_matrix pls)
+target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base)
 if (EASY_PROFILER)
-    target_link_libraries(benchmark_matrix easy_profiler)
+    target_link_libraries(benchmark_matrix_pls_v2 easy_profiler)
 endif ()
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -2,112 +2,78 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/algorithms/for_each.h"
 using namespace pls::internal::scheduling;
-#include <chrono>
+#include "benchmark_runner.h"
+#include "benchmark_base/matrix.h"
-const int MATRIX_SIZE = 128;
+using namespace comparison_benchmarks::base;
 template<typename T, int SIZE>
-class matrix {
+class pls_matrix : public matrix::matrix<T, SIZE> {
 public:
-  T data[SIZE][SIZE];
+  pls_matrix() : matrix::matrix<T, SIZE>() {}
-  explicit matrix(T i = 1) {
-    std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i);
-  }
-  parallel_result<int> multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
+  parallel_result<int> pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
-    return pls::algorithm::for_each_range(0, SIZE, [&](int i) {
+    return pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
      this->multiply_column(i, a, b);
    });
  }
- private:
-  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
-    for (int j = 0; j < SIZE; ++j) {
-      data[i][j] = 0;
-    }
-    for (int k = 0; k < SIZE; ++k) {
-      for (int j = 0; j < SIZE; ++j) {
-        data[i][j] += a.data[i][k] * b.data[k][j];
-      }
-    }
-  }
 };
-void fill_with_data(matrix<double, MATRIX_SIZE> &a, matrix<double, MATRIX_SIZE> &b) {
+constexpr size_t MAX_NUM_THREADS = 8;
-  // Fill in some data...
+constexpr size_t MAX_NUM_TASKS = 32;
-  for (int i = 0; i < MATRIX_SIZE; i++) {
+constexpr size_t MAX_NUM_CONTS = 32;
-    for (int j = 0; j < MATRIX_SIZE; j++) {
+constexpr size_t MAX_CONT_SIZE = 512;
-      a.data[i][j] = i;
-      b.data[i][j] = j;
-    }
-  }
-}
-static constexpr int NUM_ITERATIONS = 1000;
+int main(int argc, char **argv) {
-constexpr size_t NUM_THREADS = 3;
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);
-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};
-constexpr size_t NUM_CONTS = 128;
+  pls_matrix<double, matrix::MATRIX_SIZE> a;
-constexpr size_t MAX_CONT_SIZE = 512;
+  pls_matrix<double, matrix::MATRIX_SIZE> b;
+  pls_matrix<double, matrix::MATRIX_SIZE> result;
-int main() {
+  static_scheduler_memory<MAX_NUM_THREADS,
-  PROFILE_ENABLE
+                          MAX_NUM_TASKS,
-  matrix<double, MATRIX_SIZE> a;
+                          MAX_NUM_CONTS,
-  matrix<double, MATRIX_SIZE> b;
-  matrix<double, MATRIX_SIZE> result;
-  fill_with_data(a, b);
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
+  for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {
-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
      return scheduler::par([&]() {
-        return result.multiply(a, b);
+        return result.pls_multiply(a, b);
      }, []() {
        return parallel_result<int>{0};
-      }).then([](int, int) {
+      }).then([&](int, int) {
        return parallel_result<int>{0};
      });
    });
  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-}
-//int main() {
+  for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
-//  PROFILE_ENABLE
+    scheduler.perform_work([&]() {
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
+      runner.start_iteration();
-//  pls::scheduler scheduler{&my_scheduler_memory, 4};
-//
-//  matrix<double, MATRIX_SIZE> a;
-//  matrix<double, MATRIX_SIZE> b;
-//  matrix<double, MATRIX_SIZE> result;
-//  fill_with_data(a, b);
-//
-//  scheduler.perform_work([&] {
-//    auto start_time = std::chrono::high_resolution_clock::now();
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10000; i++) {
-//      PROFILE_WORK_BLOCK("Top Level")
-//      result.multiply(a, b);
-//    }
-//    auto end_time = std::chrono::high_resolution_clock::now();
-//    long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-//    std::cout << "Runtime: " << time << "us" << std::endl;
-//  });
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
+      return scheduler::par([&]() {
+        return result.pls_multiply(a, b);
+      }, []() {
+        return parallel_result<int>{0};
+      }).then([&](int, int) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
+  }
+  runner.commit_results(true);
+}
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -51,22 +51,22 @@ parallel_result<int> unbalanced_tree_search(int seed, int root_children, double 
  return result;
 }
-constexpr size_t NUM_THREADS = 5;
+constexpr size_t MAX_NUM_THREADS = 5;
-constexpr size_t NUM_TASKS = 128;
+constexpr size_t MAX_NUM_TASKS = 128;
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 512;
 volatile int result;
 int main() {
  PROFILE_ENABLE
-  static_scheduler_memory<NUM_THREADS,
+  static_scheduler_memory<MAX_NUM_THREADS,
-                          NUM_TASKS,
+                          MAX_NUM_TASKS,
-                          NUM_CONTS,
+                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
  scheduler.perform_work([&]() {
    return scheduler::par([&]() {

--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -8,12 +8,12 @@
 using namespace pls::internal;
-constexpr size_t NUM_THREADS = 4;
+constexpr size_t MAX_NUM_THREADS = 1;
-constexpr size_t NUM_TASKS = 128;
+constexpr size_t MAX_NUM_TASKS = 128;
-static constexpr int NUM_ITERATIONS = 100;
+static constexpr int NUM_ITERATIONS = 10;
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 256;
 int fib_normal(int n) {
@@ -29,8 +29,13 @@ int fib_normal(int n) {
 }
 scheduling::parallel_result<int> fib(int n) {
-  if (n <= 10) {
+  pls::variable<int> i;
-    return fib_normal(n);
+  pls::array<int> a{10};
+  if (n == 0) {
+    return 0;
+  }
+  if (n == 1) {
+    return 1;
  }
  return scheduling::scheduler::par([=]() {
@@ -45,12 +50,12 @@ scheduling::parallel_result<int> fib(int n) {
 static volatile int result;
 int main() {
  PROFILE_ENABLE;
-  scheduling::static_scheduler_memory<NUM_THREADS,
+  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
-                                      NUM_TASKS,
+                                      MAX_NUM_TASKS,
-                                      NUM_CONTS,
+                                      MAX_NUM_CONTS,
                                      MAX_CONT_SIZE> static_scheduler_memory;
-  scheduling::scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
  auto start = std::chrono::steady_clock::now();
  for (int i = 0; i < NUM_ITERATIONS; i++) {

--- a/extern/benchmark_base/CMakeLists.txt
+++ b/extern/benchmark_base/CMakeLists.txt
+# benchmark_base: shared configuration and common algorithm pieces used by the benchmarks.
+#
+# sample_images.cpp is generated from its .in template into the build directory,
+# which is why it is referenced through CMAKE_CURRENT_BINARY_DIR below.
+configure_file(src/sample_images.cpp.in sample_images.cpp)
+add_library(benchmark_base STATIC
+        # Generated source
+        ${CMAKE_CURRENT_BINARY_DIR}/sample_images.cpp
+        # FFT
+        src/fft.cpp
+        include/benchmark_base/fft.h
+        # Header-only pieces
+        include/benchmark_base/heat.h
+        include/benchmark_base/matrix.h
+        # Unbalanced tree search
+        include/benchmark_base/unbalanced.h
+        src/unbalanced.cpp
+        # Vendored range helper
+        include/benchmark_base/range.h)
+# Consumers only see include/, the src/ directory stays private to this library.
+target_include_directories(benchmark_base
+        PUBLIC
+        $<INSTALL_INTERFACE:include>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+        )
+target_link_libraries(benchmark_base picosha2)
--- a/extern/benchmark_base/include/benchmark_base/.gitkeep
+++ b/extern/benchmark_base/include/benchmark_base/.gitkeep
--- a/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
+++ b/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
+Boost Software License - Version 1.0 - August 17th, 2003
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/extern/benchmark_base/include/benchmark_base/fft.h
+++ b/extern/benchmark_base/include/benchmark_base/fft.h
+#ifndef COMPARISON_BENCHMARKS_BASE_FFT_H
+#define COMPARISON_BENCHMARKS_BASE_FFT_H
+#include <complex>
+#include <string>
+#include <vector>
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+// Number of complex samples in the benchmark input (passed as 'n' to the transform).
+constexpr int SIZE = 8192;
+// Number of measured benchmark iterations.
+constexpr int NUM_ITERATIONS = 1000;
+// Number of iterations executed before measuring starts.
+constexpr int NUM_WARMUP_ITERATIONS = 100;
+// Sub-problems of at most this many elements are not split into parallel tasks anymore.
+constexpr int RECURSIVE_CUTOFF = 32;
+// Container type holding the samples that are transformed in place.
+using complex_vector = std::vector<std::complex<double>>;
+// Creates the input data for one benchmark run (implemented in src/fft.cpp).
+complex_vector generate_input();
+// Building blocks of the recursive FFT; all of them operate in place on the
+// n elements starting at 'data'.
+// NOTE(review): presumably divide() reorders even/odd elements, conquer() is the
+// sequential recursion and combine() merges two halves - confirm against src/fft.cpp.
+void divide(complex_vector::iterator data, int n);
+void conquer(complex_vector::iterator data, int n);
+void combine(complex_vector::iterator data, int n);
+}
+}
+}
+#endif //COMPARISON_BENCHMARKS_BASE_FFT_H
--- a/extern/benchmark_base/include/benchmark_base/heat.h
+++ b/extern/benchmark_base/include/benchmark_base/heat.h
+#ifndef COMPARISON_BENCHMARKS_BASE_HEAT_H
+#define COMPARISON_BENCHMARKS_BASE_HEAT_H
+#include <array>
+#include <iostream>
+#include <memory>
+namespace comparison_benchmarks {
+namespace base {
+namespace heat {
+const int DIFFUSION_SIZE = 256;
+const int DIFFUSION_STEPS = 256;
+const int NUM_ITERATIONS = 100;
+const int WARMUP_ITERATIONS = 20;
+/**
+ * Sequential reference implementation of a heat diffusion simulation on a square grid.
+ *
+ * The grid holds SIZE x SIZE inner cells surrounded by a one cell wide border, so both
+ * buffers are (SIZE + 2) x (SIZE + 2). Every simulation step reads from current_data,
+ * writes to next_data and then swaps the two pointers.
+ *
+ * The object owns both heap allocated buffers. It is not copyable, as a copy would share
+ * the buffers and release them a second time on destruction.
+ *
+ * @tparam T    element type of the grid cells
+ * @tparam SIZE number of inner rows/columns (without the border)
+ */
+template<typename T, int SIZE>
+class heat_diffusion {
+  // Center portion is SIZExSIZE, borders are fixed temperature values
+  using matrix = std::array<std::array<T, SIZE + 2>, SIZE + 2>;
+ protected:
+  // Sane default values for the simulation (from paper).
+  // This is not about perfect simulation results but the speedup of the workload.
+  double c = 0.1;
+  double d_s = 1.0 / (SIZE + 1);
+  double d_t = (d_s * d_s) / (4 * c);
+ public:
+  // Grid read during the current step / grid written during the current step.
+  matrix *current_data;
+  matrix *next_data;
+  // Allocates both buffers and initializes them via reset_data().
+  explicit heat_diffusion() : current_data{nullptr}, next_data{nullptr} {
+    // Allocate through unique_ptr first, so the first buffer is not leaked
+    // if the second allocation throws.
+    std::unique_ptr<matrix> current{new matrix};
+    std::unique_ptr<matrix> next{new matrix};
+    current_data = current.release();
+    next_data = next.release();
+    reset_data();
+  }
+  // Copying would lead to both objects deleting the same buffers.
+  heat_diffusion(const heat_diffusion &) = delete;
+  heat_diffusion &operator=(const heat_diffusion &) = delete;
+  // Virtual, as the class has a virtual member function and is meant to be derived from.
+  virtual ~heat_diffusion() {
+    delete current_data;
+    delete next_data;
+  }
+  /**
+   * Runs n simulation steps sequentially. After each step the buffers are swapped,
+   * so current_data always refers to the most recent result.
+   *
+   * @param n number of simulation steps to execute
+   */
+  virtual void run_simulation(int n) {
+    for (int i = 0; i < n; i++) {
+      for (int row = 1; row <= SIZE; row++) {
+        for (int column = 1; column <= SIZE; column++) {
+          update_element(row, column);
+        }
+      }
+      // Synchronization point needed to coordinate the calculation!
+      swap_data_arrays();
+    }
+  }
+ protected:
+  // Computes the next value of one inner cell from its own value and its four
+  // direct neighbours in current_data and stores it in next_data.
+  void update_element(int row, int column) {
+    (*next_data)[row][column] = (*current_data)[row][column] + ((c * d_t) / (d_s * d_s)) *
+        ((*current_data)[row + 1][column] + (*current_data)[row - 1][column]
+            - 4 * (*current_data)[row][column]
+            + (*current_data)[row][column + 1] + (*current_data)[row][column - 1]);
+  }
+  // Exchanges the roles of the two buffers (no data is copied).
+  void swap_data_arrays() {
+    matrix *tmp = current_data;
+    current_data = next_data;
+    next_data = tmp;
+  }
+  // Sets every cell of both buffers to 0.0, except for the first and the last row,
+  // which are set to 1.0.
+  void reset_data() {
+    for (int row = 0; row < SIZE + 2; row++) {
+      for (int column = 0; column < SIZE + 2; column++) {
+        (*current_data)[row][column] = 0.0;
+        (*next_data)[row][column] = 0.0;
+        // Edges are a fixed, hot temperature
+        if (row == 0 || row == SIZE + 1) {
+          (*current_data)[row][column] = 1.0;
+          (*next_data)[row][column] = 1.0;
+        }
+      }
+    }
+  }
+};
+/**
+ * Writes a coarse text rendering of the current grid (including the border) to a stream.
+ * Each cell becomes one character followed by a tab, each grid row ends with std::endl:
+ *   ' ' for values below 0.1, '-' below 0.2, '=' below 0.5 and '#' otherwise.
+ *
+ * @param strm       stream to write to
+ * @param simulation simulation whose current_data grid is printed
+ * @return the stream passed in, to allow chaining
+ */
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const heat_diffusion<T, SIZE> &simulation) {
+  // current_data is a pointer to the grid, it has to be dereferenced before indexing.
+  // Indexing the pointer itself would select a whole (out of bounds) grid, not a cell.
+  const auto &grid = *simulation.current_data;
+  for (int i = 0; i < SIZE + 2; i++) {
+    for (int j = 0; j < SIZE + 2; j++) {
+      const T temperature = grid[i][j];
+      // 'color' our output according to temperature
+      char out;
+      if (temperature < 0.1) {
+        out = ' ';
+      } else if (temperature < 0.2) {
+        out = '-';
+      } else if (temperature < 0.5) {
+        out = '=';
+      } else {
+        out = '#';
+      }
+      strm << out << "\t";
+    }
+    strm << std::endl;
+  }
+  return strm;
+}
+}
+}
+}
+#endif //COMPARISON_BENCHMARKS_BASE_HEAT_H
--- a/extern/benchmark_base/include/benchmark_base/matrix.h
+++ b/extern/benchmark_base/include/benchmark_base/matrix.h
+#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_H
+#define COMPARISON_BENCHMARKS_BASE_MATRIX_H
+#include <algorithm>
+#include <iostream>
+namespace comparison_benchmarks {
+namespace base {
+namespace matrix {
+const int MATRIX_SIZE = 128;
+const int NUM_ITERATIONS = 5000;
+const int WARMUP_ITERATIONS = 1000;
+/**
+ * Square SIZE x SIZE matrix with a sequential reference multiplication.
+ * Meant to be derived from by the framework specific benchmark implementations.
+ *
+ * @tparam T    element type
+ * @tparam SIZE number of rows and columns
+ */
+template<typename T, int SIZE>
+class matrix {
+ public:
+  T data[SIZE][SIZE];
+  // Fills every row i with the value i.
+  explicit matrix() {
+    // The bounds have to follow the template parameter SIZE (the extent of 'data'),
+    // not a global constant, or instantiations with a different size access
+    // memory outside of the array or stay partially uninitialized.
+    for (int i = 0; i < SIZE; i++) {
+      for (int j = 0; j < SIZE; j++) {
+        data[i][j] = i;
+      }
+    }
+  }
+  // Virtual, as the class has a virtual member function and is used as a base class.
+  virtual ~matrix() = default;
+  /**
+   * Stores the product a * b in this matrix, one result row after the other.
+   * This matrix must not be the same object as a or b, as each result row is
+   * cleared before the inputs are read.
+   *
+   * @param a left operand
+   * @param b right operand
+   */
+  virtual void multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
+    for (int i = 0; i < SIZE; i++) {
+      multiply_column(i, a, b);
+    }
+  }
+ protected:
+  // Computes data[i][*] of the product a * b (i.e. result row i, despite the name).
+  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
+    for (int j = 0; j < SIZE; ++j) {
+      data[i][j] = 0;
+    }
+    // k is the outer loop, so b and the result row are both walked along a row.
+    for (int k = 0; k < SIZE; ++k) {
+      for (int j = 0; j < SIZE; ++j) {
+        T a_data = a.data[i][k];
+        T b_data = b.data[k][j];
+        data[i][j] += a_data * b_data;
+      }
+    }
+  }
+};
+// Streams the matrix row by row: every element is followed by a tab,
+// every row is terminated with std::endl. Returns the stream for chaining.
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const matrix<T, SIZE> &matrix) {
+  for (const auto &row : matrix.data) {
+    for (const T &element : row) {
+      strm << element << "\t";
+    }
+    strm << std::endl;
+  }
+  return strm;
+}
+}
+}
+}
+#endif //COMPARISON_BENCHMARKS_BASE_MATRIX_H
--- a/extern/benchmark_base/include/benchmark_base/range.h
+++ b/extern/benchmark_base/include/benchmark_base/range.h
+/*
+                Range
+                =====
+    Copyright (c) 2009-2011 Khaled Alshaya
+    Distributed under the Boost Software License, version 1.0
+    (See the license at: http://www.boost.org/license_1_0.txt).
+*/
+/*
+                Rationale
+                =========
+    In Python, there is a beautiful function called "range".
+    "range" allows the programmer to iterate over a range elegantly.
+    This concept is not as general as "for-loops" in C++,
+    but nonetheless, it expresses the intent of the programmer
+    clearer than the general "for-loops" in many cases.
+                Design
+                ======
+    Range is made to be an STL-like library. In fact, it is 
+    built on top of the concepts of STL. The library is designed to
+    work with STL algorithms as well. Range is more flexible
+    than the Python "range", because:
+    Range is an "immutable ordered random access container"
+                Specifications
+                ==============
+    Range satisfies the following requirements:
+        * Immutable.
+        * Random Access Container.
+        * Random Access Iterator Interface.
+        * Constant Time Complexity Operations.
+    Range models an ordered sequence of elements,
+    where a range is defined by:
+        [begin, end)
+        * begin: the first element in the range. (Inclusive)
+        * end  : the last element in the range.  (Exclusive)
+        * step : the distance between two consecutive elements in a range.
+        where each element in the range is defined by:
+        element = begin + step * i
+        * i: is the index of the element in range.
+        The following precondition must be met for the sequence
+        to be a valid range:
+            step != 0
+            &&
+            (    
+                begin <= end && step > 0
+                ||
+                begin >= end && step < 0
+            )
+                Portability
+                ===========
+    Range Generator is written in standard C++ (C++98). It depends
+    -only- on the standard C++ library.
+*/
+// TODO: See if we should swap this out for our own implementation, for now this is fine, as it is self contained.
+/**
+ * Notes on Modification:
+ * The code was adapted to fit into our namespacing/naming scheme for simpler use.
+ * This includes ifdef's, namespace and code formatting style.
+ */
+#ifndef Range_h__
+#define Range_h__
+#include <iterator>
+#include <stdexcept>
+#include <cstddef>
+#include <cmath>
+namespace comparison_benchmarks {
+namespace base {
+namespace range {
+template<class IntegerType>
+struct basic_range {
+  struct const_iterator_impl {
+    typedef IntegerType value_type;
+    typedef std::size_t size_type;
+    typedef IntegerType difference_type;
+    typedef value_type *pointer;
+    typedef value_type &reference;
+    typedef
+    std::random_access_iterator_tag
+        iterator_category;
+    const_iterator_impl() : r(0), index(0) {}
+    const_iterator_impl(const const_iterator_impl &rhs)
+        : r(rhs.r), index(rhs.index) {}
+    const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
+        : r(p_range), index(p_index) {}
+    const_iterator_impl &operator=(const const_iterator_impl &rhs) {
+      r = rhs.r;
+      index = rhs.index;
+      return *this;
+    }
+    bool operator==(const const_iterator_impl &rhs) const {
+      return *r == *(rhs.r) && index == rhs.index;
+    }
+    bool operator!=(const const_iterator_impl &rhs) const {
+      return !(*this == rhs);
+    }
+    bool operator<(const const_iterator_impl &rhs) const {
+      return index < rhs.index;
+    }
+    bool operator>(const const_iterator_impl &rhs) const {
+      return index > rhs.index;
+    }
+    bool operator<=(const const_iterator_impl &rhs) const {
+      return index <= rhs.index;
+    }
+    bool operator>=(const const_iterator_impl &rhs) const {
+      return index >= rhs.index;
+    }
+    value_type operator*() const {
+      return r->m_first_element + r->m_step * index;
+    }
+    // operator->
+    // is not implemented because the value_type is an integer type
+    // and primitive types in C++ don't define member functions.
+    const_iterator_impl &operator++() {
+      ++index;
+      return *this;
+    }
+    const_iterator_impl operator++(int) {
+      const_iterator_impl temp = *this;
+      ++index;
+      return temp;
+    }
+    const_iterator_impl &operator--() {
+      --index;
+      return *this;
+    }
+    const_iterator_impl operator--(int) {
+      const_iterator_impl temp = *this;
+      --index;
+      return temp;
+    }
+    const_iterator_impl &operator+=(difference_type increment) {
+      index += increment;
+      return *this;
+    }
+    // operator+
+    // is friend operator but operator-
+    // is not, because we want to allow the following for "+":
+    // iterator+5
+    // 5+iterator
+    // For the "-" it is not correct to do so, because
+    // iterator-5 != 5-iterator
+    friend const_iterator_impl operator+
+        (const const_iterator_impl &lhs, difference_type increment) {
+      const_iterator_impl sum;
+      sum.r = lhs.r;
+      sum.index = lhs.index + increment;
+      return sum;
+    }
+    const_iterator_impl &operator-=(difference_type decrement) {
+      index -= decrement;
+      return *this;
+    }
+    const_iterator_impl operator-(difference_type decrement) const {
+      const_iterator_impl shifted_iterator;
+      shifted_iterator.r = r;
+      shifted_iterator.index = index - decrement;
+      return shifted_iterator;
+    }
+    difference_type operator-(const const_iterator_impl &rhs) const {
+      return index - rhs.index;
+    }
+    value_type operator[](difference_type offset) const {
+      size_type new_index = index + offset;
+      return r->m_first_element + r->m_step * new_index;
+    }
+   private:
+    basic_range<IntegerType> const *r;
+    size_type index;
+  };
+  struct const_reverse_iterator_impl {
+    typedef IntegerType value_type;
+    typedef std::size_t size_type;
+    typedef IntegerType difference_type;
+    typedef value_type *pointer;
+    typedef value_type &reference;
+    typedef
+    std::random_access_iterator_tag
+        iterator_category;
+    const_reverse_iterator_impl() : r(0), index(0) {}
+    const_reverse_iterator_impl(const const_reverse_iterator_impl &rhs)
+        : r(rhs.r), index(rhs.index) {}
+    const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
+        : r(p_range), index(p_index) {}
+    const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
+      r = rhs.r;
+      index = rhs.index;
+      return *this;
+    }
+    bool operator==(const const_reverse_iterator_impl &rhs) const {
+      return *r == *(rhs.r) && index == rhs.index;
+    }
+    bool operator!=(const const_reverse_iterator_impl &rhs) const {
+      return !(*this == rhs);
+    }
+    bool operator<(const const_reverse_iterator_impl &rhs) const {
+      return index < rhs.index;
+    }
+    bool operator>(const const_reverse_iterator_impl &rhs) const {
+      return index > rhs.index;
+    }
+    bool operator<=(const const_reverse_iterator_impl &rhs) const {
+      return index <= rhs.index;
+    }
+    bool operator>=(const const_reverse_iterator_impl &rhs) const {
+      return index >= rhs.index;
+    }
+    value_type operator*() const {
+      size_type reverse_index
+          = (r->m_element_count - 1) - index;
+      return r->m_first_element + r->m_step * reverse_index;
+    }
+    // operator->
+    // is not implemented because the value_type is integer type
+    // and primitive types in C++ don't define member functions.
+    const_reverse_iterator_impl &operator++() {
+      ++index;
+      return *this;
+    }
+    const_reverse_iterator_impl operator++(int) {
+      const_reverse_iterator_impl temp = *this;
+      ++index;
+      return temp;
+    }
+    const_reverse_iterator_impl &operator--() {
+      --index;
+      return *this;
+    }
+    const_reverse_iterator_impl operator--(int) {
+      const_reverse_iterator_impl temp = *this;
+      --index;
+      return temp;
+    }
+    const_reverse_iterator_impl &operator+=(difference_type increment) {
+      index += increment;
+      return *this;
+    }
+    // operator+
+    // is friend operator but operator-
+    // is not, because we want to allow the following for "+":
+    // iterator+5
+    // 5+iterator
+    // For the "-" it is not correct to do so, because
+    // iterator-5 != 5-iterator
+    friend const_reverse_iterator_impl operator+
+        (const const_reverse_iterator_impl &lhs, difference_type increment) {
+      const_reverse_iterator_impl sum;
+      sum.r = lhs.r;
+      sum.index = lhs.index + increment;
+      return sum;
+    }
+    const_reverse_iterator_impl &operator-=(difference_type decrement) {
+      index -= decrement;
+      return *this;
+    }
+    const_reverse_iterator_impl operator-(difference_type decrement) const {
+      const_reverse_iterator_impl shifted_iterator;
+      shifted_iterator.r = r;
+      shifted_iterator.index = index - decrement;
+      return shifted_iterator;
+    }
+    difference_type operator-(const const_reverse_iterator_impl &rhs) const {
+      return index - rhs.index;
+    }
+    value_type operator[](difference_type offset) const {
+      size_type new_reverse_index
+          = (r->m_element_count - 1) - (index + offset);
+      return r->m_first_element + r->m_step * new_reverse_index;
+    }
+   private:
+    basic_range<IntegerType> const *r;
+    size_type index;
+  };
+  typedef IntegerType value_type;
+  typedef const_iterator_impl iterator;
+  typedef const_iterator_impl const_iterator;
+  typedef const_reverse_iterator_impl reverse_iterator;
+  typedef const_reverse_iterator_impl const_reverse_iterator;
+  typedef value_type &reference;
+  typedef const value_type &const_reference;
+  typedef value_type *pointer;
+  typedef IntegerType difference_type;
+  typedef std::size_t size_type;
+  // In the case of default construction,
+  // the range is considered as an empty range with no elements.
+  // step can be anything other than 0. 1 is
+  // an implementation convention, and it doesn't have
+  // a significance in this case because the range is empty.
+  basic_range() : m_first_element(0), m_element_count(0), m_step(1) {}
+  // first_element: is begin in specifications.
+  // last_element: is end in specifications.
+  basic_range(value_type first_element, value_type last_element, value_type step)
+      : m_first_element(first_element),
+        m_step(step) {
+    // A step of zero is the only value that is invalid for every range:
+    // the element count is the range length divided by the step, so the
+    // division below would be undefined (an "infinite" range).
+    if (step == 0) {
+      throw std::out_of_range("Invalid Range: step can't be equal to zero!");
+    }
+    // The sign of the step has to agree with the direction of the range.
+    const bool is_forward = first_element < last_element;
+    const bool is_backward = first_element > last_element;
+    if (is_forward && step < 0) {
+      throw std::out_of_range("Invalid Range: step can't be backward, while the range is forward!");
+    }
+    if (is_backward && step > 0) {
+      throw std::out_of_range("Invalid Range: step can't be forward, while the range is backward!");
+    }
+    // Number of elements = length / step, rounded up when the step
+    // does not divide the length evenly (last_element itself is excluded).
+    const auto length = last_element - first_element;
+    m_element_count = length / step + ((length % step != 0) ? 1 : 0);
+  }
+  // The following constructor, determines the step
+  // automatically. If the range is forward, then
+  // step will be one. If the range is backward,
+  // step will be minus one. If the begin is equal
+  // to end, then the step must not equal to zero
+  // and it is set to one as a convention.
+  // Builds the range [first_element, last_element) with an automatically
+  // chosen step: 1 when last_element >= first_element, -1 otherwise.
+  // Delegates to the three-argument constructor so that every member is
+  // initialized exactly once (no temporary range, no copy assignment).
+  basic_range(value_type first_element, value_type last_element)
+      : basic_range(first_element, last_element, last_element >= first_element ? 1 : -1) {}
+  // The following constructor is a shortcut
+  // if you want the first element as zero.
+  // the step is determined automatically, based
+  // on the last element. If the last element is
+  // positive, then step is one, but if it is negative
+  // then step is minus one.
+  // Builds the range [0, last_element) with an automatically chosen step:
+  // 1 when last_element >= 0, -1 otherwise.
+  // The constructor is named through the injected class name; spelling it
+  // as basic_range<IntegerType>(...) is no longer valid in C++20.
+  // It is intentionally left non-explicit so existing implicit conversions
+  // from an integer keep compiling.
+  basic_range(value_type last_element)
+      : basic_range(value_type(0), last_element, last_element >= value_type(0) ? 1 : -1) {}
+  // Copy constructor: copies first element, element count and step.
+  // Declared through the injected class name; the template-id form
+  // basic_range<IntegerType>(...) is rejected as a constructor name in C++20.
+  basic_range(const basic_range &r)
+      : m_first_element(r.m_first_element),
+        m_element_count(r.m_element_count),
+        m_step(r.m_step) {}
+  basic_range<IntegerType> &operator=(const basic_range<IntegerType> &r) {
+    m_first_element = r.m_first_element;
+    m_element_count = r.m_element_count;
+    m_step = r.m_step;
+    return *this;
+  }
+  bool operator==(const basic_range<IntegerType> &r) const {
+    return m_first_element == r.m_first_element
+        &&
+            m_element_count == r.m_element_count
+        &&
+            m_step == r.m_step;
+  }
+  bool operator!=(const basic_range<IntegerType> &r) const {
+    return !(*this == r);
+  }
+  // The following four functions enable the user to compare
+  // ranges using ( <, >, <=, >=).
+  // The comparison between two ranges is a simple lexicographical
+  // comparison (element by element). By convention, if R1 has fewer
+  // elements than R2 and every element of R1 equals the element of R2
+  // at the same position (R1 is a prefix of R2), then
+  // R1 is considered less than R2.
+  bool operator<(const basic_range<IntegerType> &r) const {
+    // Element counts are never negative here (the constructors reject a step
+    // that points away from the end), so "not zero" means "at least one".
+    // Nothing is less than an empty range ...
+    if (r.m_element_count == 0)
+      return false;
+    // ... and an empty range is less than every non-empty one.
+    if (m_element_count == 0)
+      return true;
+    // Both ranges have a first element; it decides whenever it differs.
+    if (m_first_element != r.m_first_element)
+      return m_first_element < r.m_first_element;
+    // Same first element. If one side stops after it, that side is a prefix
+    // of the other and only the shorter one compares less.
+    if (m_element_count == 1 || r.m_element_count == 1)
+      return m_element_count == 1 && r.m_element_count > 1;
+    // Both ranges have a second element (index 1). Because the step is
+    // constant, the second element settles the order of all later elements.
+    const auto second = m_first_element + m_step * 1;
+    const auto other_second = r.m_first_element + r.m_step * 1;
+    if (second != other_second)
+      return second < other_second;
+    // First two elements agree, so the ranges are co-linear and the
+    // shorter one is the smaller one by convention.
+    return m_element_count < r.m_element_count;
+  }
+  // Lexicographical "greater than": *this > r holds exactly when r < *this.
+  // Delegating to operator< keeps a single implementation of the ordering
+  // instead of a hand-mirrored copy that has to be kept in sync.
+  bool operator>(const basic_range<IntegerType> &r) const {
+    return r < *this;
+  }
+  bool operator<=(const basic_range<IntegerType> &r) const {
+    return !(*this > r);
+  }
+  bool operator>=(const basic_range<IntegerType> &r) const {
+    return !(*this < r);
+  }
+  const_iterator begin() const {
+    return const_iterator(this, 0);
+  }
+  const_iterator end() const {
+    return const_iterator(this, m_element_count);
+  }
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(this, 0);
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(this, m_element_count);
+  }
+  size_type size() const {
+    return m_element_count;
+  }
+  size_type max_size() const {
+    // Because this is an immutable container,
+    // max_size() == size()
+    return m_element_count;
+  }
+  bool empty() const {
+    return m_element_count == 0;
+  }
+  // exist() and find() are similar except that
+  // find() returns an iterator to the element (end() if it is not in the range).
+  iterator find(value_type element) const {
+    // Offset of the element from the start of the range. The element can
+    // only be part of the range if the offset is a whole number of steps.
+    const auto offset = element - m_first_element;
+    if (offset % m_step != 0)
+      return end();
+    // Position the element would have; reject positions outside the range.
+    const value_type element_index = offset / m_step;
+    if (element_index < 0 || element_index >= m_element_count)
+      return end();
+    return begin() + element_index;
+  }
+  bool exist(value_type element) const {
+    return find(element) != end();
+  }
+  // In the standard, the operator[]
+  // should return a const reference.
+  // Because Range Generator doesn't store its elements
+  // internally, we return a copy of the value.
+  // In any case, this doesn't affect the semantics of the operator.
+  value_type operator[](size_type index) const {
+    return m_first_element + m_step * index;
+  }
+ private:
+  // m_first_element: begin (see specifications).
+  // m_element_count: (end - begin) / step
+  value_type m_first_element, m_element_count, m_step;
+};
+// This is the default type of range!
+typedef basic_range<int> range;
+}
+}
+}
+#endif // range_h__
--- a/extern/benchmark_base/include/benchmark_base/unbalanced.h
+++ b/extern/benchmark_base/include/benchmark_base/unbalanced.h
+#ifndef COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+#define COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+#include <cstdint>
+#include <array>
+#include <vector>
+#include "picosha2.h"
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+const int SEED = 42;
+const int ROOT_CHILDREN = 140;
+const double Q = 0.124875;
+const int NORMAL_CHILDREN = 8;
+const int NUM_NODES = 71069;
+const int NUM_ITERATIONS = 50;
+const int WARMUP_ITERATIONS = 5;
+using node_state = std::array<uint8_t, 20>;
+/**
+ * Node of an unbalanced binomial tree (https://www.cs.unc.edu/~olivier/LCPC06.pdf).
+ * To build up the tree, recursively call spawn_child_node on each node until leaves are reached.
+ * The tree is not built up directly in memory, but rather by the recursive calls.
+ */
+class node {
+  // Hash-derived state of this node; child states are derived from it,
+  // which makes the tree construction deterministic.
+  node_state state_;
+  // Number of children this node spawns (0 for a leaf).
+  int num_children_;
+  // Positive only for the root node: fixed number of children the tree starts with.
+  // Child nodes carry -1 here.
+  int root_children_;
+  // General branching parameters: a non-root node gets b_ children
+  // when its state-derived random value is below q_, otherwise none.
+  double q_;
+  int b_;
+  // Private constructor used for child nodes (never a root, hence root_children_ = -1).
+  node(node_state state, double q, int b)
+      : state_{state}, num_children_{0}, root_children_{-1}, q_{q}, b_{b} {
+    init_num_children();
+  }
+  // Derives the state of the child with the given index (defined out of line).
+  node_state generate_child_state(uint32_t index);
+  // Maps the node state to a floating point value (defined out of line).
+  double get_state_random();
+  // Decides how many children this node has, see the member comments above.
+  void init_num_children() {
+    double state_random = get_state_random();
+    if (root_children_ > 0) {
+      // Root always spawns children
+      num_children_ = root_children_;
+      return;
+    }
+    num_children_ = (state_random < q_) ? b_ : 0;
+  }
+ public:
+  // Creates the root node of a tree from a 32 bit seed.
+  node(uint32_t seed, int root_children, double q, int b)
+      : state_({{}}), num_children_{0}, root_children_{root_children}, q_{q}, b_{b} {
+    // Initial state: all zero, with the seed stored most significant
+    // byte first in the last four bytes.
+    state_.fill(0);
+    for (unsigned byte = 0; byte < 4; byte++) {
+      state_[16 + byte] = static_cast<uint8_t>(0xFFu & (seed >> (24u - 8u * byte)));
+    }
+    // Replace the state by its own sha256 hash. The output range is the
+    // state itself, so only as many hash bytes as fit into it are stored.
+    picosha2::hash256_one_by_one hasher;
+    hasher.process(state_.begin(), state_.end());
+    hasher.finish();
+    hasher.get_hash_bytes(state_.begin(), state_.end());
+    init_num_children();
+  }
+  int get_num_children() const { return num_children_; }
+  // Creates the child with the given index; it inherits q_ and b_.
+  node spawn_child_node(int index) {
+    return node(generate_child_state(index), q_, b_);
+  }
+};
+}
+}
+}
+#endif //COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
--- a/extern/benchmark_base/src/.gitkeep
+++ b/extern/benchmark_base/src/.gitkeep
--- a/extern/benchmark_base/src/fft.cpp
+++ b/extern/benchmark_base/src/fft.cpp
+#include "benchmark_base/fft.h"
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+// Builds the benchmark input: SIZE samples of a real-valued signal that is
+// the sum of sine waves with a fixed set of known frequencies.
+// Applying the fft to this time series should recover these frequencies.
+complex_vector generate_input() {
+  const std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+  fft::complex_vector data(SIZE);
+  for (int sample = 0; sample < SIZE; sample++) {
+    // Accumulate the contribution of every frequency for this sample;
+    // the imaginary part of the input stays zero.
+    double amplitude = 0.0;
+    for (auto frequency : known_frequencies) {
+      amplitude += sin(2 * M_PI * frequency * sample / SIZE);
+    }
+    data[sample] = std::complex<double>(amplitude, 0.0);
+  }
+  return data;
+}
+// Reorders the n elements starting at data so that the elements from even
+// positions form the first half and those from odd positions the second half.
+void divide(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+  complex_vector odd_elements(half);
+  for (int i = 0; i < half; i++) {
+    // Save the odd element before compacting the even one to the front.
+    // Position i is never read again by a later iteration (2 * i >= i).
+    odd_elements[i] = data[2 * i + 1];
+    data[i] = data[2 * i];
+  }
+  for (int i = 0; i < half; i++) {
+    data[half + i] = odd_elements[i];
+  }
+}
+// Butterfly step: merges the transformed even half (first n/2 elements)
+// and odd half (last n/2 elements) into the transform of all n elements.
+void combine(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+  for (int i = 0; i < half; i++) {
+    const std::complex<double> even = data[i];
+    const std::complex<double> odd = data[i + half];
+    // The "twiddle-factor" for position i. It could be cached, but the same
+    // 'base' algorithm is run parallel and serial, so recomputing it
+    // does not distort the performance comparison.
+    const std::complex<double> twiddle = exp(std::complex<double>(0, -2. * M_PI * i / n));
+    data[i] = even + twiddle * odd;
+    data[i + half] = even - twiddle * odd;
+  }
+}
+// Serial recursive fft over the n elements starting at data:
+// split into even/odd halves, transform each half, then merge them.
+// Fewer than two elements need no work.
+void conquer(complex_vector::iterator data, int n) {
+  if (n >= 2) {
+    const int half = n / 2;
+    divide(data, n);
+    conquer(data, half);
+    conquer(data + half, half);
+    combine(data, n);
+  }
+}
+}
+}
+}
--- a/extern/benchmark_base/src/sample_images.cpp.in
+++ b/extern/benchmark_base/src/sample_images.cpp.in
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+using namespace std;
+namespace comparison_benchmarks {
+    namespace base {
+        // Returns the paths of the sample images "0.jpg" .. "18.jpg" inside
+        // the sample_images directory (the directory prefix is filled in
+        // by CMake when this template is configured).
+        vector<string> get_sample_image_paths() {
+            const int num_images = 19;
+            const string directory = "@CMAKE_CURRENT_SOURCE_DIR@/sample_images/";
+            vector<string> result;
+            result.reserve(num_images);
+            for (int image = 0; image < num_images; image++) {
+                result.push_back(directory + to_string(image) + ".jpg");
+            }
+            return result;
+        }
+    }
+}
--- a/extern/benchmark_base/src/unbalanced.cpp
+++ b/extern/benchmark_base/src/unbalanced.cpp
+#include "benchmark_base/unbalanced.h"
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+// Derives the state of the child with the given index by hashing this
+// node's state followed by the four bytes of the index (sha256, cut to the
+// size of node_state).
+node_state node::generate_child_state(uint32_t index) {
+  node_state result;
+  picosha2::hash256_one_by_one hasher;
+  hasher.process(state_.begin(), state_.end());
+  // Feed the index as explicit bytes, least significant byte first, instead
+  // of hashing its in-memory representation. This yields the same bytes as
+  // before on little-endian hosts and makes the generated tree independent
+  // of the host byte order.
+  // NOTE(review): NUM_NODES was presumably determined on a little-endian
+  // host -- confirm.
+  const uint8_t index_bytes[4] = {
+      static_cast<uint8_t>(0xFFu & (index >> 0u)),
+      static_cast<uint8_t>(0xFFu & (index >> 8u)),
+      static_cast<uint8_t>(0xFFu & (index >> 16u)),
+      static_cast<uint8_t>(0xFFu & (index >> 24u))};
+  hasher.process(index_bytes, index_bytes + 4);
+  hasher.finish();
+  hasher.get_hash_bytes(result.begin(), result.end());
+  return result;
+}
+// Maps the last four bytes of the node state to a value in [0, 1].
+double node::get_state_random() {
+  // Assemble bytes 16..19 of the state into one 32 bit word,
+  // most significant byte first.
+  uint32_t word = 0;
+  for (int i = 16; i < 20; i++) {
+    word = (word << 8u) | static_cast<uint32_t>(state_[i]);
+  }
+  // Clear the top bit so the value is non-negative as a signed integer.
+  word &= 0x7fffffff;
+  const int32_t state_random_integer = static_cast<int32_t>(word);
+  return static_cast<double>(state_random_integer) / static_cast<double>(INT32_MAX);
+}
+}
+}
+}
--- a/extern/benchmark_runner/CMakeLists.txt
+++ b/extern/benchmark_runner/CMakeLists.txt
+add_library(benchmark_runner INTERFACE)
+target_include_directories(benchmark_runner INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
--- a/extern/benchmark_runner/benchmark_runner.h
+++ b/extern/benchmark_runner/benchmark_runner.h
+#ifndef BENCHMARK_RUNNER_H
+#define BENCHMARK_RUNNER_H
+#include <string>
+#include <cstdlib>
+#include <vector>
+#include <chrono>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <bits/stdc++.h>
+using namespace std;
+/**
+ * Small helper for timing repeated runs of a benchmark. Runtimes are
+ * collected in microseconds and appended to a CSV file by commit_results().
+ */
+class benchmark_runner {
+ private:
+  std::string csv_path_;
+  std::string csv_name_;
+  std::chrono::steady_clock::time_point last_start_time_;
+  // Runtimes (microseconds) recorded since the last commit_results().
+  std::vector<long> times_;
+  // Prints the average of the recorded runtimes to stdout.
+  void print_statistics() {
+    if (times_.empty()) {
+      // Nothing was measured; the average below would divide by zero.
+      std::cout << "Average Runtime (us): n/a (no iterations recorded)" << std::endl;
+      return;
+    }
+    long time_sum = std::accumulate(times_.begin(), times_.end(), 0l);
+    std::cout << "Average Runtime (us): " << (time_sum / times_.size()) << std::endl;
+  }
+  // Returns true if the file can be opened for reading.
+  inline bool file_exists(const std::string &name) {
+    std::ifstream f(name);
+    return f.good();
+  }
+  // Wraps text in single quotes for the shell; an embedded single quote is
+  // written as '\'' so the text is always passed on as one literal word.
+  static std::string shell_quote(const std::string &text) {
+    std::string quoted = "'";
+    for (char c : text) {
+      if (c == '\'') {
+        quoted += "'\\''";
+      } else {
+        quoted += c;
+      }
+    }
+    quoted += "'";
+    return quoted;
+  }
+ public:
+  // Remembers the output location and creates the directory csv_path
+  // (including parents). Exits the process with status 1 if that fails.
+  benchmark_runner(std::string csv_path, std::string csv_name) : csv_path_{std::move(csv_path)},
+                                                                 csv_name_{std::move(csv_name)},
+                                                                 times_{} {
+    // The path is quoted so that spaces or shell metacharacters in it
+    // cannot change the command that is executed.
+    std::string command = "mkdir -p -- " + shell_quote(csv_path_);
+    int res = std::system(command.c_str());
+    if (res) {
+      std::cout << "Error while creating directory!" << std::endl;
+      std::exit(1);
+    }
+  }
+  // Reads <output_directory> (argv[1]) and <num_threads> (argv[2]).
+  // Prints a usage message and exits with status 1 if either is missing.
+  static void read_args(int argc, char **argv, int &num_threads, std::string &path) {
+    if (argc < 3) {
+      std::cout << "Must Specifiy concurrency and output directory! (usage: `benchmark <output_directory> <num_threads>`)"
+                << std::endl;
+      std::exit(1);
+    }
+    path = argv[1];
+    // atoi yields 0 for non-numeric input; the value is not validated here.
+    num_threads = std::atoi(argv[2]);
+  }
+  // Marks the start of one timed iteration.
+  void start_iteration() {
+    last_start_time_ = std::chrono::steady_clock::now();
+  }
+  // Records the time since the last start_iteration() in microseconds.
+  void end_iteration() {
+    auto end_time = std::chrono::steady_clock::now();
+    long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - last_start_time_).count();
+    times_.emplace_back(time);
+  }
+  // Runs f warmup_count times without timing, then count times with timing.
+  void run_iterations(int count, std::function<void(void)> f, int warmup_count) {
+    for (int i = 0; i < warmup_count; i++) {
+      f();
+    }
+    for (int i = 0; i < count; i++) {
+      start_iteration();
+      f();
+      end_iteration();
+    }
+  }
+  // Appends the recorded runtimes to the CSV file (writing the header line
+  // first if the file does not exist yet) and forgets them afterwards.
+  void commit_results(bool print_stats) {
+    if (print_stats) {
+      print_statistics();
+    }
+    // NOTE(review): directory and file name are concatenated without a
+    // separator, so csv_path presumably has to end with '/' -- confirm
+    // against the callers.
+    std::string full_filename = csv_path_ + csv_name_;
+    bool write_header = !file_exists(full_filename);
+    { // Scope for output file
+      std::ofstream o(full_filename, std::fstream::out | std::fstream::app);
+      if (!o) {
+        // Report the failure instead of silently dropping the results.
+        std::cout << "Error while opening result file " << full_filename << "!" << std::endl;
+      } else {
+        // '\n' instead of endl: no flush per row, the file is flushed on close.
+        if (write_header) {
+          o << "runtime_us" << '\n';
+        }
+        for (auto time : times_) {
+          o << time << '\n';
+        }
+      }
+    } // End Scope for output file
+    times_.clear();
+  }
+};
+#endif //BENCHMARK_RUNNER_H
--- a/extern/picosha2/CMakeLists.txt
+++ b/extern/picosha2/CMakeLists.txt
+add_library(picosha2 INTERFACE)
+target_include_directories(picosha2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
\ No newline at end of file
--- a/extern/picosha2/LICENSE
+++ b/extern/picosha2/LICENSE
+MIT License
+Copyright (c) 2017 okdshin
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
--- a/extern/picosha2/picosha2.h
+++ b/extern/picosha2/picosha2.h
+/*
+The MIT License (MIT)
+Copyright (C) 2017 okdshin
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef PICOSHA2_H
+#define PICOSHA2_H
+// picosha2:20140213
+#ifndef PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR
+#define PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR \
+    1048576  //=1024*1024: default is 1MB memory
+#endif
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <sstream>
+#include <vector>
+#include <fstream>
+namespace picosha2 {
+typedef unsigned long word_t;
+typedef unsigned char byte_t;
+static const size_t k_digest_size = 32;
+namespace detail {
+inline byte_t mask_8bit(byte_t x) { return x & 0xff; }
+inline word_t mask_32bit(word_t x) { return x & 0xffffffff; }
+const word_t add_constant[64] = {
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
+const word_t initial_message_digest[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372,
+                                          0xa54ff53a, 0x510e527f, 0x9b05688c,
+                                          0x1f83d9ab, 0x5be0cd19};
+inline word_t ch(word_t x, word_t y, word_t z) { return (x & y) ^ ((~x) & z); }
+inline word_t maj(word_t x, word_t y, word_t z) {
+    return (x & y) ^ (x & z) ^ (y & z);
+}
+inline word_t rotr(word_t x, std::size_t n) {
+    assert(n < 32);
+    return mask_32bit((x >> n) | (x << (32 - n)));
+}
+inline word_t bsig0(word_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }
+inline word_t bsig1(word_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }
+inline word_t shr(word_t x, std::size_t n) {
+    assert(n < 32);
+    return x >> n;
+}
+inline word_t ssig0(word_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }
+inline word_t ssig1(word_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }
+template <typename RaIter1, typename RaIter2>
+void hash256_block(RaIter1 message_digest, RaIter2 first, RaIter2 last) {
+    assert(first + 64 == last);
+    static_cast<void>(last);  // for avoiding unused-variable warning
+    word_t w[64];
+    std::fill(w, w + 64, 0);
+    for (std::size_t i = 0; i < 16; ++i) {
+        w[i] = (static_cast<word_t>(mask_8bit(*(first + i * 4))) << 24) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 1))) << 16) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 2))) << 8) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 3))));
+    }
+    for (std::size_t i = 16; i < 64; ++i) {
+        w[i] = mask_32bit(ssig1(w[i - 2]) + w[i - 7] + ssig0(w[i - 15]) +
+                          w[i - 16]);
+    }
+    word_t a = *message_digest;
+    word_t b = *(message_digest + 1);
+    word_t c = *(message_digest + 2);
+    word_t d = *(message_digest + 3);
+    word_t e = *(message_digest + 4);
+    word_t f = *(message_digest + 5);
+    word_t g = *(message_digest + 6);
+    word_t h = *(message_digest + 7);
+    for (std::size_t i = 0; i < 64; ++i) {
+        word_t temp1 = h + bsig1(e) + ch(e, f, g) + add_constant[i] + w[i];
+        word_t temp2 = bsig0(a) + maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = mask_32bit(d + temp1);
+        d = c;
+        c = b;
+        b = a;
+        a = mask_32bit(temp1 + temp2);
+    }
+    *message_digest += a;
+    *(message_digest + 1) += b;
+    *(message_digest + 2) += c;
+    *(message_digest + 3) += d;
+    *(message_digest + 4) += e;
+    *(message_digest + 5) += f;
+    *(message_digest + 6) += g;
+    *(message_digest + 7) += h;
+    for (std::size_t i = 0; i < 8; ++i) {
+        *(message_digest + i) = mask_32bit(*(message_digest + i));
+    }
+}
+}  // namespace detail
+template <typename InIter>
+void output_hex(InIter first, InIter last, std::ostream& os) {
+    os.setf(std::ios::hex, std::ios::basefield);
+    while (first != last) {
+        os.width(2);
+        os.fill('0');
+        os << static_cast<unsigned int>(*first);
+        ++first;
+    }
+    os.setf(std::ios::dec, std::ios::basefield);
+}
+template <typename InIter>
+void bytes_to_hex_string(InIter first, InIter last, std::string& hex_str) {
+    std::ostringstream oss;
+    output_hex(first, last, oss);
+    hex_str.assign(oss.str());
+}
+template <typename InContainer>
+void bytes_to_hex_string(const InContainer& bytes, std::string& hex_str) {
+    bytes_to_hex_string(bytes.begin(), bytes.end(), hex_str);
+}
+template <typename InIter>
+std::string bytes_to_hex_string(InIter first, InIter last) {
+    std::string hex_str;
+    bytes_to_hex_string(first, last, hex_str);
+    return hex_str;
+}
+template <typename InContainer>
+std::string bytes_to_hex_string(const InContainer& bytes) {
+    std::string hex_str;
+    bytes_to_hex_string(bytes, hex_str);
+    return hex_str;
+}
+class hash256_one_by_one {
+   public:
+    hash256_one_by_one() { init(); }
+    void init() {
+        buffer_.clear();
+        std::fill(data_length_digits_, data_length_digits_ + 4, 0);
+        std::copy(detail::initial_message_digest,
+                  detail::initial_message_digest + 8, h_);
+    }
+    template <typename RaIter>
+    void process(RaIter first, RaIter last) {
+        add_to_data_length(static_cast<word_t>(std::distance(first, last)));
+        std::copy(first, last, std::back_inserter(buffer_));
+        std::size_t i = 0;
+        for (; i + 64 <= buffer_.size(); i += 64) {
+            detail::hash256_block(h_, buffer_.begin() + i,
+                                  buffer_.begin() + i + 64);
+        }
+        buffer_.erase(buffer_.begin(), buffer_.begin() + i);
+    }
+    void finish() {
+        byte_t temp[64];
+        std::fill(temp, temp + 64, 0);
+        std::size_t remains = buffer_.size();
+        std::copy(buffer_.begin(), buffer_.end(), temp);
+        temp[remains] = 0x80;
+        if (remains > 55) {
+            std::fill(temp + remains + 1, temp + 64, 0);
+            detail::hash256_block(h_, temp, temp + 64);
+            std::fill(temp, temp + 64 - 4, 0);
+        } else {
+            std::fill(temp + remains + 1, temp + 64 - 4, 0);
+        }
+        write_data_bit_length(&(temp[56]));
+        detail::hash256_block(h_, temp, temp + 64);
+    }
+    template <typename OutIter>
+    void get_hash_bytes(OutIter first, OutIter last) const {
+        for (const word_t* iter = h_; iter != h_ + 8; ++iter) {
+            for (std::size_t i = 0; i < 4 && first != last; ++i) {
+                *(first++) = detail::mask_8bit(
+                    static_cast<byte_t>((*iter >> (24 - 8 * i))));
+            }
+        }
+    }
+   private:
+    void add_to_data_length(word_t n) {
+        word_t carry = 0;
+        data_length_digits_[0] += n;
+        for (std::size_t i = 0; i < 4; ++i) {
+            data_length_digits_[i] += carry;
+            if (data_length_digits_[i] >= 65536u) {
+                carry = data_length_digits_[i] >> 16;
+                data_length_digits_[i] &= 65535u;
+            } else {
+                break;
+            }
+        }
+    }
+    void write_data_bit_length(byte_t* begin) {
+        word_t data_bit_length_digits[4];
+        std::copy(data_length_digits_, data_length_digits_ + 4,
+                  data_bit_length_digits);
+        // convert byte length to bit length (multiply 8 or shift 3 times left)
+        word_t carry = 0;
+        for (std::size_t i = 0; i < 4; ++i) {
+            word_t before_val = data_bit_length_digits[i];
+            data_bit_length_digits[i] <<= 3;
+            data_bit_length_digits[i] |= carry;
+            data_bit_length_digits[i] &= 65535u;
+            carry = (before_val >> (16 - 3)) & 65535u;
+        }
+        // write data_bit_length
+        for (int i = 3; i >= 0; --i) {
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i] >> 8);
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i]);
+        }
+    }
+    std::vector<byte_t> buffer_;
+    word_t data_length_digits_[4];  // as 64bit integer (16bit x 4 integer)
+    word_t h_[8];
+};
+inline void get_hash_hex_string(const hash256_one_by_one& hasher,
+                                std::string& hex_str) {
+    byte_t hash[k_digest_size];
+    hasher.get_hash_bytes(hash, hash + k_digest_size);
+    return bytes_to_hex_string(hash, hash + k_digest_size, hex_str);
+}
+inline std::string get_hash_hex_string(const hash256_one_by_one& hasher) {
+    std::string hex_str;
+    get_hash_hex_string(hasher, hex_str);
+    return hex_str;
+}
+namespace impl {
+template <typename RaIter, typename OutIter>
+void hash256_impl(RaIter first, RaIter last, OutIter first2, OutIter last2, int,
+                  std::random_access_iterator_tag) {
+    hash256_one_by_one hasher;
+    // hasher.init();
+    hasher.process(first, last);
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+template <typename InputIter, typename OutIter>
+void hash256_impl(InputIter first, InputIter last, OutIter first2,
+                  OutIter last2, int buffer_size, std::input_iterator_tag) {
+    std::vector<byte_t> buffer(buffer_size);
+    hash256_one_by_one hasher;
+    // hasher.init();
+    while (first != last) {
+        int size = buffer_size;
+        for (int i = 0; i != buffer_size; ++i, ++first) {
+            if (first == last) {
+                size = i;
+                break;
+            }
+            buffer[i] = *first;
+        }
+        hasher.process(buffer.begin(), buffer.begin() + size);
+    }
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+}
+template <typename InIter, typename OutIter>
+void hash256(InIter first, InIter last, OutIter first2, OutIter last2,
+             int buffer_size = PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR) {
+    picosha2::impl::hash256_impl(
+        first, last, first2, last2, buffer_size,
+        typename std::iterator_traits<InIter>::iterator_category());
+}
+template <typename InIter, typename OutContainer>
+void hash256(InIter first, InIter last, OutContainer& dst) {
+    hash256(first, last, dst.begin(), dst.end());
+}
+template <typename InContainer, typename OutIter>
+void hash256(const InContainer& src, OutIter first, OutIter last) {
+    hash256(src.begin(), src.end(), first, last);
+}
+template <typename InContainer, typename OutContainer>
+void hash256(const InContainer& src, OutContainer& dst) {
+    hash256(src.begin(), src.end(), dst.begin(), dst.end());
+}
+template <typename InIter>
+void hash256_hex_string(InIter first, InIter last, std::string& hex_str) {
+    byte_t hashed[k_digest_size];
+    hash256(first, last, hashed, hashed + k_digest_size);
+    std::ostringstream oss;
+    output_hex(hashed, hashed + k_digest_size, oss);
+    hex_str.assign(oss.str());
+}
+template <typename InIter>
+std::string hash256_hex_string(InIter first, InIter last) {
+    std::string hex_str;
+    hash256_hex_string(first, last, hex_str);
+    return hex_str;
+}
+inline void hash256_hex_string(const std::string& src, std::string& hex_str) {
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+template <typename InContainer>
+void hash256_hex_string(const InContainer& src, std::string& hex_str) {
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+template <typename InContainer>
+std::string hash256_hex_string(const InContainer& src) {
+    return hash256_hex_string(src.begin(), src.end());
+}
+template<typename OutIter>void hash256(std::ifstream& f, OutIter first, OutIter last){
+    hash256(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(), first,last);
+}
+}// namespace picosha2
+#endif  // PICOSHA2_H
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -29,12 +29,12 @@ pls::internal::scheduling::parallel_result<int> for_each(const RandomIt first,
    // Cut in half recursively
    const long middle_index = num_elements / 2;
-    return scheduler::par([first, middle_index, last, &function, min_elements] {
+    return scheduler::par([first, middle_index, last, function, min_elements] {
      return internal::for_each(first,
                                first + middle_index,
                                function,
                                min_elements);
-    }, [first, middle_index, last, &function, min_elements] {
+    }, [first, middle_index, last, function, min_elements] {
      return internal::for_each(first + middle_index,
                                last,
                                function,

--- a/lib/pls/include/pls/internal/helpers/range.h
+++ b/lib/pls/include/pls/internal/helpers/range.h
@@ -112,7 +112,7 @@ struct basic_range {
        : r(rhs.r), index(rhs.index) {}
    const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}
    const_iterator_impl &operator=(const const_iterator_impl &rhs) {
      r = rhs.r;
@@ -121,7 +121,7 @@ struct basic_range {
    }
    bool operator==(const const_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;
    }
    bool operator!=(const const_iterator_impl &rhs) const {
@@ -145,7 +145,7 @@ struct basic_range {
    }
    value_type operator*() const {
-      return r->m_first_element + r->m_step * index;
+      return r.m_first_element + r.m_step * index;
    }
    // operator->
@@ -212,11 +212,11 @@ struct basic_range {
    value_type operator[](difference_type offset) const {
      size_type new_index = index + offset;
-      return r->m_first_element + r->m_step * new_index;
+      return r.m_first_element + r.m_step * new_index;
    }
   private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
    size_type index;
  };
@@ -236,7 +236,7 @@ struct basic_range {
        : r(rhs.r), index(rhs.index) {}
    const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}
    const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
      r = rhs.r;
@@ -245,7 +245,7 @@ struct basic_range {
    }
    bool operator==(const const_reverse_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;
    }
    bool operator!=(const const_reverse_iterator_impl &rhs) const {
@@ -270,8 +270,8 @@ struct basic_range {
    value_type operator*() const {
      size_type reverse_index
-          = (r->m_element_count - 1) - index;
+          = (r.m_element_count - 1) - index;
-      return r->m_first_element + r->m_step * reverse_index;
+      return r.m_first_element + r.m_step * reverse_index;
    }
    // operator->
@@ -338,12 +338,12 @@ struct basic_range {
    value_type operator[](difference_type offset) const {
      size_type new_reverse_index
-          = (r->m_element_count - 1) - (index + offset);
+          = (r.m_element_count - 1) - (index + offset);
-      return r->m_first_element + r->m_step * new_reverse_index;
+      return r.m_first_element + r.m_step * new_reverse_index;
    }
   private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
    size_type index;
  };