From 79ac0243346c8c488048f7597b93182d3793fd4d Mon Sep 17 00:00:00 2001
From: FritzFlorian <flo.fritz@t-online.de>
Date: Fri, 20 Dec 2019 22:04:26 +0100
Subject: [PATCH] Add two 'standardized' benchmarks.

---
 CMakeLists.txt                                                 |   3 +++
 app/benchmark_fft/CMakeLists.txt                               |  10 +++++-----
 app/benchmark_fft/main.cpp                                     | 147 ++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------------------------------------
 app/benchmark_matrix/CMakeLists.txt                            |   6 +++---
 app/benchmark_matrix/main.cpp                                  | 126 ++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------
 app/benchmark_unbalanced/main.cpp                              |  14 +++++++-------
 app/playground/main.cpp                                        |  25 +++++++++++++++----------
 extern/benchmark_base/CMakeLists.txt                           |  20 ++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/.gitkeep          |   0
 extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt |  23 +++++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/fft.h             |  29 +++++++++++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/heat.h            | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/matrix.h          |  67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/range.h           | 608 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_base/include/benchmark_base/unbalanced.h      |  97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_base/src/.gitkeep                             |   0
 extern/benchmark_base/src/fft.cpp                              |  63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/benchmark_base/src/sample_images.cpp.in                 |  23 +++++++++++++++++++++++
 extern/benchmark_base/src/unbalanced.cpp                       |  34 ++++++++++++++++++++++++++++++++++
 extern/benchmark_runner/CMakeLists.txt                         |   2 ++
 extern/benchmark_runner/benchmark_runner.h                     | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 extern/picosha2/CMakeLists.txt                                 |   2 ++
 extern/picosha2/LICENSE                                        |  21 +++++++++++++++++++++
 extern/picosha2/picosha2.h                                     | 377 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/pls/include/pls/algorithms/for_each_impl.h                 |   4 ++--
 lib/pls/include/pls/internal/helpers/range.h                   |  24 ++++++++++++------------
 26 files changed, 1726 insertions(+), 218 deletions(-)
 create mode 100644 extern/benchmark_base/CMakeLists.txt
 create mode 100644 extern/benchmark_base/include/benchmark_base/.gitkeep
 create mode 100644 extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
 create mode 100644 extern/benchmark_base/include/benchmark_base/fft.h
 create mode 100644 extern/benchmark_base/include/benchmark_base/heat.h
 create mode 100644 extern/benchmark_base/include/benchmark_base/matrix.h
 create mode 100644 extern/benchmark_base/include/benchmark_base/range.h
 create mode 100644 extern/benchmark_base/include/benchmark_base/unbalanced.h
 create mode 100644 extern/benchmark_base/src/.gitkeep
 create mode 100644 extern/benchmark_base/src/fft.cpp
 create mode 100644 extern/benchmark_base/src/sample_images.cpp.in
 create mode 100644 extern/benchmark_base/src/unbalanced.cpp
 create mode 100644 extern/benchmark_runner/CMakeLists.txt
 create mode 100644 extern/benchmark_runner/benchmark_runner.h
 create mode 100644 extern/picosha2/CMakeLists.txt
 create mode 100644 extern/picosha2/LICENSE
 create mode 100644 extern/picosha2/picosha2.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c866791..ff7f6c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,9 @@ list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Each library has an own CMakeLists.txt that should make it avaliabale as a library target,
 # thus allowing one to include it as any cmake dependency later on.
 add_subdirectory(extern/catch2)
+add_subdirectory(extern/picosha2)
+add_subdirectory(extern/benchmark_base)
+add_subdirectory(extern/benchmark_runner)
 
 # Include all internal subprojects (library, examples, testing).
 add_subdirectory(lib/pls)
diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
index 41591e5..4c7f3fe 100644
--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -1,5 +1,5 @@
-add_executable(benchmark_fft main.cpp)
-target_link_libraries(benchmark_fft pls)
-if(EASY_PROFILER)
-    target_link_libraries(benchmark_fft easy_profiler)
-endif()
+add_executable(benchmark_fft_pls_v2 main.cpp)
+target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
+if (EASY_PROFILER)
+    target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
+endif ()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 68a587c..953c669 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -2,142 +2,91 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/internal/helpers/profiler.h"
+
 using namespace pls::internal::scheduling;
 
 #include <iostream>
 #include <complex>
 #include <vector>
-#include <atomic>
-
-static constexpr int CUTOFF = 16;
-static constexpr int INPUT_SIZE = 16384;
-typedef std::vector<std::complex<double>> complex_vector;
-
-void divide(complex_vector::iterator data, int n) {
-  complex_vector tmp_odd_elements(n / 2);
-  for (int i = 0; i < n / 2; i++) {
-    tmp_odd_elements[i] = data[i * 2 + 1];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i] = data[i * 2];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i + n / 2] = tmp_odd_elements[i];
-  }
-}
 
-void combine(complex_vector::iterator data, int n) {
-  for (int i = 0; i < n / 2; i++) {
-    std::complex<double> even = data[i];
-    std::complex<double> odd = data[i + n / 2];
-
-    // w is the "twiddle-factor".
-    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
-    // so it won't impact the performance comparison.
-    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
-
-    data[i] = even + w * odd;
-    data[i + n / 2] = even - w * odd;
-  }
-}
-
-void fft_normal(complex_vector::iterator data, int n) {
-  if (n < 2) {
-    return;
-  }
+#include "benchmark_runner.h"
+#include "benchmark_base/fft.h"
 
-  divide(data, n);
-  fft_normal(data, n / 2);
-  fft_normal(data + n / 2, n / 2);
-  combine(data, n);
-}
+using namespace comparison_benchmarks::base;
 
-parallel_result<short> fft(complex_vector::iterator data, int n) {
+parallel_result<short> conquer(fft::complex_vector::iterator data, int n) {
   if (n < 2) {
     return parallel_result<short>{0};
   }
 
-  divide(data, n);
-  if (n <= CUTOFF) {
-    fft_normal(data, n / 2);
-    fft_normal(data + n / 2, n / 2);
-    combine(data, n);
+  fft::divide(data, n);
+  if (n <= fft::RECURSIVE_CUTOFF) {
+    fft::conquer(data, n / 2);
+    fft::conquer(data + n / 2, n / 2);
+    fft::combine(data, n);
     return parallel_result<short>{0};
   } else {
     return scheduler::par([=]() {
-      return fft(data, n / 2);
+      return conquer(data, n / 2);
     }, [=]() {
-      return fft(data + n / 2, n / 2);
+      return conquer(data + n / 2, n / 2);
     }).then([=](int, int) {
-      combine(data, n);
+      fft::combine(data, n);
       return parallel_result<short>{0};
     });
   }
 }
 
-complex_vector prepare_input(int input_size) {
-  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
-  complex_vector data(input_size);
-
-  // Set our input data to match a time series of the known_frequencies.
-  // When applying fft to this time-series we should find these frequencies.
-  for (int i = 0; i < input_size; i++) {
-    data[i] = std::complex<double>(0.0, 0.0);
-    for (auto frequencie : known_frequencies) {
-      data[i] += sin(2 * M_PI * frequencie * i / input_size);
-    }
-  }
-
-  return data;
-}
+constexpr int MAX_NUM_THREADS = 8;
+constexpr int MAX_NUM_TASKS = 64;
+constexpr int MAX_NUM_CONTS = 64;
+constexpr int MAX_CONT_SIZE = 256;
 
-static constexpr int NUM_ITERATIONS = 500;
-constexpr size_t NUM_THREADS = 2;
+int main(int argc, char **argv) {
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);
 
-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};
 
-constexpr size_t NUM_CONTS = 128;
-constexpr size_t MAX_CONT_SIZE = 512;
+  fft::complex_vector data = fft::generate_input();
 
-int main() {
-  PROFILE_ENABLE;
-  complex_vector initial_input = prepare_input(INPUT_SIZE);
-
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                           MAX_CONT_SIZE> static_scheduler_memory;
 
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
 
-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    complex_vector input_2(initial_input);
+  for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
     scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
       return scheduler::par([&]() {
-        return fft(input_2.begin(), INPUT_SIZE);
+        return conquer(data.begin(), fft::SIZE);
       }, []() {
-        return parallel_result<int>{0};
-      }).then([](int, int) {
+        return parallel_result<short>{0};
+      }).then([&](short, short) {
         return parallel_result<int>{0};
       });
     });
-    PROFILE_LOCK("DONE");
   }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-  PROFILE_SAVE("test_profile.prof");
-
-  start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    complex_vector input_1(initial_input);
-    fft_normal(input_1.begin(), INPUT_SIZE);
+
+  for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
+    scheduler.perform_work([&]() {
+      runner.start_iteration();
+
+      return scheduler::par([&]() {
+        return conquer(data.begin(), fft::SIZE);
+      }, []() {
+        return parallel_result<short>{0};
+      }).then([&](short, short) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
   }
-  end = std::chrono::steady_clock::now();
-  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
+  runner.commit_results(true);
 
   return 0;
 }
diff --git a/app/benchmark_matrix/CMakeLists.txt b/app/benchmark_matrix/CMakeLists.txt
index 0245a5b..67c9b09 100644
--- a/app/benchmark_matrix/CMakeLists.txt
+++ b/app/benchmark_matrix/CMakeLists.txt
@@ -1,5 +1,5 @@
-add_executable(benchmark_matrix main.cpp)
-target_link_libraries(benchmark_matrix pls)
+add_executable(benchmark_matrix_pls_v2 main.cpp)
+target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base)
 if (EASY_PROFILER)
-    target_link_libraries(benchmark_matrix easy_profiler)
+    target_link_libraries(benchmark_matrix_pls_v2 easy_profiler)
 endif ()
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index 9a14a57..ae4dcf0 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -2,112 +2,78 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/algorithms/for_each.h"
+
 using namespace pls::internal::scheduling;
 
-#include <chrono>
+#include "benchmark_runner.h"
+#include "benchmark_base/matrix.h"
 
-const int MATRIX_SIZE = 128;
+using namespace comparison_benchmarks::base;
 
 template<typename T, int SIZE>
-class matrix {
+class pls_matrix : public matrix::matrix<T, SIZE> {
  public:
-  T data[SIZE][SIZE];
-
-  explicit matrix(T i = 1) {
-    std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i);
-  }
+  pls_matrix() : matrix::matrix<T, SIZE>() {}
 
-  parallel_result<int> multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
-    return pls::algorithm::for_each_range(0, SIZE, [&](int i) {
+  parallel_result<int> pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
+    return pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
       this->multiply_column(i, a, b);
     });
   }
-
- private:
-  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
-    for (int j = 0; j < SIZE; ++j) {
-      data[i][j] = 0;
-    }
-    for (int k = 0; k < SIZE; ++k) {
-      for (int j = 0; j < SIZE; ++j) {
-        data[i][j] += a.data[i][k] * b.data[k][j];
-      }
-    }
-  }
 };
 
-void fill_with_data(matrix<double, MATRIX_SIZE> &a, matrix<double, MATRIX_SIZE> &b) {
-  // Fill in some data...
-  for (int i = 0; i < MATRIX_SIZE; i++) {
-    for (int j = 0; j < MATRIX_SIZE; j++) {
-      a.data[i][j] = i;
-      b.data[i][j] = j;
-    }
-  }
-}
+constexpr size_t MAX_NUM_THREADS = 8;
+constexpr size_t MAX_NUM_TASKS = 32;
+constexpr size_t MAX_NUM_CONTS = 32;
+constexpr size_t MAX_CONT_SIZE = 512;
 
-static constexpr int NUM_ITERATIONS = 1000;
-constexpr size_t NUM_THREADS = 3;
+int main(int argc, char **argv) {
+  int num_threads;
+  string directory;
+  benchmark_runner::read_args(argc, argv, num_threads, directory);
 
-constexpr size_t NUM_TASKS = 128;
+  string test_name = to_string(num_threads) + ".csv";
+  string full_directory = directory + "/PLS_v2/";
+  benchmark_runner runner{full_directory, test_name};
 
-constexpr size_t NUM_CONTS = 128;
-constexpr size_t MAX_CONT_SIZE = 512;
+  pls_matrix<double, matrix::MATRIX_SIZE> a;
+  pls_matrix<double, matrix::MATRIX_SIZE> b;
+  pls_matrix<double, matrix::MATRIX_SIZE> result;
 
-int main() {
-  PROFILE_ENABLE
-  matrix<double, MATRIX_SIZE> a;
-  matrix<double, MATRIX_SIZE> b;
-  matrix<double, MATRIX_SIZE> result;
-  fill_with_data(a, b);
-
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                           MAX_CONT_SIZE> static_scheduler_memory;
 
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
+
+  for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {
 
-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
     scheduler.perform_work([&]() {
-      PROFILE_MAIN_THREAD;
       return scheduler::par([&]() {
-        return result.multiply(a, b);
+        return result.pls_multiply(a, b);
       }, []() {
         return parallel_result<int>{0};
-      }).then([](int, int) {
+      }).then([&](int, int) {
         return parallel_result<int>{0};
       });
     });
   }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Framework:  " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-}
 
-//int main() {
-//  PROFILE_ENABLE
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
-//  pls::scheduler scheduler{&my_scheduler_memory, 4};
-//
-//  matrix<double, MATRIX_SIZE> a;
-//  matrix<double, MATRIX_SIZE> b;
-//  matrix<double, MATRIX_SIZE> result;
-//  fill_with_data(a, b);
-//
-//  scheduler.perform_work([&] {
-//    auto start_time = std::chrono::high_resolution_clock::now();
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10000; i++) {
-//      PROFILE_WORK_BLOCK("Top Level")
-//      result.multiply(a, b);
-//    }
-//    auto end_time = std::chrono::high_resolution_clock::now();
-//    long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
-//    std::cout << "Runtime: " << time << "us" << std::endl;
-//  });
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
+  for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
+    scheduler.perform_work([&]() {
+      runner.start_iteration();
 
+      return scheduler::par([&]() {
+        return result.pls_multiply(a, b);
+      }, []() {
+        return parallel_result<int>{0};
+      }).then([&](int, int) {
+        runner.end_iteration();
+        return parallel_result<int>{0};
+      });
+    });
+  }
+  runner.commit_results(true);
+
+}
diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp
index 2753b8c..446fe15 100644
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -51,22 +51,22 @@ parallel_result<int> unbalanced_tree_search(int seed, int root_children, double 
   return result;
 }
 
-constexpr size_t NUM_THREADS = 5;
+constexpr size_t MAX_NUM_THREADS = 5;
 
-constexpr size_t NUM_TASKS = 128;
+constexpr size_t MAX_NUM_TASKS = 128;
 
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 512;
 
 volatile int result;
 int main() {
   PROFILE_ENABLE
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                           MAX_CONT_SIZE> static_scheduler_memory;
 
-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
 
   scheduler.perform_work([&]() {
     return scheduler::par([&]() {
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index 7c353ae..d7d0c07 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -8,12 +8,12 @@
 
 using namespace pls::internal;
 
-constexpr size_t NUM_THREADS = 4;
+constexpr size_t MAX_NUM_THREADS = 1;
 
-constexpr size_t NUM_TASKS = 128;
-static constexpr int NUM_ITERATIONS = 100;
+constexpr size_t MAX_NUM_TASKS = 128;
+static constexpr int NUM_ITERATIONS = 10;
 
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 256;
 
 int fib_normal(int n) {
@@ -29,8 +29,13 @@ int fib_normal(int n) {
 }
 
 scheduling::parallel_result<int> fib(int n) {
-  if (n <= 10) {
-    return fib_normal(n);
+  pls::variable<int> i;
+  pls::array<int> a{10};
+  if (n == 0) {
+    return 0;
+  }
+  if (n == 1) {
+    return 1;
   }
 
   return scheduling::scheduler::par([=]() {
@@ -45,12 +50,12 @@ scheduling::parallel_result<int> fib(int n) {
 static volatile int result;
 int main() {
   PROFILE_ENABLE;
-  scheduling::static_scheduler_memory<NUM_THREADS,
-                                      NUM_TASKS,
-                                      NUM_CONTS,
+  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
+                                      MAX_NUM_TASKS,
+                                      MAX_NUM_CONTS,
                                       MAX_CONT_SIZE> static_scheduler_memory;
 
-  scheduling::scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
 
   auto start = std::chrono::steady_clock::now();
   for (int i = 0; i < NUM_ITERATIONS; i++) {
diff --git a/extern/benchmark_base/CMakeLists.txt b/extern/benchmark_base/CMakeLists.txt
new file mode 100644
index 0000000..007519f
--- /dev/null
+++ b/extern/benchmark_base/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Configuration and common algorithm pieces for benchmarks
+configure_file(src/sample_images.cpp.in sample_images.cpp)
+
+add_library(benchmark_base STATIC
+        ${CMAKE_CURRENT_BINARY_DIR}/sample_images.cpp
+        src/fft.cpp include/benchmark_base/fft.h
+        include/benchmark_base/heat.h
+        include/benchmark_base/matrix.h
+        include/benchmark_base/unbalanced.h src/unbalanced.cpp
+        include/benchmark_base/range.h)
+
+target_include_directories(benchmark_base
+        PUBLIC
+        $<INSTALL_INTERFACE:include>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+        PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+        )
+
+target_link_libraries(benchmark_base picosha2)
diff --git a/extern/benchmark_base/include/benchmark_base/.gitkeep b/extern/benchmark_base/include/benchmark_base/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/.gitkeep
diff --git a/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt b/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/RANGE_LICENSE.txt
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/extern/benchmark_base/include/benchmark_base/fft.h b/extern/benchmark_base/include/benchmark_base/fft.h
new file mode 100644
index 0000000..f778b43
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/fft.h
@@ -0,0 +1,29 @@
+#ifndef COMPARISON_BENCHMARKS_BASE_FFT_H
+#define COMPARISON_BENCHMARKS_BASE_FFT_H
+
+#include <complex>
+#include <string>
+#include <vector>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+
+const int SIZE = 8192;  // element count the benchmark driver passes to the transform as n
+const int NUM_ITERATIONS = 1000;  // measured repetitions (wrapped in start/end_iteration by the driver)
+const int NUM_WARMUP_ITERATIONS = 100;  // unmeasured repetitions executed before the measured ones
+
+const int RECURSIVE_CUTOFF = 32;  // the parallel driver stops spawning tasks once n <= this value
+typedef std::vector<std::complex<double>> complex_vector;
+
+complex_vector generate_input();  // builds the benchmark input; defined in src/fft.cpp
+
+void divide(complex_vector::iterator data, int n);  // presumably the even/odd reordering step - confirm in src/fft.cpp
+void conquer(complex_vector::iterator data, int n);  // sequential transform of n elements starting at data
+void combine(complex_vector::iterator data, int n);  // presumably the butterfly/merge step - confirm in src/fft.cpp
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_FFT_H
diff --git a/extern/benchmark_base/include/benchmark_base/heat.h b/extern/benchmark_base/include/benchmark_base/heat.h
new file mode 100644
index 0000000..007d643
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/heat.h
@@ -0,0 +1,117 @@
+
+#ifndef COMPARISON_BENCHMARKS_BASE_HEAT_H
+#define COMPARISON_BENCHMARKS_BASE_HEAT_H
+
+#include <array>
+#include <iostream>
+#include <memory>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace heat {
+
+const int DIFFUSION_SIZE = 256;
+const int DIFFUSION_STEPS = 256;
+
+const int NUM_ITERATIONS = 100;
+const int WARMUP_ITERATIONS = 20;
+
+template<typename T, int SIZE>
+class heat_diffusion {
+  // Center portion is SIZExSIZE, surrounded by a one-cell border that is never updated by the simulation.
+  using matrix = std::array<std::array<T, SIZE + 2>, SIZE + 2>;
+
+ protected:
+  // Sane default values for the simulation (from the paper).
+  // This is not about perfect simulation results but the speedup of the workload.
+  double c = 0.1;
+  double d_s = 1.0 / (SIZE + 1);
+  double d_t = (d_s * d_s) / (4 * c);
+
+ public:
+  matrix *current_data;  // owning; read by update_element. NOTE: the implicit copy would double-delete.
+  matrix *next_data;  // owning; written by update_element, swapped with current_data after each step
+
+  explicit heat_diffusion() {
+    current_data = new matrix;
+    next_data = new matrix;
+    reset_data();
+  }
+
+  virtual ~heat_diffusion() {  // virtual: run_simulation is overridable, so deletion through a base pointer must be safe
+    delete current_data;
+    delete next_data;
+  }
+
+  virtual void run_simulation(int n) {  // runs n sequential time steps over the inner SIZExSIZE cells
+    for (int i = 0; i < n; i++) {
+      for (int row = 1; row <= SIZE; row++) {
+        for (int column = 1; column <= SIZE; column++) {
+          update_element(row, column);
+        }
+      }
+
+      // Synchronization point needed to coordinate the calculation!
+      swap_data_arrays();
+    }
+  }
+
+ protected:
+  void update_element(int row, int column) {  // five-point stencil: reads current_data, writes next_data
+    (*next_data)[row][column] = (*current_data)[row][column] + ((c * d_t) / (d_s * d_s)) *
+        ((*current_data)[row + 1][column] + (*current_data)[row - 1][column]
+            - 4 * (*current_data)[row][column]
+            + (*current_data)[row][column + 1] + (*current_data)[row][column - 1]);
+  }
+
+  void swap_data_arrays() {  // the freshly written buffer becomes the input of the next step
+    matrix *tmp = current_data;
+    current_data = next_data;
+    next_data = tmp;
+  }
+
+  void reset_data() {  // initialises both buffers identically so the first swap is consistent
+    for (int row = 0; row < SIZE + 2; row++) {
+      for (int column = 0; column < SIZE + 2; column++) {
+        (*current_data)[row][column] = 0.0;
+        (*next_data)[row][column] = 0.0;
+
+        // Top and bottom border rows are a fixed, hot temperature
+        if (row == 0 || row == SIZE + 1) {
+          (*current_data)[row][column] = 1.0;
+          (*next_data)[row][column] = 1.0;
+        }
+      }
+    }
+  }
+};
+
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const heat_diffusion<T, SIZE> &simulation) {
+  for (int i = 0; i < SIZE + 2; i++) {
+    for (int j = 0; j < SIZE + 2; j++) {
+      // 'color' our output according to temperature (current_data is a pointer, dereference before indexing)
+      char out;
+      if ((*simulation.current_data)[i][j] < 0.1) {
+        out = ' ';
+      } else if ((*simulation.current_data)[i][j] < 0.2) {
+        out = '-';
+      } else if ((*simulation.current_data)[i][j] < 0.5) {
+        out = '=';
+      } else {
+        out = '#';
+      }
+
+      strm << out << "\t";
+    }
+    strm << std::endl;
+  }
+
+  return strm;
+}
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_HEAT_H
diff --git a/extern/benchmark_base/include/benchmark_base/matrix.h b/extern/benchmark_base/include/benchmark_base/matrix.h
new file mode 100644
index 0000000..4838d08
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/matrix.h
@@ -0,0 +1,67 @@
+
+#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_H
+#define COMPARISON_BENCHMARKS_BASE_MATRIX_H
+
+#include <algorithm>
+#include <iostream>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace matrix {
+
+const int MATRIX_SIZE = 128;
+
+const int NUM_ITERATIONS = 5000;
+const int WARMUP_ITERATIONS = 1000;
+
+template<typename T, int SIZE>
+class matrix {
+ public:
+  T data[SIZE][SIZE];
+
+  explicit matrix() {  // fills every row i with the value i
+    for (int i = 0; i < SIZE; i++) {  // bound by SIZE (not the global MATRIX_SIZE) to stay inside data
+      for (int j = 0; j < SIZE; j++) {
+        data[i][j] = i;
+      }
+    }
+  }
+
+  virtual void multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {  // sequential: this = a * b
+    for (int i = 0; i < SIZE; i++) {
+      multiply_column(i, a, b);
+    }
+  }
+
+ protected:
+  void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {  // writes data[i][*] of a * b
+    for (int j = 0; j < SIZE; ++j) {
+      data[i][j] = 0;
+    }
+    for (int k = 0; k < SIZE; ++k) {  // k outermost: the inner j loop walks b.data[k] and data[i] contiguously
+      for (int j = 0; j < SIZE; ++j) {
+        T a_data = a.data[i][k];
+        T b_data = b.data[k][j];
+        data[i][j] += a_data * b_data;
+      }
+    }
+  }
+};
+
+template<typename T, int SIZE>
+std::ostream &operator<<(std::ostream &strm, const matrix<T, SIZE> &matrix) {
+  for (const auto &row : matrix.data) {  // one output line per matrix row
+    for (const T &cell : row) {
+      strm << cell << "\t";  // every cell, including the last one, is followed by a tab
+    }
+    strm << std::endl;
+  }
+
+  return strm;
+}
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_MATRIX_H
diff --git a/extern/benchmark_base/include/benchmark_base/range.h b/extern/benchmark_base/include/benchmark_base/range.h
new file mode 100644
index 0000000..06bf65e
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/range.h
@@ -0,0 +1,608 @@
+/*
+                Range
+                =====
+
+    Copyright (c) 2009-2011 Khaled Alshaya
+
+    Distributed under the Boost Software License, version 1.0
+    (See the license at: http://www.boost.org/license_1_0.txt).
+*/
+
+/*
+                Rationale
+                =========
+
+    In Python, there is a beautiful function called "range".
+    "range" allows the programmer to iterate over a range elegantly.
+    This concept is not as general as "for-loops" in C++,
+    but nonetheless, it expresses the intent of the programmer
+    more clearly than the general "for-loops" in many cases.
+
+
+                Design
+                ======
+
+    Range is made to be an STL-like library. In fact, it is
+    built on top of the concepts of STL. The library is designed to
+    work with STL algorithms as well. Range is more flexible
+    than the Python "range", because:
+    
+    Range is an "immutable ordered random access container"
+
+
+                Specifications
+                ==============
+
+    Range satisfies the following requirements:
+
+        * Immutable.
+        * Random Access Container.
+        * Random Access Iterator Interface.
+        * Constant Time Complexity Operations.
+
+
+    Range models an ordered sequence of elements,
+    where a range is defined by:
+
+        [begin, end)
+
+        * begin: the first element in the range. (Inclusive)
+        * end  : the last element in the range.  (Exclusive)
+        * step : the distance between two consecutive elements in a range.
+
+        where each element in the range is defined by:
+
+        element = begin + step * i
+
+        * i: is the index of the element in range.
+
+        The following precondition must be met for the sequence
+        to be a valid range:
+
+            step != 0
+            &&
+            (    
+                begin <= end && step > 0
+                ||
+                begin >= end && step < 0
+            )
+
+
+                Portability
+                ===========
+
+    Range Generator is written in standard C++ (C++98). It depends
+    -only- on the standard C++ library.
+*/
+
+// TODO: See if we should swap this out for our own implementation, for now this is fine, as it is self contained.
+/**
+ * Notes on Modification:
+ * The code was adapted to fit into our namespacing/naming scheme for simpler use.
+ * This includes ifdef's, namespace and code formatting style.
+ */
+
+#ifndef Range_h__
+#define Range_h__
+
+#include <iterator>
+#include <stdexcept>
+#include <cstddef>
+#include <cmath>
+
+namespace comparison_benchmarks {
+namespace base {
+namespace range {
+
+template<class IntegerType>
+struct basic_range {
+  struct const_iterator_impl {
+    typedef IntegerType value_type;
+    typedef std::size_t size_type;
+    typedef IntegerType difference_type;
+    typedef value_type *pointer;
+    typedef value_type &reference;
+    typedef
+    std::random_access_iterator_tag
+        iterator_category;
+
+    const_iterator_impl() : r(0), index(0) {}
+
+    const_iterator_impl(const const_iterator_impl &rhs)
+        : r(rhs.r), index(rhs.index) {}
+
+    const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
+        : r(p_range), index(p_index) {}
+
+    const_iterator_impl &operator=(const const_iterator_impl &rhs) {
+      r = rhs.r;
+      index = rhs.index;
+      return *this;
+    }
+
+    bool operator==(const const_iterator_impl &rhs) const {
+      return *r == *(rhs.r) && index == rhs.index;
+    }
+
+    bool operator!=(const const_iterator_impl &rhs) const {
+      return !(*this == rhs);
+    }
+
+    bool operator<(const const_iterator_impl &rhs) const {
+      return index < rhs.index;
+    }
+
+    bool operator>(const const_iterator_impl &rhs) const {
+      return index > rhs.index;
+    }
+
+    bool operator<=(const const_iterator_impl &rhs) const {
+      return index <= rhs.index;
+    }
+
+    bool operator>=(const const_iterator_impl &rhs) const {
+      return index >= rhs.index;
+    }
+
+    value_type operator*() const {
+      return r->m_first_element + r->m_step * index;
+    }
+
+    // operator->
+    // is not implemented because the value_type is an integer type
+    // and primitive types in C++ don't define member functions.
+
+    const_iterator_impl &operator++() {
+      ++index;
+      return *this;
+    }
+
+    const_iterator_impl operator++(int) {
+      const_iterator_impl temp = *this;
+      ++index;
+      return temp;
+    }
+
+    const_iterator_impl &operator--() {
+      --index;
+      return *this;
+    }
+
+    const_iterator_impl operator--(int) {
+      const_iterator_impl temp = *this;
+      --index;
+      return temp;
+    }
+
+    const_iterator_impl &operator+=(difference_type increment) {
+      index += increment;
+      return *this;
+    }
+
+    // operator+
+    // is a friend (non-member) operator but operator-
+    // is a member. The intent was to allow both
+    // iterator+5
+    // 5+iterator
+    // but only the overload with the iterator on the left is
+    // declared here, so 5+iterator has no matching operator.
+    friend const_iterator_impl operator+
+        (const const_iterator_impl &lhs, difference_type increment) {
+      const_iterator_impl sum;
+      sum.r = lhs.r;
+      sum.index = lhs.index + increment;
+      return sum;
+    }
+
+    const_iterator_impl &operator-=(difference_type decrement) {
+      index -= decrement;
+      return *this;
+    }
+
+    const_iterator_impl operator-(difference_type decrement) const {
+      const_iterator_impl shifted_iterator;
+      shifted_iterator.r = r;
+      shifted_iterator.index = index - decrement;
+      return shifted_iterator;
+    }
+
+    difference_type operator-(const const_iterator_impl &rhs) const {
+      return index - rhs.index;
+    }
+
+    value_type operator[](difference_type offset) const {
+      size_type new_index = index + offset;
+      return r->m_first_element + r->m_step * new_index;
+    }
+
+   private:
+    basic_range<IntegerType> const *r;
+    size_type index;
+  };
+
+  struct const_reverse_iterator_impl {
+    typedef IntegerType value_type;
+    typedef std::size_t size_type;
+    typedef IntegerType difference_type;
+    typedef value_type *pointer;
+    typedef value_type &reference;
+    typedef
+    std::random_access_iterator_tag
+        iterator_category;
+
+    const_reverse_iterator_impl() : r(0), index(0) {}
+
+    const_reverse_iterator_impl(const const_reverse_iterator_impl &rhs)
+        : r(rhs.r), index(rhs.index) {}
+
+    const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
+        : r(p_range), index(p_index) {}
+
+    const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
+      r = rhs.r;
+      index = rhs.index;
+      return *this;
+    }
+
+    bool operator==(const const_reverse_iterator_impl &rhs) const {
+      return *r == *(rhs.r) && index == rhs.index;
+    }
+
+    bool operator!=(const const_reverse_iterator_impl &rhs) const {
+      return !(*this == rhs);
+    }
+
+    bool operator<(const const_reverse_iterator_impl &rhs) const {
+      return index < rhs.index;
+    }
+
+    bool operator>(const const_reverse_iterator_impl &rhs) const {
+      return index > rhs.index;
+    }
+
+    bool operator<=(const const_reverse_iterator_impl &rhs) const {
+      return index <= rhs.index;
+    }
+
+    bool operator>=(const const_reverse_iterator_impl &rhs) const {
+      return index >= rhs.index;
+    }
+
+    value_type operator*() const {
+      size_type reverse_index
+          = (r->m_element_count - 1) - index;
+      return r->m_first_element + r->m_step * reverse_index;
+    }
+
+    // operator->
+    // is not implemented because the value_type is integer type
+    // and primitive types in C++ don't define member functions.
+
+    const_reverse_iterator_impl &operator++() {
+      ++index;
+      return *this;
+    }
+
+    const_reverse_iterator_impl operator++(int) {
+      const_reverse_iterator_impl temp = *this;
+      ++index;
+      return temp;
+    }
+
+    const_reverse_iterator_impl &operator--() {
+      --index;
+      return *this;
+    }
+
+    const_reverse_iterator_impl operator--(int) {
+      const_reverse_iterator_impl temp = *this;
+      --index;
+      return temp;
+    }
+
+    const_reverse_iterator_impl &operator+=(difference_type increment) {
+      index += increment;
+      return *this;
+    }
+
+    // operator+
+    // is a friend (non-member) operator but operator-
+    // is a member. The intent was to allow both
+    // iterator+5
+    // 5+iterator
+    // but only the overload with the iterator on the left is
+    // declared here, so 5+iterator has no matching operator.
+    friend const_reverse_iterator_impl operator+
+        (const const_reverse_iterator_impl &lhs, difference_type increment) {
+      const_reverse_iterator_impl sum;
+      sum.r = lhs.r;
+      sum.index = lhs.index + increment;
+      return sum;
+    }
+
+    const_reverse_iterator_impl &operator-=(difference_type decrement) {
+      index -= decrement;
+      return *this;
+    }
+
+    const_reverse_iterator_impl operator-(difference_type decrement) const {
+      const_reverse_iterator_impl shifted_iterator;
+      shifted_iterator.r = r;
+      shifted_iterator.index = index - decrement;
+      return shifted_iterator;
+    }
+
+    difference_type operator-(const const_reverse_iterator_impl &rhs) const {
+      return index - rhs.index;
+    }
+
+    value_type operator[](difference_type offset) const {
+      size_type new_reverse_index
+          = (r->m_element_count - 1) - (index + offset);
+      return r->m_first_element + r->m_step * new_reverse_index;
+    }
+
+   private:
+    basic_range<IntegerType> const *r;
+    size_type index;
+  };
+
+  typedef IntegerType value_type;
+  typedef const_iterator_impl iterator;
+  typedef const_iterator_impl const_iterator;
+  typedef const_reverse_iterator_impl reverse_iterator;
+  typedef const_reverse_iterator_impl const_reverse_iterator;
+  typedef value_type &reference;
+  typedef const value_type &const_reference;
+  typedef value_type *pointer;
+  typedef IntegerType difference_type;
+  typedef std::size_t size_type;
+
+  // In the case of default construction,
+  // the range is considered as an empty range with no elements.
+  // step can be anything other than 0. 1 is
+  // an implementation convention, and it doesn't have
+  // a significance in this case because the range is empty.
+  basic_range() : m_first_element(0), m_element_count(0), m_step(1) {}
+
+  // first_element: is begin in specifications.
+  // last_element: is end in specifications.
+  basic_range(value_type first_element, value_type last_element, value_type step)
+      : m_first_element(first_element),
+        m_step(step) {
+    // We need to count the number of elements.
+    // The only case where a range is invalid,
+    // when the step=0. It means that the range
+    // is infinite, because the number of elements
+    // in a range, is the length of that range
+    // divided by the difference between
+    // every two successive elements.
+
+    if (step == 0)
+      throw std::out_of_range("Invalid Range: step can't be equal to zero!");
+    if (first_element < last_element && step < 0)
+      throw std::out_of_range("Invalid Range: step can't be backward, while the range is forward!");
+    if (first_element > last_element && step > 0)
+      throw std::out_of_range("Invalid Range: step can't be forward, while the range is backward!");
+
+    m_element_count = (last_element - first_element) / step;
+    if ((last_element - first_element) % step != 0)
+      ++m_element_count;
+  }
+
+  // The following constructor, determines the step
+  // automatically. If the range is forward, then
+  // step will be one. If the range is backward,
+  // step will be minus one. If the begin is equal
+  // to end, then the step must not equal to zero
+  // and it is set to one as a convention.
+  basic_range(value_type first_element, value_type last_element)
+      : m_first_element(first_element) {
+    if (last_element >= first_element) *this = basic_range<IntegerType>(first_element, last_element, 1);
+    else *this = basic_range<IntegerType>(first_element, last_element, -1);
+
+  }
+
+  // Shortcut constructor for ranges that start at zero.
+  // The step is determined automatically from the last element:
+  // one if it is zero or positive, minus one if it is negative.
+  // Note: constructors are declared with the plain class name
+  // (basic_range, not basic_range<IntegerType>); C++20 no longer
+  // accepts a template-id as the constructor name.
+  basic_range(value_type last_element)
+      : m_first_element(0), m_element_count(0), m_step(1) {
+    if (last_element >= m_first_element) *this = basic_range<IntegerType>(m_first_element, last_element, 1);
+    else *this = basic_range<IntegerType>(m_first_element, last_element, -1);
+  }
+
+  basic_range(const basic_range<IntegerType> &r)  // member-wise copy
+      : m_first_element(r.m_first_element),
+        m_element_count(r.m_element_count),
+        m_step(r.m_step) {}
+
+  basic_range<IntegerType> &operator=(const basic_range<IntegerType> &r) {
+    m_first_element = r.m_first_element;
+    m_element_count = r.m_element_count;
+    m_step = r.m_step;
+
+    return *this;
+  }
+
+  bool operator==(const basic_range<IntegerType> &r) const {
+    return m_first_element == r.m_first_element
+        &&
+            m_element_count == r.m_element_count
+        &&
+            m_step == r.m_step;
+  }
+
+  bool operator!=(const basic_range<IntegerType> &r) const {
+    return !(*this == r);
+  }
+
+  // The following four functions enable the user to compare
+  // ranges using ( <, >, <=, >=).
+  // The comparison between two ranges is a simple lexicographical
+  // comparison (element by element). By convention, when R1 has
+  // fewer elements than R2 and every element of R1 equals the
+  // element of R2 at the same index (R1 is a prefix of R2),
+  // R1 is considered less than R2.
+  bool operator<(const basic_range<IntegerType> &r) const {
+    // ********** This function needs refactoring.
+
+    if (m_element_count == 0 && r.m_element_count == 0)
+      return false;
+    if (m_element_count == 0 && r.m_element_count > 0)
+      return true;
+    if (m_element_count > 0 && r.m_element_count == 0)
+      return false;
+
+    // At this point, both have at least one element.
+    if (m_first_element < r.m_first_element)
+      return true;
+    if (m_first_element > r.m_first_element)
+      return false;
+
+    // At this point, the first element of both are equal.
+    if (m_element_count == 1 && r.m_element_count == 1)
+      return false;
+    if (m_element_count == 1 && r.m_element_count > 1)
+      return true;
+    if (m_element_count > 1 && r.m_element_count == 1)
+      return false;
+
+    // At this point, both have at least two elements with
+    // the same first element. Note that the final answer
+    // in this case depends on the second element only, because
+    // we don't need to compare the elements further.
+    // Note that the second element is at (index == 1), because
+    // the first element is at (index == 0).
+    if (m_first_element + m_step * 1 < r.m_first_element + r.m_step * 1)
+      return true;
+    if (m_first_element + m_step * 1 > r.m_first_element + r.m_step * 1)
+      return false;
+
+    // If the first two elements of both ranges are equal, then
+    // they are co-linear ranges (because the step is constant).
+    // In that case, the comparison depends only on
+    // the size of the ranges by convention.
+    return m_element_count < r.m_element_count;
+  }
+
+  bool operator>(const basic_range<IntegerType> &r) const {
+    // ********** This function needs refactoring.
+
+    if (m_element_count == 0 && r.m_element_count == 0)
+      return false;
+    if (m_element_count == 0 && r.m_element_count > 0)
+      return false;
+    if (m_element_count > 0 && r.m_element_count == 0)
+      return true;
+
+    // At this point, both have at least one element.
+    if (m_first_element < r.m_first_element)
+      return false;
+    if (m_first_element > r.m_first_element)
+      return true;
+
+    // At this point, the first element of both are equal.
+    if (m_element_count == 1 && r.m_element_count == 1)
+      return false;
+    if (m_element_count == 1 && r.m_element_count > 1)
+      return false;
+    if (m_element_count > 1 && r.m_element_count == 1)
+      return true;
+
+    // At this point, both have at least two elements with
+    // the same first element. Note that the final answer
+    // in this case depends on the second element only, because
+    // we don't need to compare the elements further.
+    // Note that the second element is at (index == 1), because
+    // the first element is at (index == 0).
+    if (m_first_element + m_step * 1 < r.m_first_element + r.m_step * 1)
+      return false;
+    if (m_first_element + m_step * 1 > r.m_first_element + r.m_step * 1)
+      return true;
+
+    // If the first two elements of both ranges are equal, then
+    // they are co-linear ranges (because the step is constant).
+    // In that case, the comparison depends only on
+    // the size of the ranges by convention.
+    return m_element_count > r.m_element_count;
+  }
+
+  bool operator<=(const basic_range<IntegerType> &r) const {
+    return !(*this > r);
+  }
+
+  bool operator>=(const basic_range<IntegerType> &r) const {
+    return !(*this < r);
+  }
+
+  const_iterator begin() const {
+    return const_iterator(this, 0);
+  }
+
+  const_iterator end() const {
+    return const_iterator(this, m_element_count);
+  }
+
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(this, 0);
+  }
+
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(this, m_element_count);
+  }
+
+  size_type size() const {
+    return m_element_count;
+  }
+
+  size_type max_size() const {
+    // Because this is an immutable container,
+    // max_size() == size()
+    return m_element_count;
+  }
+
+  bool empty() const {
+    return m_element_count == 0;
+  }
+
+  // exist() and find() are similar except that find() returns an
+  // iterator to the element, or end() if the range does not contain it.
+  iterator find(value_type element) const {
+    value_type element_index = (element - m_first_element) / m_step;
+    bool in_range = element_index >= 0 && element_index < m_element_count &&
+        (element - m_first_element) % m_step == 0;
+    if (in_range)
+      return begin() + element_index;
+    return end();
+  }
+
+  bool exist(value_type element) const {
+    return find(element) != end();
+  }
+
+  // In the standard, the operator[]
+  // should return a const reference.
+  // Because Range Generator doesn't store its elements
+  // internally, we return a copy of the value.
+  // In any case, this doesn't affect the semantics of the operator.
+  value_type operator[](size_type index) const {
+    return m_first_element + m_step * index;
+  }
+
+ private:
+  // m_first_element: begin (see specifications).
+  // m_element_count: (end - begin) / step, rounded up to a whole number of elements.
+  value_type m_first_element, m_element_count, m_step;
+};
+
+// This is the default type of range!
+typedef basic_range<int> range;
+}
+}
+}
+
+#endif // Range_h__
diff --git a/extern/benchmark_base/include/benchmark_base/unbalanced.h b/extern/benchmark_base/include/benchmark_base/unbalanced.h
new file mode 100644
index 0000000..5396ce2
--- /dev/null
+++ b/extern/benchmark_base/include/benchmark_base/unbalanced.h
@@ -0,0 +1,97 @@
+
+#ifndef COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+#define COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
+
+#include <cstdint>
+#include <array>
+#include <vector>
+
+#include "picosha2.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+
+const int SEED = 42;  // presumably the `seed` argument of the root node — confirm in the benchmark mains
+const int ROOT_CHILDREN = 140;  // presumably the root's `root_children` argument — confirm
+const double Q = 0.124875;  // presumably the `q` threshold passed to node — confirm
+const int NORMAL_CHILDREN = 8;  // presumably the `b` child count passed to node — confirm
+
+const int NUM_NODES = 71069;  // NOTE(review): looks like the expected tree size for the parameters above — confirm
+
+const int NUM_ITERATIONS = 50;
+const int WARMUP_ITERATIONS = 5;
+
+using node_state = std::array<uint8_t, 20>;  // 20-byte per-node state
+
+/**
+ * Node of an unbalanced binomial tree (https://www.cs.unc.edu/~olivier/LCPC06.pdf).
+ * To build up the tree, recursively call spawn_child_node on each node until leaves are reached.
+ * The tree is not built up directly in memory, but rather by the recursive calls.
+ */
+class node {
+  // The state is used to allow a deterministic tree construction using sha256 hashes.
+  node_state state_;
+
+  // Number of children for the current node (0 for a leaf); set by init_num_children().
+  int num_children_;
+
+  // Positive for the root node, which then gets exactly this many children; -1 for every child node.
+  int root_children_;
+
+  // General branching parameters: a non-root node gets b_ children if its state random is below q_, else none.
+  double q_;
+  int b_;
+
+  // Private constructor for children; spawn_child_node() passes in the derived child state.
+  node(node_state state, double q, int b) : state_{state},
+                                            num_children_{0},
+                                            root_children_{-1},
+                                            q_{q},
+                                            b_{b} { init_num_children(); }
+
+  std::array<uint8_t, 20> generate_child_state(uint32_t index);  // derives the state of child number index from state_
+  double get_state_random();  // maps the last four state bytes to a double in [0, 1]
+  void init_num_children() {
+    double state_random = get_state_random();
+    if (root_children_ > 0) {
+      num_children_ = root_children_; // Root always spawns children
+    } else if (state_random < q_) {
+      num_children_ = b_;
+    } else {
+      num_children_ = 0;
+    }
+  }
+
+ public:
+  node(uint32_t seed, int root_children, double q, int b)  // root node: its state is derived from seed
+      : state_({{}}), num_children_{0}, root_children_{root_children}, q_{q}, b_{b} {
+
+    for (int i = 0; i < 16; i++) {  // bytes 0..15 are zero, bytes 16..19 carry the seed, most significant byte first
+      state_[i] = 0;
+    }
+    state_[16] = static_cast<uint8_t>(0xFFu & (seed >> 24u));
+    state_[17] = static_cast<uint8_t>(0xFFu & (seed >> 16u));
+    state_[18] = static_cast<uint8_t>(0xFFu & (seed >> 8u));
+    state_[19] = static_cast<uint8_t>(0xFFu & (seed >> 0u));
+
+    picosha2::hash256_one_by_one hasher;  // replace the seed bytes by bytes of their own hash
+    hasher.process(state_.begin(), state_.end());
+    hasher.finish();
+    hasher.get_hash_bytes(state_.begin(), state_.end());
+
+    init_num_children();
+  }
+
+  int get_num_children() const { return num_children_; }
+
+  node spawn_child_node(int index) {  // child number index, built with the same q_ and b_ as this node
+    return {generate_child_state(index), q_, b_};
+  }
+};
+
+}
+}
+}
+
+#endif //COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
diff --git a/extern/benchmark_base/src/.gitkeep b/extern/benchmark_base/src/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/extern/benchmark_base/src/.gitkeep
diff --git a/extern/benchmark_base/src/fft.cpp b/extern/benchmark_base/src/fft.cpp
new file mode 100644
index 0000000..c85c0b3
--- /dev/null
+++ b/extern/benchmark_base/src/fft.cpp
@@ -0,0 +1,63 @@
+#include "benchmark_base/fft.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace fft {
+
+complex_vector generate_input() {  // builds the SIZE-sample input signal for the fft code below
+  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+  fft::complex_vector data(SIZE);
+  // Sample i is the sum of one sine wave per entry of known_frequencies,
+  // so applying fft to this time series should find exactly these frequencies.
+  for (int i = 0; i < SIZE; i++) {
+    data[i] = std::complex<double>(0.0, 0.0);
+    for (auto frequencie : known_frequencies) {
+      data[i] += sin(2 * M_PI * frequencie * i / SIZE);  // real-valued sample; the imaginary part stays 0
+    }
+  }
+
+  return data;
+}
+
+void divide(complex_vector::iterator data, int n) {  // reorders data[0..n): even-indexed elements first, then odd-indexed
+  complex_vector tmp_odd_elements(n / 2);  // scratch copy: the odd elements' slots are overwritten below
+  for (int i = 0; i < n / 2; i++) {
+    tmp_odd_elements[i] = data[i * 2 + 1];
+  }
+  for (int i = 0; i < n / 2; i++) {  // compact the even-indexed elements into the first half
+    data[i] = data[i * 2];
+  }
+  for (int i = 0; i < n / 2; i++) {  // place the saved odd-indexed elements in the second half
+    data[i + n / 2] = tmp_odd_elements[i];
+  }
+}
+
+void combine(complex_vector::iterator data, int n) {  // merges the two half-size results that conquer left in data[0..n)
+  for (int i = 0; i < n / 2; i++) {
+    std::complex<double> even = data[i];
+    std::complex<double> odd = data[i + n / 2];
+
+    // w is the "twiddle-factor" for output index i.
+    // It could be cached, but we run the same 'base' algorithm parallel/serial,
+    // so it won't impact the performance comparison.
+    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
+
+    data[i] = even + w * odd;
+    data[i + n / 2] = even - w * odd;
+  }
+}
+
+void conquer(complex_vector::iterator data, int n) {  // recursive fft of data[0..n), result written back into data
+  if (n < 2) {  // zero or one element: nothing to transform
+    return;
+  }
+
+  divide(data, n);  // even-indexed samples to the front half, odd-indexed to the back half
+  conquer(data, n / 2);
+  conquer(data + n / 2, n / 2);  // n / 2 truncates, so n is presumably always a power of two — confirm
+  combine(data, n);
+}
+
+}
+}
+}
diff --git a/extern/benchmark_base/src/sample_images.cpp.in b/extern/benchmark_base/src/sample_images.cpp.in
new file mode 100644
index 0000000..abef3b0
--- /dev/null
+++ b/extern/benchmark_base/src/sample_images.cpp.in
@@ -0,0 +1,23 @@
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+
+using namespace std;
+
+namespace comparison_benchmarks {
+    namespace base {
+        vector<string> get_sample_image_paths() {  // returns the paths <dir>/sample_images/0.jpg .. 18.jpg
+            const int num_images = 19;
+
+            vector<string> result(num_images);
+            for (int i = 0; i < num_images; i++) {
+                ostringstream string_stream;
+                string_stream << "@CMAKE_CURRENT_SOURCE_DIR@/sample_images/" << i << ".jpg";  // the directory placeholder is presumably filled in by CMake when this .in template is configured
+                result[i] = string_stream.str();
+            }
+
+            return result;
+        }
+    }
+}
diff --git a/extern/benchmark_base/src/unbalanced.cpp b/extern/benchmark_base/src/unbalanced.cpp
new file mode 100644
index 0000000..e23153e
--- /dev/null
+++ b/extern/benchmark_base/src/unbalanced.cpp
@@ -0,0 +1,34 @@
+#include "benchmark_base/unbalanced.h"
+
+namespace comparison_benchmarks {
+namespace base {
+namespace unbalanced {
+
+node_state node::generate_child_state(uint32_t index) {
+  // Child state = hash of (parent state, index); index bytes go in LSB first, independent of host byte order.
+  const uint8_t index_bytes[4] = {static_cast<uint8_t>(index), static_cast<uint8_t>(index >> 8u),
+                                  static_cast<uint8_t>(index >> 16u), static_cast<uint8_t>(index >> 24u)};
+  picosha2::hash256_one_by_one hasher;
+  hasher.process(state_.begin(), state_.end());
+  hasher.process(index_bytes, index_bytes + 4);
+  hasher.finish();
+  node_state result;
+  hasher.get_hash_bytes(result.begin(), result.end());
+  return result;
+}
+
+double node::get_state_random() {
+  // Assemble the last four state bytes into one 32-bit word, state_[16] being the most significant byte.
+  const uint32_t raw = (static_cast<uint32_t>(state_[16]) << 24u) |
+      (static_cast<uint32_t>(state_[17]) << 16u) |
+      (static_cast<uint32_t>(state_[18]) << 8u) |
+      (static_cast<uint32_t>(state_[19]) << 0u);
+  // Clear the top bit so the value fits a non-negative int32_t.
+  const auto non_negative = static_cast<int32_t>(raw & 0x7fffffff);
+  // Scale into [0, 1]; INT32_MAX itself maps to exactly 1.0.
+  return static_cast<double>(non_negative) / static_cast<double>(INT32_MAX);
+}
+
+}
+}
+}
diff --git a/extern/benchmark_runner/CMakeLists.txt b/extern/benchmark_runner/CMakeLists.txt
new file mode 100644
index 0000000..27dfa8b
--- /dev/null
+++ b/extern/benchmark_runner/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(benchmark_runner INTERFACE)
+target_include_directories(benchmark_runner INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/extern/benchmark_runner/benchmark_runner.h b/extern/benchmark_runner/benchmark_runner.h
new file mode 100644
index 0000000..b10b84e
--- /dev/null
+++ b/extern/benchmark_runner/benchmark_runner.h
@@ -0,0 +1,148 @@
+
+#ifndef BENCHMARK_RUNNER_H
+#define BENCHMARK_RUNNER_H
+
+#include <string>
+#include <cstdlib>
+#include <vector>
+#include <chrono>
+#include <numeric>
+#include <iostream>
+#include <fstream>
+#include <functional>
+#include <utility>
+#include <bits/stdc++.h>
+
+// NOTE(review): a using-directive at header scope leaks into every includer. It is kept
+// because code that includes this header may depend on it.
+using namespace std;
+
+/**
+ * Records the elapsed time of benchmark iterations (steady_clock, in microseconds) and
+ * appends them to the CSV file csv_name inside the directory csv_path.
+ */
+class benchmark_runner {
+ private:
+  string csv_path_;
+  string csv_name_;
+
+  chrono::steady_clock::time_point last_start_time_;
+  vector<long> times_;
+
+  // Prints the mean of the recorded runtimes to stdout.
+  void print_statistics() {
+    // No samples means no mean; dividing by times_.size() == 0 would be undefined behaviour.
+    if (times_.empty()) {
+      cout << "Average Runtime (us): n/a (no iterations recorded)" << endl;
+      return;
+    }
+
+    long time_sum = std::accumulate(times_.begin(), times_.end(), 0l);
+    cout << "Average Runtime (us): " << (time_sum / times_.size()) << endl;
+  }
+
+  // True if the named file could be opened for reading.
+  inline bool file_exists(const std::string &name) {
+    ifstream f(name);
+    return f.good();
+  }
+
+  // Joins directory and file name, inserting '/' only if csv_path_ does not already end with one.
+  string csv_file_path() const {
+    string result = csv_path_;
+    if (!result.empty() && result.back() != '/') {
+      result += '/';
+    }
+    return result + csv_name_;
+  }
+
+ public:
+  /**
+   * Remembers where results go and creates the directory csv_path with `mkdir -p`.
+   * Prints an error and exits the process with status 1 if that command fails.
+   *
+   * NOTE(review): csv_path is concatenated into a shell command unquoted, so paths with
+   * spaces or shell metacharacters are not handled.
+   */
+  benchmark_runner(string csv_path, string csv_name) : csv_path_{std::move(csv_path)},
+                                                       csv_name_{std::move(csv_name)},
+                                                       times_{} {
+    string command = "mkdir -p " + csv_path_;
+    int res = system(command.c_str());
+    if (res) {
+      cout << "Error while creating directory!" << endl;
+      exit(1);
+    }
+  }
+
+  /**
+   * Reads argv[1] into path and argv[2] (parsed with atoi) into num_threads.
+   * Prints the usage message and exits with status 1 if fewer than two arguments were given.
+   */
+  static void read_args(int argc, char **argv, int &num_threads, string &path) {
+    if (argc < 3) {
+      cout << "Must Specifiy concurrency and output directory! (usage: `benchmark <output_directory> <num_threads>`)"
+           << endl;
+      exit(1);
+    }
+
+    string tmp = argv[1];
+    path = tmp;
+    num_threads = atoi(argv[2]);
+  }
+
+  // Marks the start of a measured iteration.
+  void start_iteration() {
+    last_start_time_ = chrono::steady_clock::now();
+  }
+
+  // Records the time elapsed since the last start_iteration() in microseconds.
+  void end_iteration() {
+    auto end_time = chrono::steady_clock::now();
+    long time = chrono::duration_cast<chrono::microseconds>(end_time - last_start_time_).count();
+    times_.emplace_back(time);
+  }
+
+  /**
+   * Calls f warmup_count times without timing it, then count times with every call timed.
+   */
+  void run_iterations(int count, function<void(void)> f, int warmup_count) {
+    for (int i = 0; i < warmup_count; i++) {
+      f();
+    }
+
+    for (int i = 0; i < count; i++) {
+      start_iteration();
+      f();
+      end_iteration();
+    }
+  }
+
+  /**
+   * Appends all recorded runtimes to the CSV file (one value per line, preceded by the
+   * header "runtime_us" if the file did not exist yet) and clears the recorded runtimes.
+   * If print_stats is set, the average runtime is printed first.
+   */
+  void commit_results(bool print_stats) {
+    if (print_stats) {
+      print_statistics();
+    }
+
+    string full_filename = csv_file_path();
+    bool write_header = !file_exists(full_filename);
+
+    { // Scope for output file
+      ofstream o(full_filename, std::fstream::out | std::fstream::app);
+      if (write_header) {
+        o << "runtime_us" << endl;
+      }
+      for (auto time : times_) {
+        o << time << endl;
+      }
+    } // End Scope for output file
+
+    times_.clear();
+  }
+};
+
+#endif //BENCHMARK_RUNNER_H
diff --git a/extern/picosha2/CMakeLists.txt b/extern/picosha2/CMakeLists.txt
new file mode 100644
index 0000000..0f2f59d
--- /dev/null
+++ b/extern/picosha2/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_library(picosha2 INTERFACE)
+target_include_directories(picosha2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
\ No newline at end of file
diff --git a/extern/picosha2/LICENSE b/extern/picosha2/LICENSE
new file mode 100644
index 0000000..4e22100
--- /dev/null
+++ b/extern/picosha2/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 okdshin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/extern/picosha2/picosha2.h b/extern/picosha2/picosha2.h
new file mode 100644
index 0000000..bc00c74
--- /dev/null
+++ b/extern/picosha2/picosha2.h
@@ -0,0 +1,377 @@
+/*
+The MIT License (MIT)
+
+Copyright (C) 2017 okdshin
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef PICOSHA2_H
+#define PICOSHA2_H
+// picosha2:20140213
+
+#ifndef PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR
+#define PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR \
+    1048576  //=1024*1024: default is 1MB memory
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <sstream>
+#include <vector>
+#include <fstream>
+namespace picosha2 {
+typedef unsigned long word_t;  // holds one 32-bit hash word; the type may be wider, hence mask_32bit()
+typedef unsigned char byte_t;
+
+static const size_t k_digest_size = 32;  // digest length in bytes (8 state words x 4 bytes each)
+
+namespace detail {
+inline byte_t mask_8bit(byte_t x) { return x & 0xff; }  // keep the low 8 bits of x
+
+inline word_t mask_32bit(word_t x) { return x & 0xffffffff; }  // keep the low 32 bits of x (word_t may be wider)
+
+const word_t add_constant[64] = {  // SHA-256 round constants; entry i is added in round i of hash256_block()
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
+    0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
+    0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
+    0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
+    0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
+    0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
+
+const word_t initial_message_digest[8] = {0x6a09e667, 0xbb67ae85, 0x3c6ef372,
+                                          0xa54ff53a, 0x510e527f, 0x9b05688c,
+                                          0x1f83d9ab, 0x5be0cd19};  // starting hash state; hash256_one_by_one::init() copies it into h_
+
+inline word_t ch(word_t x, word_t y, word_t z) { return z ^ (x & (y ^ z)); }  // per bit: x ? y : z
+
+inline word_t maj(word_t x, word_t y, word_t z) {
+    return (x & (y | z)) | (y & z);  // per bit: the value held by at least two of x, y, z
+}
+
+inline word_t rotr(word_t x, std::size_t n) {  // 32-bit rotate right by n; x is expected to fit in 32 bits
+    assert(n < 32);
+    return mask_32bit((x >> n) | (x << (32 - n)));  // the mask drops bits shifted above bit 31 when word_t is wider
+}
+
+inline word_t bsig0(word_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }  // SHA-256 "big sigma 0"; applied to a in every round
+
+inline word_t bsig1(word_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }  // SHA-256 "big sigma 1"; applied to e in every round
+
+inline word_t shr(word_t x, std::size_t n) {  // plain right shift of an unsigned word: zeros enter from the top
+    assert(n < 32);
+    return x >> n;
+}
+
+inline word_t ssig0(word_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }  // SHA-256 "small sigma 0"; used when expanding the message schedule
+
+inline word_t ssig1(word_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }  // SHA-256 "small sigma 1"; used when expanding the message schedule
+
+template <typename RaIter1, typename RaIter2>
+void hash256_block(RaIter1 message_digest, RaIter2 first, RaIter2 last) {  // fold the 64-byte block [first, last) into the 8-word state at message_digest
+    assert(first + 64 == last);  // exactly one block per call
+    static_cast<void>(last);  // last is only needed by the assert above; this avoids an unused-variable warning
+    word_t w[64];  // message schedule
+    std::fill(w, w + 64, 0);
+    for (std::size_t i = 0; i < 16; ++i) {  // w[0..15]: the block read as 16 big-endian 32-bit words
+        w[i] = (static_cast<word_t>(mask_8bit(*(first + i * 4))) << 24) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 1))) << 16) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 2))) << 8) |
+               (static_cast<word_t>(mask_8bit(*(first + i * 4 + 3))));
+    }
+    for (std::size_t i = 16; i < 64; ++i) {  // w[16..63]: derived from earlier schedule words
+        w[i] = mask_32bit(ssig1(w[i - 2]) + w[i - 7] + ssig0(w[i - 15]) +
+                          w[i - 16]);
+    }
+
+    word_t a = *message_digest;  // working variables a..h start from the current state
+    word_t b = *(message_digest + 1);
+    word_t c = *(message_digest + 2);
+    word_t d = *(message_digest + 3);
+    word_t e = *(message_digest + 4);
+    word_t f = *(message_digest + 5);
+    word_t g = *(message_digest + 6);
+    word_t h = *(message_digest + 7);
+
+    for (std::size_t i = 0; i < 64; ++i) {  // 64 compression rounds
+        word_t temp1 = h + bsig1(e) + ch(e, f, g) + add_constant[i] + w[i];
+        word_t temp2 = bsig0(a) + maj(a, b, c);
+        h = g;
+        g = f;
+        f = e;
+        e = mask_32bit(d + temp1);  // only a and e receive new values; both are reduced to 32 bits
+        d = c;
+        c = b;
+        b = a;
+        a = mask_32bit(temp1 + temp2);
+    }
+    *message_digest += a;  // add the round results back onto the incoming state...
+    *(message_digest + 1) += b;
+    *(message_digest + 2) += c;
+    *(message_digest + 3) += d;
+    *(message_digest + 4) += e;
+    *(message_digest + 5) += f;
+    *(message_digest + 6) += g;
+    *(message_digest + 7) += h;
+    for (std::size_t i = 0; i < 8; ++i) {  // ...and reduce every state word to 32 bits
+        *(message_digest + i) = mask_32bit(*(message_digest + i));
+    }
+}
+
+}  // namespace detail
+
+template <typename InIter>
+void output_hex(InIter first, InIter last, std::ostream& os) {  // write [first, last) to os as two hex digits per byte
+    const std::ios::fmtflags saved_flags = os.setf(std::ios::hex, std::ios::basefield);  // setf returns the prior flags
+    const char saved_fill = os.fill('0');  // fill(c) returns the prior fill character
+    for (; first != last; ++first) {
+        os.width(2);  // width is reset by every insertion, so set it per byte
+        os << static_cast<unsigned int>(static_cast<byte_t>(*first));  // via byte_t: a negative char must not sign-extend
+    }
+    os.flags(saved_flags);  // hand the stream back as the caller configured it (base and fill)
+    os.fill(saved_fill);
+}
+
+template <typename InIter>
+void bytes_to_hex_string(InIter first, InIter last, std::string& hex_str) {  // hex-encode [first, last) into hex_str
+    std::ostringstream hex_stream;  // scratch stream that receives the formatted digits
+    output_hex(first, last, hex_stream);
+    hex_str = hex_stream.str();  // replaces whatever hex_str held before
+}
+
+template <typename InContainer>
+void bytes_to_hex_string(const InContainer& bytes, std::string& hex_str) {  // whole-container form: encodes bytes.begin()..bytes.end()
+    bytes_to_hex_string(bytes.begin(), bytes.end(), hex_str);
+}
+
+template <typename InIter>
+std::string bytes_to_hex_string(InIter first, InIter last) {  // same encoding, returned by value instead of through an out-parameter
+    std::string hex_str;
+    bytes_to_hex_string(first, last, hex_str);
+    return hex_str;
+}
+
+template <typename InContainer>
+std::string bytes_to_hex_string(const InContainer& bytes) {  // whole-container form that returns the hex text by value
+    std::string hex_str;
+    bytes_to_hex_string(bytes, hex_str);
+    return hex_str;
+}
+
+class hash256_one_by_one {  // incremental SHA-256: process() any number of times, then finish(), then get_hash_bytes()
+   public:
+    hash256_one_by_one() { init(); }
+
+    void init() {  // reset to the empty-message state (also lets one object be reused)
+        buffer_.clear();
+        std::fill(data_length_digits_, data_length_digits_ + 4, 0);
+        std::copy(detail::initial_message_digest,
+                  detail::initial_message_digest + 8, h_);
+    }
+
+    template <typename RaIter>
+    void process(RaIter first, RaIter last) {  // absorb the bytes in [first, last)
+        add_to_data_length(static_cast<word_t>(std::distance(first, last)));
+        std::copy(first, last, std::back_inserter(buffer_));
+        std::size_t i = 0;
+        for (; i + 64 <= buffer_.size(); i += 64) {  // hash every complete 64-byte block
+            detail::hash256_block(h_, buffer_.begin() + i,
+                                  buffer_.begin() + i + 64);
+        }
+        buffer_.erase(buffer_.begin(), buffer_.begin() + i);  // keep only the leftover (fewer than 64) bytes
+    }
+
+    void finish() {  // pad the leftover bytes, append the bit length and hash the final block(s)
+        byte_t temp[64];
+        std::fill(temp, temp + 64, 0);
+        std::size_t remains = buffer_.size();
+        std::copy(buffer_.begin(), buffer_.end(), temp);
+        temp[remains] = 0x80;  // padding marker: a single 1 bit directly after the data
+
+        if (remains > 55) {  // marker plus the 8 length bytes do not fit: hash this block, put the length in a second one
+            std::fill(temp + remains + 1, temp + 64, 0);
+            detail::hash256_block(h_, temp, temp + 64);
+            std::fill(temp, temp + 64 - 4, 0);
+        } else {
+            std::fill(temp + remains + 1, temp + 64 - 4, 0);
+        }
+
+        write_data_bit_length(&(temp[56]));  // bytes 56..63 carry the message length in bits
+        detail::hash256_block(h_, temp, temp + 64);
+    }
+
+    template <typename OutIter>
+    void get_hash_bytes(OutIter first, OutIter last) const {  // write h_ as big-endian bytes; stops early if [first, last) is shorter than 32
+        for (const word_t* iter = h_; iter != h_ + 8; ++iter) {
+            for (std::size_t i = 0; i < 4 && first != last; ++i) {
+                *(first++) = detail::mask_8bit(
+                    static_cast<byte_t>((*iter >> (24 - 8 * i))));
+            }
+        }
+    }
+
+   private:
+    void add_to_data_length(word_t n) {  // add n bytes to the running length counter
+        word_t carry = 0;
+        data_length_digits_[0] += n;
+        for (std::size_t i = 0; i < 4; ++i) {  // push overflow from each 16-bit digit into the next higher one
+            data_length_digits_[i] += carry;
+            if (data_length_digits_[i] >= 65536u) {
+                carry = data_length_digits_[i] >> 16;
+                data_length_digits_[i] &= 65535u;
+            } else {
+                break;  // this digit did not overflow, so nothing is left to carry
+            }
+        }
+    }
+    void write_data_bit_length(byte_t* begin) {  // store the length in bits as 8 bytes starting at begin
+        word_t data_bit_length_digits[4];
+        std::copy(data_length_digits_, data_length_digits_ + 4,
+                  data_bit_length_digits);
+
+        // convert the byte length to a bit length (multiply by 8, i.e. shift left by 3)
+        word_t carry = 0;
+        for (std::size_t i = 0; i < 4; ++i) {
+            word_t before_val = data_bit_length_digits[i];
+            data_bit_length_digits[i] <<= 3;
+            data_bit_length_digits[i] |= carry;
+            data_bit_length_digits[i] &= 65535u;
+            carry = (before_val >> (16 - 3)) & 65535u;  // the digit's top 3 bits move into the next digit
+        }
+
+        // write data_bit_length, most significant digit first, high byte before low byte
+        for (int i = 3; i >= 0; --i) {
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i] >> 8);
+            (*begin++) = static_cast<byte_t>(data_bit_length_digits[i]);
+        }
+    }
+    std::vector<byte_t> buffer_;  // bytes received but not yet hashed
+    word_t data_length_digits_[4];  // total byte count as a 64-bit integer in four 16-bit digits, least significant first
+    word_t h_[8];  // current hash state
+};
+
+inline void get_hash_hex_string(const hash256_one_by_one& hasher,
+                                std::string& hex_str) {  // hex-encode hasher's state bytes (the digest once finish() has run)
+    byte_t hash[k_digest_size];
+    hasher.get_hash_bytes(hash, hash + k_digest_size);
+    return bytes_to_hex_string(hash, hash + k_digest_size, hex_str);  // the callee returns void; nothing is returned here
+}
+
+inline std::string get_hash_hex_string(const hash256_one_by_one& hasher) {  // same as the out-parameter overload, returned by value
+    std::string hex_str;
+    get_hash_hex_string(hasher, hex_str);
+    return hex_str;
+}
+
+namespace impl {
+template <typename RaIter, typename OutIter>
+void hash256_impl(RaIter first, RaIter last, OutIter first2, OutIter last2, int,  // unnamed int: the buffer size, unused on this path
+                  std::random_access_iterator_tag) {
+    hash256_one_by_one hasher;
+    // hasher.init();  -- not needed: the constructor already calls init()
+    hasher.process(first, last);  // the whole range is handed over in a single call
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+
+template <typename InputIter, typename OutIter>
+void hash256_impl(InputIter first, InputIter last, OutIter first2,
+                  OutIter last2, int buffer_size, std::input_iterator_tag) {  // hash [first, last) in buffer_size-byte chunks
+    if (buffer_size < 1) buffer_size = 1;  // 0 would never consume input (endless loop); a negative value is no valid size
+    std::vector<byte_t> buffer(buffer_size);  // staging area handed to process() chunk by chunk
+    hash256_one_by_one hasher;
+    while (first != last) {
+        int size = buffer_size;
+        for (int i = 0; i != buffer_size; ++i, ++first) {  // fill the buffer, or stop early at end of input
+            if (first == last) {
+                size = i;  // final chunk is only partially filled
+                break;
+            }
+            buffer[i] = *first;
+        }
+        hasher.process(buffer.begin(), buffer.begin() + size);
+    }
+    hasher.finish();
+    hasher.get_hash_bytes(first2, last2);
+}
+}
+
+template <typename InIter, typename OutIter>
+void hash256(InIter first, InIter last, OutIter first2, OutIter last2,
+             int buffer_size = PICOSHA2_BUFFER_SIZE_FOR_INPUT_ITERATOR) {  // hash [first, last) and write the digest bytes to [first2, last2)
+    picosha2::impl::hash256_impl(
+        first, last, first2, last2, buffer_size,
+        typename std::iterator_traits<InIter>::iterator_category());  // the tag selects the one-shot or the buffered implementation
+}
+
+template <typename InIter, typename OutContainer>
+void hash256(InIter first, InIter last, OutContainer& dst) {  // digest goes to dst.begin()..dst.end(); dst is not resized here
+    hash256(first, last, dst.begin(), dst.end());
+}
+
+template <typename InContainer, typename OutIter>
+void hash256(const InContainer& src, OutIter first, OutIter last) {  // hash all of src; digest goes to [first, last)
+    hash256(src.begin(), src.end(), first, last);
+}
+
+template <typename InContainer, typename OutContainer>
+void hash256(const InContainer& src, OutContainer& dst) {  // hash all of src; digest goes to dst.begin()..dst.end()
+    hash256(src.begin(), src.end(), dst.begin(), dst.end());
+}
+
+template <typename InIter>
+void hash256_hex_string(InIter first, InIter last, std::string& hex_str) {  // SHA-256 of [first, last), stored in hex_str as hex text
+    byte_t digest[k_digest_size];
+    byte_t* const digest_end = digest + k_digest_size;
+    hash256(first, last, digest, digest_end);
+    // bytes_to_hex_string runs output_hex on a scratch ostringstream and assigns
+    bytes_to_hex_string(digest, digest_end, hex_str);  // the resulting text to hex_str
+}
+
+template <typename InIter>
+std::string hash256_hex_string(InIter first, InIter last) {  // same as the out-parameter overload, returned by value
+    std::string hex_str;
+    hash256_hex_string(first, last, hex_str);
+    return hex_str;
+}
+
+inline void hash256_hex_string(const std::string& src, std::string& hex_str) {  // hash the characters of src; hex text goes to hex_str
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+
+template <typename InContainer>
+void hash256_hex_string(const InContainer& src, std::string& hex_str) {  // hash src.begin()..src.end(); hex text goes to hex_str
+    hash256_hex_string(src.begin(), src.end(), hex_str);
+}
+
+template <typename InContainer>
+std::string hash256_hex_string(const InContainer& src) {  // hash src.begin()..src.end() and return the hex text by value
+    return hash256_hex_string(src.begin(), src.end());
+}
+template<typename OutIter>void hash256(std::ifstream& f, OutIter first, OutIter last){  // hash everything still unread in f; digest goes to [first, last)
+    hash256(std::istreambuf_iterator<char>(f), std::istreambuf_iterator<char>(), first,last);  // NOTE(review): f presumably should be opened with std::ios::binary - confirm at call sites
+
+}
+}// namespace picosha2
+#endif  // PICOSHA2_H
diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h
index 2473f24..9e173c9 100644
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -29,12 +29,12 @@ pls::internal::scheduling::parallel_result<int> for_each(const RandomIt first,
     // Cut in half recursively
     const long middle_index = num_elements / 2;
 
-    return scheduler::par([first, middle_index, last, &function, min_elements] {
+    return scheduler::par([first, middle_index, last, function, min_elements] {
       return internal::for_each(first,
                                 first + middle_index,
                                 function,
                                 min_elements);
-    }, [first, middle_index, last, &function, min_elements] {
+    }, [first, middle_index, last, function, min_elements] {
       return internal::for_each(first + middle_index,
                                 last,
                                 function,
diff --git a/lib/pls/include/pls/internal/helpers/range.h b/lib/pls/include/pls/internal/helpers/range.h
index d83cee8..9491798 100644
--- a/lib/pls/include/pls/internal/helpers/range.h
+++ b/lib/pls/include/pls/internal/helpers/range.h
@@ -112,7 +112,7 @@ struct basic_range {
         : r(rhs.r), index(rhs.index) {}
 
     const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}
 
     const_iterator_impl &operator=(const const_iterator_impl &rhs) {
       r = rhs.r;
@@ -121,7 +121,7 @@ struct basic_range {
     }
 
     bool operator==(const const_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;
     }
 
     bool operator!=(const const_iterator_impl &rhs) const {
@@ -145,7 +145,7 @@ struct basic_range {
     }
 
     value_type operator*() const {
-      return r->m_first_element + r->m_step * index;
+      return r.m_first_element + r.m_step * index;
     }
 
     // operator->
@@ -212,11 +212,11 @@ struct basic_range {
 
     value_type operator[](difference_type offset) const {
       size_type new_index = index + offset;
-      return r->m_first_element + r->m_step * new_index;
+      return r.m_first_element + r.m_step * new_index;
     }
 
    private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
     size_type index;
   };
 
@@ -236,7 +236,7 @@ struct basic_range {
         : r(rhs.r), index(rhs.index) {}
 
     const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-        : r(p_range), index(p_index) {}
+        : r(*p_range), index(p_index) {}
 
     const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
       r = rhs.r;
@@ -245,7 +245,7 @@ struct basic_range {
     }
 
     bool operator==(const const_reverse_iterator_impl &rhs) const {
-      return *r == *(rhs.r) && index == rhs.index;
+      return r == rhs.r && index == rhs.index;
     }
 
     bool operator!=(const const_reverse_iterator_impl &rhs) const {
@@ -270,8 +270,8 @@ struct basic_range {
 
     value_type operator*() const {
       size_type reverse_index
-          = (r->m_element_count - 1) - index;
-      return r->m_first_element + r->m_step * reverse_index;
+          = (r.m_element_count - 1) - index;
+      return r.m_first_element + r.m_step * reverse_index;
     }
 
     // operator->
@@ -338,12 +338,12 @@ struct basic_range {
 
     value_type operator[](difference_type offset) const {
       size_type new_reverse_index
-          = (r->m_element_count - 1) - (index + offset);
-      return r->m_first_element + r->m_step * new_reverse_index;
+          = (r.m_element_count - 1) - (index + offset);
+      return r.m_first_element + r.m_step * new_reverse_index;
     }
 
    private:
-    basic_range<IntegerType> const *r;
+    basic_range<IntegerType> r;
     size_type index;
   };
 
--
libgit2 0.26.0