Add basic 'mini_benchmark_runner'.

f0f3b80e · FritzFlorian · 44ea144a · f0f3b80e · f0f3b80e · f0f3b80e
Commit f0f3b80e authored Apr 05, 2019 by FritzFlorian
Showing with 148 additions and 1 deletions

CMakeLists.txt
+1 -0

app/benchmark_fft/CMakeLists.txt
+5 -0

app/benchmark_fft/main.cpp
+87 -0

lib/pls/CMakeLists.txt
+2 -1

lib/pls/include/pls/internal/helpers/mini_benchmark.h
+53 -0

No files found.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ add_subdirectory(lib/pls)
 add_subdirectory(app/playground)
 add_subdirectory(app/test_for_new)
 add_subdirectory(app/invoke_parallel)
+add_subdirectory(app/benchmark_fft)
 # Add optional tests
 option(PACKAGE_TESTS "Build the tests" ON)

--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
+# FFT benchmark executable, built from main.cpp and linked against the pls library.
+add_executable(benchmark_fft main.cpp)
+target_link_libraries(benchmark_fft pls)
+# Additionally link easy_profiler, but only when EASY_PROFILER evaluates to true.
+if(EASY_PROFILER)
+    target_link_libraries(benchmark_fft easy_profiler)
+endif()
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
+#include <pls/pls.h>
+#include <pls/internal/helpers/profiler.h>
+#include <pls/internal/helpers/mini_benchmark.h>
+#include <cmath>
+#include <complex>
+#include <iostream>
+#include <vector>
+// Sub-problems of at most CUTOFF elements recurse serially in fft();
+// larger ones are split into two parallel tasks.
+static constexpr int CUTOFF = 10;
+// NOTE(review): not referenced by the code shown in this file.
+static constexpr int NUM_ITERATIONS = 1000;
+// Number of samples fed to the FFT. It has to be a power of two: fft() halves
+// its size with integer division on every level, so an odd intermediate size
+// (2064 -> 1032 -> 516 -> 258 -> 129 -> 64) silently drops an element and the
+// result is no longer a valid transform.
+static constexpr int INPUT_SIZE = 2048;
+static_assert(INPUT_SIZE > 0 && (INPUT_SIZE & (INPUT_SIZE - 1)) == 0,
+              "INPUT_SIZE must be a power of two for the radix-2 FFT");
+typedef std::vector<std::complex<double>> complex_vector;
+// Reorders the n elements starting at `data` in place: the elements found at
+// even positions are packed into the front half, the elements found at odd
+// positions into the back half. Exactly 2 * (n / 2) elements are touched.
+void divide(complex_vector::iterator data, int n) {
+    const int half = n / 2;
+    complex_vector odd_part(half);
+    // One forward pass is enough: slot k is written only after positions 2k and
+    // 2k + 1 have been read, and later iterations only read positions beyond k.
+    for (int k = 0; k < half; ++k) {
+        odd_part[k] = data[2 * k + 1];
+        data[k] = data[2 * k];
+    }
+    // Place the stashed odd-position elements behind the packed even ones.
+    for (int k = 0; k < half; ++k) {
+        data[half + k] = odd_part[k];
+    }
+}
+// Merges the two halves of the n elements at `data` in place: for every index k
+// in the front half, the front element and the twiddle-scaled back element are
+// added (stored at k) and subtracted (stored at k + n / 2).
+void combine(complex_vector::iterator data, int n) {
+    const int half = n / 2;
+    for (int k = 0; k < half; ++k) {
+        // The "twiddle factor" for index k. It is recomputed on every call rather
+        // than cached; the serial and parallel runs share this same base
+        // algorithm, so the comparison between them is not affected.
+        const std::complex<double> twiddle = std::exp(std::complex<double>(0, -2. * M_PI * k / n));
+        const std::complex<double> front = data[k];
+        const std::complex<double> scaled_back = twiddle * data[k + half];
+        data[k] = front + scaled_back;
+        data[k + half] = front - scaled_back;
+    }
+}
+// Recursive in-place transform of the n elements starting at `data`:
+// divide() separates even and odd positions, both halves are transformed
+// recursively, and combine() merges them again.
+// Problems larger than CUTOFF hand their two halves to pls::invoke_parallel;
+// smaller ones recurse directly on the calling thread.
+void fft(complex_vector::iterator data, int n) {
+    // Fewer than two elements: nothing to transform.
+    if (n < 2) {
+        return;
+    }
+    divide(data, n);
+    const int half = n / 2;
+    if (n > CUTOFF) {
+        pls::invoke_parallel(
+            [&] { fft(data, half); },
+            [&] { fft(data + half, half); }
+        );
+    } else {
+        fft(data, half);
+        fft(data + half, half);
+    }
+    combine(data, n);
+}
+// Builds the benchmark input: `input_size` samples of a time series that is the
+// sum of sine waves at a fixed set of known frequencies. Applying the FFT to
+// this series should reveal exactly these frequencies.
+//
+// input_size: number of samples to generate.
+// Returns the samples as complex numbers whose imaginary part is zero.
+complex_vector prepare_input(int input_size) {
+    const std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+    // The vector constructor value-initializes every element to (0, 0), so the
+    // sine terms can be accumulated without clearing the slots first.
+    complex_vector data(input_size);
+    for (int i = 0; i < input_size; i++) {
+        for (const double frequency : known_frequencies) {
+            data[i] += std::sin(2 * M_PI * frequency * i / input_size);
+        }
+    }
+    return data;
+}
+// Entry point: generates the input once and passes an FFT-on-a-copy workload
+// to run_mini_benchmark with a maximum of 8 threads. The PROFILE_* macros come
+// from pls/internal/helpers/profiler.h.
+int main() {
+    PROFILE_ENABLE
+    const complex_vector reference_input = prepare_input(INPUT_SIZE);
+    // fft() works in place, so every benchmark run transforms a fresh copy.
+    const auto transform_copy = [&] {
+        complex_vector working_copy = reference_input;
+        fft(working_copy.begin(), working_copy.size());
+    };
+    pls::internal::helpers::run_mini_benchmark(transform_copy, 8);
+    PROFILE_SAVE("test_profile.prof")
+}
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -17,7 +17,8 @@ add_library(pls STATIC
            src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h
            include/pls/internal/base/error_handling.h
            include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
-        include/pls/internal/helpers/profiler.h)
+            include/pls/internal/helpers/profiler.h
+            include/pls/internal/helpers/mini_benchmark.h)
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls

--- a/lib/pls/include/pls/internal/helpers/mini_benchmark.h
+++ b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
+#ifndef PLS_MINI_BENCHMARK_H
+#define PLS_MINI_BENCHMARK_H
+#include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/internal/scheduling/scheduler.h"
+#include <chrono>
+#include <iostream>
+namespace pls {
+    namespace internal {
+        namespace helpers {
+            // TODO: Clean up (separate into small functions and .cpp file)
+            /**
+             * Runs `lambda` repeatedly on schedulers with 1, 2, ..., max_threads threads
+             * and prints the mean time per call (in microseconds) for every thread count
+             * to std::cout as one comma-separated line.
+             *
+             * @param lambda          Callable taking no arguments; it is invoked from
+             *                        inside scheduler::perform_work.
+             * @param max_threads     Largest thread count to measure; also passed to the
+             *                        scheduler memory that all measured schedulers share.
+             * @param max_runtime_ms  Time budget per thread count in milliseconds. The
+             *                        workload is always called at least once, even when
+             *                        the budget is zero or negative.
+             */
+            template<typename Function>
+            void run_mini_benchmark(const Function& lambda, size_t max_threads, long max_runtime_ms=1000) {
+                using namespace pls::internal::scheduling;
+                // steady_clock is monotonic; high_resolution_clock may alias the
+                // adjustable system clock and is unsuitable for measuring intervals.
+                using bench_clock = std::chrono::steady_clock;
+                malloc_scheduler_memory scheduler_memory{max_threads};
+                for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
+                    scheduler local_scheduler{&scheduler_memory, num_threads};
+                    bench_clock::time_point start_time;
+                    bench_clock::time_point end_time;
+                    unsigned long iterations = 0;
+                    local_scheduler.perform_work([&] {
+                        start_time = bench_clock::now();
+                        const bench_clock::time_point planned_end_time =
+                            start_time + std::chrono::milliseconds(max_runtime_ms);
+                        // do-while guarantees iterations >= 1, so the average below
+                        // never divides by zero.
+                        do {
+                            lambda();
+                            end_time = bench_clock::now();
+                            iterations++;
+                        } while (end_time < planned_end_time);
+                    });
+                    const auto elapsed_us =
+                        std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+                    const double time_per_iteration = static_cast<double>(elapsed_us) / iterations;
+                    std::cout << time_per_iteration;
+                    // Separator between entries only, not after the last one.
+                    if (num_threads < max_threads) {
+                        std::cout << ", ";
+                    }
+                }
+                std::cout << std::endl;
+            }
+        }
+    }
+}
+#endif //PLS_MINI_BENCHMARK_H