diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1afba5a..be1f6ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ add_subdirectory(lib/pls)
 add_subdirectory(app/playground)
 add_subdirectory(app/test_for_new)
 add_subdirectory(app/invoke_parallel)
+add_subdirectory(app/benchmark_fft)
 
 # Add optional tests
 option(PACKAGE_TESTS "Build the tests" ON)
diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
new file mode 100644
index 0000000..41591e5
--- /dev/null
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(benchmark_fft main.cpp)
+target_link_libraries(benchmark_fft pls)
+if(EASY_PROFILER)
+    target_link_libraries(benchmark_fft easy_profiler)
+endif()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
new file mode 100644
--- /dev/null
+++ b/app/benchmark_fft/main.cpp
@@ -0,0 +1,115 @@
+// Benchmark: recursive radix-2 FFT whose two half-size sub-transforms are
+// forked through pls::invoke_parallel, timed via run_mini_benchmark.
+//
+// NOTE(review): the include targets were missing from the diff as received.
+// The project headers below are reconstructed from the names this file uses
+// (pls::invoke_parallel, PROFILE_*, run_mini_benchmark) - confirm the paths.
+#include <pls/pls.h>
+#include <pls/internal/helpers/profiler.h>
+#include <pls/internal/helpers/mini_benchmark.h>
+
+#include <cmath>
+#include <complex>
+#include <iostream>
+#include <vector>
+
+// Sub-problems of at most this many elements recurse serially.
+static constexpr int CUTOFF = 10;
+// divide()/combine() halve the range at every level and silently skip the
+// last element of an odd-length range, so the result is only a DFT when the
+// input length is a power of two.
+static constexpr int INPUT_SIZE = 2048;
+static_assert(INPUT_SIZE > 0 && (INPUT_SIZE & (INPUT_SIZE - 1)) == 0,
+              "INPUT_SIZE must be a power of two for the radix-2 FFT");
+using complex_vector = std::vector<std::complex<double>>;
+
+// Reorders data[0, n) in place: even-indexed elements move to the first
+// half, odd-indexed elements to the second half.
+void divide(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+  complex_vector odd_elements(half);
+  for (int i = 0; i < half; i++) {
+    odd_elements[i] = data[i * 2 + 1];
+  }
+  // Ascending order is safe: slot i is only written after slot 2 * i was read.
+  for (int i = 0; i < half; i++) {
+    data[i] = data[i * 2];
+  }
+  for (int i = 0; i < half; i++) {
+    data[i + half] = odd_elements[i];
+  }
+}
+
+// Butterfly step: merges the transformed even half data[0, n/2) and odd half
+// data[n/2, n) into the transform of the whole range, in place.
+void combine(complex_vector::iterator data, int n) {
+  const int half = n / 2;
+  for (int i = 0; i < half; i++) {
+    const std::complex<double> even = data[i];
+    const std::complex<double> odd = data[i + half];
+
+    // w is the "twiddle-factor".
+    // This could be cached, but we run the same 'base' algorithm
+    // parallel/serial, so it won't impact the performance comparison.
+    const std::complex<double> w =
+        std::exp(std::complex<double>(0, -2. * M_PI * i / n));
+
+    data[i] = even + w * odd;
+    data[i + half] = even - w * odd;
+  }
+}
+
+// In-place FFT of data[0, n); n is expected to be a power of two.
+// Ranges longer than CUTOFF run their two halves through
+// pls::invoke_parallel, shorter ones recurse serially.
+void fft(complex_vector::iterator data, int n) {
+  if (n < 2) {
+    return;
+  }
+
+  divide(data, n);
+  if (n <= CUTOFF) {
+    fft(data, n / 2);
+    fft(data + n / 2, n / 2);
+  } else {
+    pls::invoke_parallel(
+        [&] { fft(data, n / 2); },
+        [&] { fft(data + n / 2, n / 2); }
+    );
+  }
+  combine(data, n);
+}
+
+// Builds a real-valued time series of input_size samples: the sum of one
+// sine wave per entry of known_frequencies.
+complex_vector prepare_input(int input_size) {
+  // NOTE(review): the element type was missing from the diff as received;
+  // double is assumed.
+  const std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+  complex_vector data(input_size);
+
+  // Set our input data to match a time series of the known_frequencies.
+  // When applying fft to this time-series we should find these frequencies.
+  for (int i = 0; i < input_size; i++) {
+    data[i] = std::complex<double>(0.0, 0.0);
+    for (const double frequency : known_frequencies) {
+      data[i] += std::sin(2 * M_PI * frequency * i / input_size);
+    }
+  }
+
+  return data;
+}
+
+int main() {
+  PROFILE_ENABLE
+  const complex_vector initial_input = prepare_input(INPUT_SIZE);
+
+  // Each run transforms a fresh copy so every iteration sees the same input;
+  // 8 is the highest thread count measured.
+  pls::internal::helpers::run_mini_benchmark([&] {
+    complex_vector input = initial_input;
+    fft(input.begin(), static_cast<int>(input.size()));
+  }, 8);
+
+  PROFILE_SAVE("test_profile.prof")
+}
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index 758a947..6cee207 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -17,7 +17,8 @@ add_library(pls STATIC
         src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h
         include/pls/internal/base/error_handling.h
         include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
-        include/pls/internal/helpers/profiler.h)
+        include/pls/internal/helpers/profiler.h
+        include/pls/internal/helpers/mini_benchmark.h)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
diff --git a/lib/pls/include/pls/internal/helpers/mini_benchmark.h b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
new file mode 100644
--- /dev/null
+++ b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
@@ -0,0 +1,72 @@
+#ifndef PLS_MINI_BENCHMARK_H
+#define PLS_MINI_BENCHMARK_H
+
+#include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/internal/scheduling/scheduler.h"
+
+// NOTE(review): the two include targets below were missing from the diff as
+// received; they are reconstructed from the std names this header uses.
+#include <chrono>
+#include <iostream>
+
+namespace pls {
+namespace internal {
+namespace helpers {
+
+// Benchmarks `lambda` once per thread count from 1 to max_threads and prints
+// the average time per call for each count as one comma-separated line.
+//
+// lambda         - workload, called with no arguments from inside
+//                  scheduler::perform_work.
+// max_threads    - highest thread count to measure.
+// max_runtime_ms - time budget per thread count; the workload is repeated
+//                  until the budget is used up, and runs at least once.
+//
+// TODO: Clean up (separate into small functions and .cpp file)
+template<typename Function>
+void run_mini_benchmark(const Function& lambda, size_t max_threads, long max_runtime_ms=1000) {
+  using namespace pls::internal::scheduling;
+  // steady_clock is monotonic; high_resolution_clock may be an alias of
+  // system_clock, which can jump when the wall clock is adjusted.
+  using bench_clock = std::chrono::steady_clock;
+
+  malloc_scheduler_memory scheduler_memory{max_threads};
+  for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
+    scheduler local_scheduler{&scheduler_memory, num_threads};
+
+    bench_clock::time_point start_time;
+    bench_clock::time_point end_time;
+    unsigned long iterations = 0;
+    local_scheduler.perform_work([&] {
+      start_time = bench_clock::now();
+      const bench_clock::time_point planned_end_time =
+          start_time + std::chrono::milliseconds(max_runtime_ms);
+
+      // do-while: a non-positive budget still yields one measured run, so
+      // the average below never divides by zero.
+      do {
+        lambda();
+        end_time = bench_clock::now();
+        iterations++;
+      } while (end_time < planned_end_time);
+    });
+
+    // NOTE(review): the duration_cast target was missing from the diff as
+    // received; microseconds is assumed - confirm the intended unit.
+    const long time =
+        std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+    const double time_per_iteration = static_cast<double>(time) / iterations;
+
+    std::cout << time_per_iteration;
+    if (num_threads < max_threads) {
+      std::cout << ", ";
+    }
+  }
+  std::cout << std::endl;
+}
+
+}
+}
+}
+
+#endif //PLS_MINI_BENCHMARK_H