From 7a8f320b1763fdc4cd019a4b8c0ecb3963b2bd8a Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Wed, 22 Jul 2020 11:41:07 +0200 Subject: [PATCH] Add profiling information to benchmarks and matrix_then_fft benchmark. --- CMakeLists.txt | 1 + app/benchmark_fft/main.cpp | 10 ++++++++++ app/benchmark_fib/main.cpp | 10 ++++++++++ app/benchmark_matrix_div_conquer/main.cpp | 11 +++++++++++ app/benchmark_matrix_then_fft/CMakeLists.txt | 7 +++++++ app/benchmark_matrix_then_fft/main.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ app/benchmark_unbalanced/main.cpp | 10 ++++++++++ extern/benchmark_runner/benchmark_runner.h | 38 +++++++++++++++++++++++++++----------- 8 files changed, 173 insertions(+), 11 deletions(-) create mode 100644 app/benchmark_matrix_then_fft/CMakeLists.txt create mode 100644 app/benchmark_matrix_then_fft/main.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 12cb45e..20d9142 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ add_subdirectory(app/benchmark_matrix) add_subdirectory(app/benchmark_matrix_div_conquer) add_subdirectory(app/benchmark_fib) add_subdirectory(app/context_switch) +add_subdirectory(app/benchmark_matrix_then_fft) # Add optional tests option(PACKAGE_TESTS "Build the tests" ON) diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp index 03090d4..d168a87 100644 --- a/app/benchmark_fft/main.cpp +++ b/app/benchmark_fft/main.cpp @@ -43,6 +43,11 @@ int main(int argc, char **argv) { pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); @@ -53,6 +58,11 @@ int main(int argc, char **argv) { }); }, [&]() { fft::fill_input(data); // Reset data before each run + }, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif }); runner.commit_results(true); } else { diff --git a/app/benchmark_fib/main.cpp b/app/benchmark_fib/main.cpp index d46ff8f..356515e 100644 --- a/app/benchmark_fib/main.cpp +++ b/app/benchmark_fib/main.cpp @@ -34,6 +34,11 @@ int main(int argc, char **argv) { volatile int res; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); @@ -42,6 +47,11 @@ int main(int argc, char **argv) { scheduler.perform_work([&]() { res = pls_fib(settings.size_); }); + }, [&]() {}, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif }); runner.commit_results(true); } else { diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp index 0175d0f..f40a99f 100644 --- a/app/benchmark_matrix_div_conquer/main.cpp +++ b/app/benchmark_matrix_div_conquer/main.cpp @@ -125,6 +125,7 @@ int main(int argc, char **argv) { // Fill data arrays as needed a.fill_default_data(); b.fill_default_data(); + result.fill_default_data(); // Strain local data std::vector>>> div_conquer_temp_arrays; @@ -158,6 +159,11 @@ int main(int argc, char **argv) { pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); @@ -166,6 +172,11 @@ int main(int argc, char **argv) { scheduler.perform_work([&]() { multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b); }); + }, [&]() {}, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif }); runner.commit_results(true); } else { diff --git a/app/benchmark_matrix_then_fft/CMakeLists.txt b/app/benchmark_matrix_then_fft/CMakeLists.txt new file mode 100644 index 0000000..89a0e66 --- /dev/null +++ b/app/benchmark_matrix_then_fft/CMakeLists.txt @@ -0,0 +1,7 @@ +add_executable(benchmark_matrix_then_fft_pls_v3 main.cpp) +target_link_libraries(benchmark_matrix_then_fft_pls_v3 pls benchmark_runner benchmark_base) +ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_then_fft_pls_v3) + +if (EASY_PROFILER) + target_link_libraries(benchmark_matrix_then_fft_pls_v3 easy_profiler) +endif () diff --git a/app/benchmark_matrix_then_fft/main.cpp b/app/benchmark_matrix_then_fft/main.cpp new file mode 100644 index 0000000..2e12eb1 --- /dev/null +++ b/app/benchmark_matrix_then_fft/main.cpp @@ -0,0 +1,97 @@ +#include "pls/pls.h" + +#include "benchmark_runner.h" +#include "benchmark_base/matrix.h" +#include "benchmark_base/fft.h" + +using namespace comparison_benchmarks::base; + +void pls_conquer(fft::complex_vector::iterator data, fft::complex_vector::iterator swap_array, int n) { + if (n < 2) { + return; + } + + fft::divide(data, swap_array, n); + if (n <= fft::RECURSIVE_CUTOFF) { + fft::conquer(data, swap_array, n / 2); + fft::conquer(data + n / 2, swap_array + n / 2, n / 2); + } else { + pls::spawn([data, n, swap_array]() { + pls_conquer(data, swap_array, n / 2); + }); + pls::spawn_and_sync([data, n, swap_array]() { + pls_conquer(data + n / 2, swap_array + n / 2, n / 2); + }); + } + + fft::combine(data, n); +} + +constexpr int MAX_NUM_TASKS = 16; +constexpr int MAX_STACK_SIZE = 4096 * 1; + +int main(int argc, char **argv) { + auto settings = benchmark_runner::parse_parameters(argc, argv); + size_t matrix_size = settings.size_; + size_t fft_size = 8192; + + string test_name = to_string(settings.num_threads_) + ".csv"; + string full_directory = settings.output_directory_ + "/PLS_v3/"; + benchmark_runner runner{full_directory, test_name}; + + pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; + + // Data Containers + fft::complex_vector fft_data(fft_size); + fft::complex_vector fft_swap_array(fft_size); + fft::fill_input(fft_data); + matrix::matrix matrix_a{settings.size_}; + matrix::matrix matrix_b{settings.size_}; + matrix::matrix matrix_result{settings.size_}; + + if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif + printf("Running isolated measurement...\n"); + runner.enable_memory_stats(); + runner.pre_allocate_stats(); + + runner.run_iterations(settings.iterations_, [&]() { + // Serial Matrix Multiplication + matrix_result.multiply(matrix_a, matrix_b); + // Parallel FFT + scheduler.perform_work([&]() { + pls_conquer(fft_data.begin(), fft_swap_array.begin(), fft_size); + }); + }, [&]() { + fft::fill_input(fft_data); // Reset data before each run + }, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif + }); + runner.commit_results(true); + } else { + printf("Running periodic measurement...\n"); + runner.enable_wall_time_stats(); + runner.pre_allocate_stats(); + + runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() { + // Serial Matrix Multiplication + matrix_result.multiply(matrix_a, matrix_b); + // Parallel FFT + scheduler.perform_work([&]() { + pls_conquer(fft_data.begin(), fft_swap_array.begin(), fft_size); + }); + // Reset data before each run + fft::fill_input(fft_data); + }); + runner.commit_results(true); + } + + return 0; +} diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp index c482090..0c621b2 100644 --- a/app/benchmark_unbalanced/main.cpp +++ b/app/benchmark_unbalanced/main.cpp @@ -44,6 +44,11 @@ int main(int argc, char **argv) { pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { +#if PLS_PROFILING_ENABLED + scheduler.get_profiler().disable_memory_measure(); + runner.add_custom_stats_field("T_1"); + runner.add_custom_stats_field("T_inf"); +#endif printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); @@ -55,6 +60,11 @@ int main(int argc, char **argv) { unbalanced::Q, unbalanced::NORMAL_CHILDREN); }); + }, [&]() {}, [&]() { +#if PLS_PROFILING_ENABLED + runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); + runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); +#endif }); runner.commit_results(true); } else { diff --git a/extern/benchmark_runner/benchmark_runner.h b/extern/benchmark_runner/benchmark_runner.h index 36a8f55..cd45ee1 100644 --- a/extern/benchmark_runner/benchmark_runner.h +++ b/extern/benchmark_runner/benchmark_runner.h @@ -42,7 +42,10 @@ class benchmark_runner { unsigned long wall_time_pre_run_; unsigned long wall_time_post_run_; - map> custom_stats_; + const string WALL_TIME_ITERATION_START = "wall_time_iteration_start_us"; + const string WALL_TIME_ITERATION_END = "wall_time_iteration_end_us"; + + map> custom_stats_; void print_statistics() { long time_sum = std::accumulate(times_.begin(), times_.end(), 0l); @@ -119,9 +122,11 @@ class benchmark_runner { wall_time_enabled_ = true; add_custom_stats_field(WALL_TIME_PRE_RUN); add_custom_stats_field(WALL_TIME_POST_RUN); + add_custom_stats_field(WALL_TIME_ITERATION_START); + add_custom_stats_field(WALL_TIME_ITERATION_END); } - void pre_allocate_stats(size_t num = 100000) { + void pre_allocate_stats(size_t num = 500000) { times_.reserve(num); memset(times_.data(), 'a', num * sizeof(long)); for (auto &iter : custom_stats_) { @@ -187,8 +192,7 @@ class benchmark_runner { } if (wall_time_enabled_) { wall_time_pre_run_ = - std::chrono::duration_cast(std::chrono::time_point_cast( - std::chrono::system_clock::now()).time_since_epoch()).count(); + chrono::duration_cast(chrono::system_clock::now().time_since_epoch()).count(); } last_start_time_ = chrono::steady_clock::now(); @@ -213,8 +217,8 @@ class benchmark_runner { } if (wall_time_enabled_) { wall_time_post_run_ = - std::chrono::duration_cast(std::chrono::time_point_cast( - std::chrono::system_clock::now()).time_since_epoch()).count(); + chrono::duration_cast(chrono::system_clock::now().time_since_epoch()).count(); + custom_stats_[WALL_TIME_PRE_RUN][iteration_index] = wall_time_pre_run_; custom_stats_[WALL_TIME_POST_RUN][iteration_index] = wall_time_post_run_; } @@ -271,6 +275,7 @@ class benchmark_runner { add_to_timespec(iteration_start, period_seconds, period_nanoseconds); size_t current_iteration = 0; + size_t deadline_misses = 0; while (current_iteration < count) { // Sleep until the next iteration long sleep_error = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &iteration_start, nullptr); @@ -295,19 +300,28 @@ class benchmark_runner { deadline_end = iteration_start; add_to_timespec(deadline_end, deadline_seconds, deadline_nanoseconds); + // Keep stats... + if (wall_time_enabled_) { + custom_stats_[WALL_TIME_ITERATION_START][current_iteration] = + iteration_start.tv_sec * 1000 * 1000 + iteration_start.tv_nsec / 1000; + custom_stats_[WALL_TIME_ITERATION_END][current_iteration] = + iteration_end.tv_sec * 1000 * 1000 + iteration_end.tv_nsec / 1000; + } + // Store 'actual' wall time instead of iteration time (we want to include sleeping here!) - long wall_time_us = 0; + unsigned long wall_time_us = 0; wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l; - wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l; - printf("Difference: %ld\n", wall_time_us - times_[current_iteration]); + long nano_second_difference = ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l; + wall_time_us += nano_second_difference; times_[current_iteration] = wall_time_us; if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) { - printf("Deadline Miss!\n"); // TODO: Remove + deadline_misses++; } // Skip iterations if their start time is later than the current time (skipping) - while (finish_time.tv_sec >= iteration_end.tv_sec && finish_time.tv_nsec > iteration_end.tv_nsec) { + while (finish_time.tv_sec > iteration_end.tv_sec + || (finish_time.tv_sec == iteration_end.tv_sec && finish_time.tv_nsec > iteration_end.tv_nsec)) { iteration_start = iteration_end; iteration_end = iteration_start; @@ -322,6 +336,8 @@ class benchmark_runner { current_iteration++; iteration_start = iteration_end; } + + printf("%ld deadline misses!\n", deadline_misses); } void commit_results(bool print_stats) { -- libgit2 0.26.0