From 86333a60d7dfbe1a7dd8582c265651c9a2952888 Mon Sep 17 00:00:00 2001
From: FritzFlorian
Date: Sat, 13 Jun 2020 22:39:34 +0200
Subject: [PATCH] Final preparations for single-app benchmark runs.

---
 app/benchmark_matrix_div_conquer/main.cpp                       | 64 ++++++++++++++++++++++++++++++++++++++++------------------------
 lib/pls/include/pls/internal/scheduling/scheduler.h             |  2 +-
 lib/pls/include/pls/internal/scheduling/strain_local_resource.h |  4 +++-
 lib/pls/include/pls/internal/scheduling/thread_state.h          |  3 +++
 4 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp
index 7c91ab0..050090f 100644
--- a/app/benchmark_matrix_div_conquer/main.cpp
+++ b/app/benchmark_matrix_div_conquer/main.cpp
@@ -8,30 +8,39 @@ using namespace comparison_benchmarks::base;
 
 #include <memory>
 #include <vector>
+#include <algorithm>
 #include <atomic>
 
 void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                           pls::strain_local_resource &local_indices,
                           size_t size,
                           size_t depth,
+                          size_t branch,
                           matrix_div_conquer::blocked_matrix_view &result,
                           matrix_div_conquer::blocked_matrix_view &a,
                           matrix_div_conquer::blocked_matrix_view &b) {
-  if (size <= 8) {
+  if (size <= matrix_div_conquer::CUTOFF_SIZE) {
     multiply_naive(size, result, a, b);
     return;
   }
 
   // Temporary storage required for the intermediate results
   auto strain_local_index = local_indices.get_item(depth);
-  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
-  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
-  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
-  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
-  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
-  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
-  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
-  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
+  size_t index;
+  if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
+    index = branch;
+  } else {
+    index = strain_local_index.get_strain_index();
+  }
+
+  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
+  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
+  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
+  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
+  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
+  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
+  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
+  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];
 
   // Handles to sub-matrices used
   matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
@@ -60,31 +69,31 @@ void multiply_div_conquer(const std::vector
   std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
   size_t max_depth = 0;
+  size_t buffers_needed = 1;
   size_t remaining_size = size;
-  while (remaining_size > 1) {
+  while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
     auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
-    for (int thread_id = 0; thread_id < 8; thread_id++) {
+    buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
+    for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
       auto &depth_thread_buffers = depth_buffers.emplace_back();
       for (int i = 0; i < 8; i++) {
-        depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
+        size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
+        depth_thread_buffers.emplace_back(new double[matrix_elements]);
+        for (size_t j = 0; j < matrix_elements; j += 32) {
+          depth_thread_buffers[i][j] = 1.0; // Touch memory
+        }
       }
     }
     max_depth++;
+    buffers_needed *= 8;
     remaining_size = remaining_size / 2;
   }
   pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};
@@ -152,7 +168,7 @@ int main(int argc, char **argv) {
 
     runner.run_iterations(settings.iterations_, [&]() {
       scheduler.perform_work([&]() {
-        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
       });
     });
     runner.commit_results(true);
@@ -163,7 +179,7 @@ int main(int argc, char **argv) {
     runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_,
                         [&]() {
       scheduler.perform_work([&]() {
-        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
       });
     });
     runner.commit_results(true);
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h
index 7a1b0ba..c68d452 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -45,7 +45,7 @@ class scheduler {
                      size_t computation_depth,
                      size_t stack_size,
                      bool reuse_thread = true,
-                     size_t serial_stack_size = 4096 * 8);
+                     size_t serial_stack_size = 4096 * 1);
 
   template<typename ALLOC>
   explicit scheduler(unsigned int num_threads,
diff --git a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h
index 18c79e5..ce4df36 100644
--- a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h
+++ b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h
@@ -60,7 +60,7 @@ class strain_local_resource {
   };
 
   strain_local_resource(unsigned num_threads,
-                        unsigned depth) : local_items_(num_threads) {
+                        unsigned depth) : num_threads_{num_threads}, local_items_(num_threads) {
     for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
       local_items_[thread_id].reserve(depth);
       for (unsigned i = 0; i < depth; i++) {
@@ -74,11 +74,13 @@ class strain_local_resource {
     }
   }
 
+  [[nodiscard]] unsigned get_num_threads() const { return num_threads_; }
   item_handle get_item(unsigned depth);
   static strain_resource *get_local_copy(strain_resource *other_resources, unsigned thread_id);
   static void acquire_locally(strain_resource *other_resources, unsigned thread_id);
 
 private:
+  const unsigned num_threads_;
   std::vector<std::vector<local_item>> local_items_;
 };
diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h
index b46b5d2..9b7cbb3 100644
--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -69,6 +69,9 @@ struct PLS_CACHE_ALIGN thread_state {
         stack_allocator_{stack_allocator},
         serial_call_stack_size_{serial_call_stack_size} {
     serial_call_stack_ = stack_allocator->allocate_stack(serial_call_stack_size_);
+    for (size_t i = 0; i < serial_call_stack_size; i += base::system_details::CACHE_LINE_SIZE) {
+      serial_call_stack_[i] = 'a'; // Touch the stack
+    }
   };
 
   ~thread_state() {
--
libgit2 0.26.0
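
Note on the buffer scheme in the main.cpp changes: each level of the eight-way divide-and-conquer needs temporary sub-matrices, but at most min(8^depth, num_threads) tasks can be in flight on a level at once (each worker's call chain passes through a depth at most once), so the patch now allocates only that many buffer sets per level instead of a fixed eight. Where all branches of a level fit within the worker count (and at the root), a task selects its buffer set directly by its branch number; otherwise it falls back to the strain-local index, which is always below the number of workers. The following standalone sketch models just the sizing rule; it is not code from the repository, and the thread count and depth limit are made-up example values.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t num_threads = 8; // example: one worker per core
  std::size_t buffers_needed = 1;    // the root level needs a single buffer set

  for (std::size_t depth = 0; depth < 5; depth++) {
    // Never allocate more sets per level than workers that could occupy it.
    buffers_needed = std::min(buffers_needed, num_threads);
    std::printf("depth %zu: %zu buffer sets\n", depth, buffers_needed);
    buffers_needed *= 8; // each multiplication spawns eight sub-multiplications
  }
}

One observation on the guard in the patch: (8u << (depth - 1u)) equals the exact branch count 8^depth only at depth 1 (depth 0 is handled separately), so direct branch indexing appears to rely on the benchmark running with at most eight workers, where the two conditions coincide.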
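
Note on the two "touch" loops the patch adds (one write per 32 doubles in each temporary matrix, one write per cache line of the serial call stack): writing to freshly allocated memory during setup forces the OS to back it with physical pages before the measured iterations start, so page faults do not distort the benchmark. Below is a minimal sketch of the same warm-up idea; touch_pages and the 4 KiB page size are illustrative assumptions, not part of the library.

#include <cstddef>
#include <memory>

// Write one value per page so physical memory is mapped now,
// not during the timed region.
void touch_pages(double *data, std::size_t elements) {
  const std::size_t stride = 4096 / sizeof(double); // assumes 4 KiB pages
  for (std::size_t i = 0; i < elements; i += stride) {
    data[i] = 0.0;
  }
}

int main() {
  const std::size_t elements = 1u << 20u;
  std::unique_ptr<double[]> buffer{new double[elements]};
  touch_pages(buffer.get(), elements); // pre-fault before benchmarking
}

The patch's stride of 32 doubles (256 bytes) is finer than a page, which additionally warms part of each buffer in the cache hierarchy; for pre-faulting alone, one write per page would be enough.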