From 8f47876d872eb26e82efcff5427cb251099292f8 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Mon, 8 Jun 2020 10:54:34 +0200 Subject: [PATCH] Add standard divide and conquer matrix multiplication test for comparison. --- app/benchmark_matrix_div_conquer/main.cpp | 286 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------------------------------------------------------------------------------------- extern/benchmark_base/CMakeLists.txt | 3 ++- extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/internal/scheduling/scheduler_impl.h | 1 - lib/pls/include/pls/internal/scheduling/strain_local_resource.h | 3 +-- lib/pls/include/pls/pls.h | 4 ++++ 6 files changed, 228 insertions(+), 165 deletions(-) create mode 100644 extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp index 2b26022..9ba2cc7 100644 --- a/app/benchmark_matrix_div_conquer/main.cpp +++ b/app/benchmark_matrix_div_conquer/main.cpp @@ -1,194 +1,158 @@ -//#include "pls/pls.h" -//using namespace pls; +#include "pls/pls.h" +using namespace pls; #include "benchmark_runner.h" +#include "benchmark_base/matrix_div_conquer.h" + +using namespace comparison_benchmarks::base; #include #include -#include - -// Helpers to directly index into blocked matrices -const size_t MAX_BLOCK_LOOKUP = 256; -std::array, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN -void fill_block_lookup(size_t size) { - if (size <= 1) { - BLOCK_LOOKUP[0][0] = 0; - return; - } - - fill_block_lookup(size / 2); - - size_t elements_per_quarter = (size / 2) * (size / 2); - for (size_t row = 0; row < size / 2; row++) { - for (size_t column = 0; column < size / 2; column++) { - BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter; - BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter; - BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter; - } - } -} -class blocked_matrix_view { - public: - blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} - - blocked_matrix_view quadrant_1_1() { - size_t elements_per_quarter = (size_ / 2) * (size_ / 2); - return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); - } - blocked_matrix_view quadrant_1_2() { - size_t elements_per_quarter = (size_ / 2) * (size_ / 2); - return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2); - } - blocked_matrix_view quadrant_2_1() { - size_t elements_per_quarter = (size_ / 2) * (size_ / 2); - return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2); - } - blocked_matrix_view quadrant_2_2() { - size_t elements_per_quarter = (size_ / 2) * (size_ / 2); - return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2); - } - - double &at(size_t row, size_t column) { - return data_[BLOCK_LOOKUP[row][column]]; - } - - double *get_data() { - return data_; - } - - private: - double *data_; - size_t size_; -}; +#include -void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { - for (size_t i = 0; i < size; i++) { - for (size_t j = 0; j < size; j++) { - result.at(i, j) = 0; - } - for (size_t j = 0; j < size; j++) { - for (size_t k = 0; k < size; k++) { - result.at(i, j) += a.at(i, k) * b.at(k, j); - } - } - } -} +void multiply_div_conquer(const std::vector>>> &tmp_arrays, + pls::strain_local_resource &local_indices, + size_t size, + size_t depth, + matrix_div_conquer::blocked_matrix_view &result, + matrix_div_conquer::blocked_matrix_view &a, + matrix_div_conquer::blocked_matrix_view &b) { -void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { if (size <= 8) { multiply_naive(size, result, a, b); return; } // Temporary storage required for the intermediate results - std::unique_ptr data_1_1_a{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_1_1_b{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_1_2_a{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_1_2_b{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_2_1_a{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_2_1_b{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_2_2_a{new double[(size / 2) * (size / 2)]}; - std::unique_ptr data_2_2_b{new double[(size / 2) * (size / 2)]}; + auto strain_local_index = local_indices.get_item(depth); + std::unique_ptr const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0]; + std::unique_ptr const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1]; + std::unique_ptr const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2]; + std::unique_ptr const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3]; + std::unique_ptr const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4]; + std::unique_ptr const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5]; + std::unique_ptr const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6]; + std::unique_ptr const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7]; // Handles to sub-matrices used - blocked_matrix_view result_1_1 = result.quadrant_1_1(); - blocked_matrix_view result_1_2 = result.quadrant_1_2(); - blocked_matrix_view result_2_1 = result.quadrant_2_1(); - blocked_matrix_view result_2_2 = result.quadrant_2_2(); - - blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; - blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; - blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; - blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; - blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; - blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; - blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; - blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; - - blocked_matrix_view a_1_1 = a.quadrant_1_1(); - blocked_matrix_view a_1_2 = a.quadrant_1_2(); - blocked_matrix_view a_2_1 = a.quadrant_2_1(); - blocked_matrix_view a_2_2 = a.quadrant_2_2(); - - blocked_matrix_view b_1_1 = b.quadrant_1_1(); - blocked_matrix_view b_1_2 = b.quadrant_1_2(); - blocked_matrix_view b_2_1 = b.quadrant_2_1(); - blocked_matrix_view b_2_2 = b.quadrant_2_2(); + matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1(); + matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2(); + matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1(); + matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2(); + + matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; + matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; + + matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1(); + matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2(); + matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1(); + matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2(); + + matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1(); + matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2(); + matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1(); + matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2(); // Divide Work Into Sub-Calls - multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1); - multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1); - - multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2); - multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2); - - multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1); - multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1); - - multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2); - multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2); + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); } + ); + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); } + ); + + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); } + ); + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); } + ); + + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); } + ); + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); } + ); + + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); } + ); + pls::spawn( + [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); } + ); + + pls::sync(); // Combine results - for (size_t row = 0; row < size / 2; row++) { - for (size_t column = 0; column < size / 2; column++) { - result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column); - result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column); - result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column); - result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column); - } + for (size_t i = 0; i < (size / 2) * (size / 2); i++) { + // The layout is not important here, ass all have the same order, so just sum element wise + result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i]; + result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i]; + result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i]; + result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i]; } } constexpr int MAX_NUM_TASKS = 32; -constexpr int MAX_STACK_SIZE = 4096 * 1; +constexpr int MAX_STACK_SIZE = 4096 * 2; int main(int argc, char **argv) { - fill_block_lookup(MAX_BLOCK_LOOKUP); + const size_t size = matrix_div_conquer::MATRIX_SIZE; - size_t size = 64; - std::unique_ptr result_data_naive{new double[size * size]}; - std::unique_ptr result_data_div{new double[size * size]}; + int num_threads; + string directory; + benchmark_runner::read_args(argc, argv, num_threads, directory); + + string test_name = to_string(num_threads) + ".csv"; + string full_directory = directory + "/PLS_v3/"; + benchmark_runner runner{full_directory, test_name}; + + // Only run on one version to avoid copy + std::unique_ptr result_data{new double[size * size]}; std::unique_ptr a_data{new double[size * size]}; std::unique_ptr b_data{new double[size * size]}; - blocked_matrix_view result_naive{result_data_naive.get(), size}; - blocked_matrix_view result_div{result_data_div.get(), size}; - blocked_matrix_view a{a_data.get(), size}; - blocked_matrix_view b{b_data.get(), size}; - - for (size_t row = 0; row < size; row++) { - for (size_t column = 0; column < size; column++) { - a.at(row, column) = row; - b.at(row, column) = column; + matrix_div_conquer::blocked_matrix_view a{a_data.get(), size}; + matrix_div_conquer::blocked_matrix_view b{b_data.get(), size}; + matrix_div_conquer::blocked_matrix_view result{result_data.get(), size}; + + // Fill data arrays as needed + a.fill_default_data(); + b.fill_default_data(); + matrix_div_conquer::fill_block_lookup(size); + + // Strain local data + std::vector>>> div_conquer_temp_arrays; + size_t max_depth = 0; + size_t remaining_size = size; + while (remaining_size > 1) { + auto &depth_buffers = div_conquer_temp_arrays.emplace_back(); + for (int thread_id = 0; thread_id < 8; thread_id++) { + auto &depth_thread_buffers = depth_buffers.emplace_back(); + for (int i = 0; i < 8; i++) { + depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]); + } } + + max_depth++; + remaining_size = remaining_size / 2; } + pls::strain_local_resource local_indices{(unsigned) num_threads, (unsigned) max_depth}; - multiply_div_conquer(size, result_div, a, b); - multiply_naive(size, result_naive, a, b); + scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE}; - size_t misses = 0; - for (size_t row = 0; row < size; row++) { - for (size_t column = 0; column < size; column++) { - if (result_div.at(row, column) != result_naive.at(row, column)) { - misses++; - printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column)); - } - } - } - printf("\n%d", misses); - -// int num_threads; -// string directory; -// benchmark_runner::read_args(argc, argv, num_threads, directory); -// -// string test_name = to_string(num_threads) + ".csv"; -// string full_directory = directory + "/PLS_v3/"; -// benchmark_runner runner{full_directory, test_name}; -// -// scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE}; -// -// runner.run_iterations(1000, [&]() { -// scheduler.perform_work([&]() { -// }); -// }, 100); -// runner.commit_results(true); + runner.run_iterations(1, [&]() { + scheduler.perform_work([&]() { + multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b); + }); + }, 0); + runner.commit_results(true); + + scheduler.terminate(); } diff --git a/extern/benchmark_base/CMakeLists.txt b/extern/benchmark_base/CMakeLists.txt index 890a006..da76867 100644 --- a/extern/benchmark_base/CMakeLists.txt +++ b/extern/benchmark_base/CMakeLists.txt @@ -8,7 +8,8 @@ add_library(benchmark_base STATIC include/benchmark_base/matrix.h include/benchmark_base/unbalanced.h src/unbalanced.cpp include/benchmark_base/range.h - include/benchmark_base/fib.h) + include/benchmark_base/fib.h + include/benchmark_base/matrix_div_conquer.h) target_include_directories(benchmark_base PUBLIC diff --git a/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h new file mode 100644 index 0000000..9002de1 --- /dev/null +++ b/extern/benchmark_base/include/benchmark_base/matrix_div_conquer.h @@ -0,0 +1,96 @@ +#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H +#define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H + +#include + +namespace comparison_benchmarks { +namespace base { +namespace matrix_div_conquer { + +const int MATRIX_SIZE = 128; +const int CUTOFF_SIZE = 8; + +const int NUM_ITERATIONS = 100; +const int WARMUP_ITERATIONS = 10; + +// Helpers to directly index into blocked matrices +const size_t MAX_SIZE = 128; +std::array, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN +void fill_block_lookup(size_t size = MAX_SIZE) { + if (size <= 1) { + BLOCK_LOOKUP[0][0] = 0; + return; + } + + fill_block_lookup(size / 2); + + size_t elements_per_quarter = (size / 2) * (size / 2); + for (size_t row = 0; row < size / 2; row++) { + for (size_t column = 0; column < size / 2; column++) { + BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter; + BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter; + BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter; + } + } +} + +class blocked_matrix_view { + public: + blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} + + void fill_default_data() { + for (size_t row = 0; row < size_; row++) { + for (size_t column = 0; column < size_; column++) { + at(row, column) = row; + } + } + } + + blocked_matrix_view quadrant_1_1() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_1_2() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_2_1() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_2_2() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2); + } + + double &at(size_t row, size_t column) { + return data_[BLOCK_LOOKUP[row][column]]; + } + + double *get_data() { + return data_; + } + + private: + double *data_; + size_t size_; +}; + +void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < size; j++) { + result.at(i, j) = 0; + } + for (size_t j = 0; j < size; j++) { + for (size_t k = 0; k < size; k++) { + result.at(i, j) += a.at(i, k) * b.at(k, j); + } + } + } +} + +} +} +} + +#endif // COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h index 48c1b9b..fa9cf9c 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h @@ -154,7 +154,6 @@ template void scheduler::spawn_internal(Function &&lambda) { if (thread_state::is_scheduler_active()) { thread_state &spawning_state = thread_state::get(); - scheduler &scheduler = spawning_state.get_scheduler(); base_task *last_task = spawning_state.get_active_task(); base_task *spawned_task = last_task->next_; diff --git a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h index 36fafa3..18c79e5 100644 --- a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h +++ b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h @@ -60,8 +60,7 @@ class strain_local_resource { }; strain_local_resource(unsigned num_threads, - unsigned depth) : local_items_() { - local_items_.reserve(num_threads); + unsigned depth) : local_items_(num_threads) { for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) { local_items_[thread_id].reserve(depth); for (unsigned i = 0; i < depth; i++) { diff --git a/lib/pls/include/pls/pls.h b/lib/pls/include/pls/pls.h index 645d65f..15c1ebf 100644 --- a/lib/pls/include/pls/pls.h +++ b/lib/pls/include/pls/pls.h @@ -8,6 +8,7 @@ #include "pls/algorithms/reduce.h" #include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/strain_local_resource.h" #include "pls/internal/helpers/range.h" #include "pls/internal/helpers/member_function.h" @@ -28,6 +29,9 @@ static void serial(Function &&function) { scheduler::serial(std::forward(function)); } +// strain local resource support (rather low-level) +using internal::scheduling::strain_local_resource; + // general helpers that can be handy when using PLS template using member_function = internal::helpers::member_function; -- libgit2 0.26.0