From 92ee564c6a6d6a311478fcaa819e64b551d3bbfd Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Thu, 4 Jun 2020 12:52:51 +0200 Subject: [PATCH] WIP: Start work on divide and conquer example with temporary buffers. --- CMakeLists.txt | 1 + app/benchmark_fib/main.cpp | 14 ++++++-------- app/benchmark_matrix_div_conquer/CMakeLists.txt | 5 +++++ app/benchmark_matrix_div_conquer/main.cpp | 194 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/CMakeLists.txt | 2 +- lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h | 5 +++++ lib/pls/include/pls/internal/base/system_details.h | 2 +- lib/pls/include/pls/internal/scheduling/strain_local_resource.h | 6 +++--- lib/pls/src/internal/scheduling/strain_local_resource.cpp | 11 +++++++---- 9 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 app/benchmark_matrix_div_conquer/CMakeLists.txt create mode 100644 app/benchmark_matrix_div_conquer/main.cpp create mode 100644 lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f2f432..215aa65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,7 @@ add_subdirectory(app/playground) add_subdirectory(app/benchmark_fft) add_subdirectory(app/benchmark_unbalanced) add_subdirectory(app/benchmark_matrix) +add_subdirectory(app/benchmark_matrix_div_conquer) add_subdirectory(app/benchmark_fib) add_subdirectory(app/context_switch) diff --git a/app/benchmark_fib/main.cpp b/app/benchmark_fib/main.cpp index 44ea705..a76b1a0 100644 --- a/app/benchmark_fib/main.cpp +++ b/app/benchmark_fib/main.cpp @@ -1,7 +1,5 @@ #include "pls/pls.h" -#include - #include "benchmark_runner.h" #include "benchmark_base/fib.h" @@ -10,7 +8,7 @@ using namespace comparison_benchmarks::base; constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_STACK_SIZE = 4096 * 1; -int pls_fib(int n) { 
+int pls_fib(int n, int d) { if (n == 0) { return 0; } @@ -19,11 +17,11 @@ int pls_fib(int n) { } int a, b; - pls::spawn([n, &a]() { - a = pls_fib(n - 1); + pls::spawn([n, d, &a]() { + a = pls_fib(n - 1, d + 1); }); - pls::spawn([n, &b]() { - b = pls_fib(n - 2); + pls::spawn([n, d, &b]() { + b = pls_fib(n - 2, d + 1); }); pls::sync(); @@ -45,7 +43,7 @@ int main(int argc, char **argv) { // scheduler.get_profiler().disable_memory_measure(); runner.run_iterations(fib::NUM_ITERATIONS, [&]() { scheduler.perform_work([&]() { - res = pls_fib(fib::INPUT_N); + res = pls_fib(fib::INPUT_N, 0); }); }, fib::NUM_WARMUP_ITERATIONS); // scheduler.get_profiler().current_run().print_dag(std::cout); diff --git a/app/benchmark_matrix_div_conquer/CMakeLists.txt b/app/benchmark_matrix_div_conquer/CMakeLists.txt new file mode 100644 index 0000000..4d510c0 --- /dev/null +++ b/app/benchmark_matrix_div_conquer/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp) +target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base) +if (EASY_PROFILER) + target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler) +endif () diff --git a/app/benchmark_matrix_div_conquer/main.cpp b/app/benchmark_matrix_div_conquer/main.cpp new file mode 100644 index 0000000..2b26022 --- /dev/null +++ b/app/benchmark_matrix_div_conquer/main.cpp @@ -0,0 +1,194 @@ +//#include "pls/pls.h" +//using namespace pls; + +#include "benchmark_runner.h" + +#include +#include +#include + +// Helpers to directly index into blocked matrices +const size_t MAX_BLOCK_LOOKUP = 256; +std::array, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN +void fill_block_lookup(size_t size) { + if (size <= 1) { + BLOCK_LOOKUP[0][0] = 0; + return; + } + + fill_block_lookup(size / 2); + + size_t elements_per_quarter = (size / 2) * (size / 2); + for (size_t row = 0; row < size / 2; row++) { + for (size_t column = 0; column < size / 2; column++) { + BLOCK_LOOKUP[row][size 
/ 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter; + BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter; + BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter; + } + } +} +class blocked_matrix_view { + public: + blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} + + blocked_matrix_view quadrant_1_1() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_1_2() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_2_1() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2); + } + blocked_matrix_view quadrant_2_2() { + size_t elements_per_quarter = (size_ / 2) * (size_ / 2); + return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2); + } + + double &at(size_t row, size_t column) { + return data_[BLOCK_LOOKUP[row][column]]; + } + + double *get_data() { + return data_; + } + + private: + double *data_; + size_t size_; +}; + +void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { + for (size_t i = 0; i < size; i++) { + for (size_t j = 0; j < size; j++) { + result.at(i, j) = 0; + } + for (size_t j = 0; j < size; j++) { + for (size_t k = 0; k < size; k++) { + result.at(i, j) += a.at(i, k) * b.at(k, j); + } + } + } +} + +void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { + if (size <= 8) { + multiply_naive(size, result, a, b); + return; + } + // Temporary storage required for the intermediate results + std::unique_ptr data_1_1_a{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_1_1_b{new 
double[(size / 2) * (size / 2)]}; + std::unique_ptr data_1_2_a{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_1_2_b{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_2_1_a{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_2_1_b{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_2_2_a{new double[(size / 2) * (size / 2)]}; + std::unique_ptr data_2_2_b{new double[(size / 2) * (size / 2)]}; + + // Handles to sub-matrices used + blocked_matrix_view result_1_1 = result.quadrant_1_1(); + blocked_matrix_view result_1_2 = result.quadrant_1_2(); + blocked_matrix_view result_2_1 = result.quadrant_2_1(); + blocked_matrix_view result_2_2 = result.quadrant_2_2(); + + blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; + blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; + blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; + blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; + blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; + blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; + blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; + blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; + + blocked_matrix_view a_1_1 = a.quadrant_1_1(); + blocked_matrix_view a_1_2 = a.quadrant_1_2(); + blocked_matrix_view a_2_1 = a.quadrant_2_1(); + blocked_matrix_view a_2_2 = a.quadrant_2_2(); + + blocked_matrix_view b_1_1 = b.quadrant_1_1(); + blocked_matrix_view b_1_2 = b.quadrant_1_2(); + blocked_matrix_view b_2_1 = b.quadrant_2_1(); + blocked_matrix_view b_2_2 = b.quadrant_2_2(); + + // Divide Work Into Sub-Calls + multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1); + multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1); + + multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2); + multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2); + + multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1); + multiply_div_conquer(size / 2, 
result_2_1_b, a_2_2, b_2_1); + + multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2); + multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2); + + // Combine results + for (size_t row = 0; row < size / 2; row++) { + for (size_t column = 0; column < size / 2; column++) { + result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column); + result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column); + result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column); + result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column); + } + } +} + +constexpr int MAX_NUM_TASKS = 32; +constexpr int MAX_STACK_SIZE = 4096 * 1; + +int main(int argc, char **argv) { + fill_block_lookup(MAX_BLOCK_LOOKUP); + + size_t size = 64; + std::unique_ptr result_data_naive{new double[size * size]}; + std::unique_ptr result_data_div{new double[size * size]}; + std::unique_ptr a_data{new double[size * size]}; + std::unique_ptr b_data{new double[size * size]}; + + blocked_matrix_view result_naive{result_data_naive.get(), size}; + blocked_matrix_view result_div{result_data_div.get(), size}; + blocked_matrix_view a{a_data.get(), size}; + blocked_matrix_view b{b_data.get(), size}; + + for (size_t row = 0; row < size; row++) { + for (size_t column = 0; column < size; column++) { + a.at(row, column) = row; + b.at(row, column) = column; + } + } + + multiply_div_conquer(size, result_div, a, b); + multiply_naive(size, result_naive, a, b); + + size_t misses = 0; + for (size_t row = 0; row < size; row++) { + for (size_t column = 0; column < size; column++) { + if (result_div.at(row, column) != result_naive.at(row, column)) { + misses++; + printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column)); + } + } + } + printf("\n%zu", misses); + +// int num_threads; +// string directory; +// benchmark_runner::read_args(argc, argv, num_threads, directory); +// +// string 
test_name = to_string(num_threads) + ".csv"; +// string full_directory = directory + "/PLS_v3/"; +// benchmark_runner runner{full_directory, test_name}; +// +// scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE}; +// +// runner.run_iterations(1000, [&]() { +// scheduler.perform_work([&]() { +// }); +// }, 100); +// runner.commit_results(true); +} diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 4c42f70..a580128 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -48,7 +48,7 @@ add_library(pls STATIC include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp - include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp) + include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp include/pls/algorithms/divide_and_conquer_buffers.h) # Dependencies for pls target_link_libraries(pls Threads::Threads) diff --git a/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h b/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h new file mode 100644 index 0000000..8786c30 --- /dev/null +++ b/lib/pls/include/pls/algorithms/divide_and_conquer_buffers.h @@ -0,0 +1,5 @@ + +#ifndef PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ +#define PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ + +#endif //PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_ diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h index 68821a8..5430bdc 100644 --- a/lib/pls/include/pls/internal/base/system_details.h +++ b/lib/pls/include/pls/internal/base/system_details.h @@ -47,7 +47,7 @@ constexpr size_t CACHE_LINE_SIZE = 64; /** * Helper to align types/values on cache lines. 
*/ -#define PLS_CACHE_ALIGN alignas(base::system_details::CACHE_LINE_SIZE) +#define PLS_CACHE_ALIGN alignas(::pls::internal::base::system_details::CACHE_LINE_SIZE) /** * Helper to find mmap page size. Either set constant or rely on system specific getter function. diff --git a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h index 27f62d6..36fafa3 100644 --- a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h +++ b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h @@ -24,7 +24,7 @@ struct PLS_CACHE_ALIGN strain_resource { unsigned const index_; unsigned const depth_; - bool used_{false}; + std::atomic used_{0}; std::atomic next_{nullptr}; }; @@ -48,7 +48,7 @@ class strain_local_resource { item_handle(item_handle &&) = delete; item_handle &operator=(item_handle &&) = delete; - explicit item_handle(strain_resource *resource) : resource_{resource} {} + explicit item_handle(strain_resource *resource); ~item_handle(); unsigned get_strain_index() { @@ -56,7 +56,7 @@ class strain_local_resource { } private: - strain_resource *resource_; + strain_resource *const resource_; }; strain_local_resource(unsigned num_threads, diff --git a/lib/pls/src/internal/scheduling/strain_local_resource.cpp b/lib/pls/src/internal/scheduling/strain_local_resource.cpp index e1efec3..f40362d 100644 --- a/lib/pls/src/internal/scheduling/strain_local_resource.cpp +++ b/lib/pls/src/internal/scheduling/strain_local_resource.cpp @@ -23,11 +23,14 @@ strain_local_resource::item_handle strain_local_resource::get_item(unsigned dept active_task->attached_resources_.store(result, std::memory_order_relaxed); // Wrap it for RAII usage on stack - PLS_ASSERT(!result->used_, "Must not try to allocate an already used resource!"); - result->used_ = true; return strain_local_resource::item_handle{result}; } +strain_local_resource::item_handle::item_handle(strain_resource *resource) : resource_{resource} { 
+ PLS_ASSERT(resource_->used_.fetch_add(1, std::memory_order_relaxed) == 0, + "Must not create a handle of an already used resource!"); + +} + // Return item to locally owned items strain_local_resource::item_handle::~item_handle() { // Only change our resource usage when synced. @@ -54,8 +57,8 @@ strain_local_resource::item_handle::~item_handle() { // Give the resource handle back to our local resource array auto &local_resource = resource_->strain_local_resource_->local_items_[my_state.get_thread_id()][resource_->depth_]; local_resource.resource_ = resource_; - PLS_ASSERT(resource_->used_, "Must only release used resources!"); - resource_->used_ = false; + PLS_ASSERT(resource_->used_.fetch_sub(1, std::memory_order_relaxed) == 1, + "Accidentally freed resource that was accessed multiple times!"); } strain_resource *strain_local_resource::get_local_copy(strain_resource *other_resources, unsigned thread_id) { -- libgit2 0.26.0