Commit 8f47876d by FritzFlorian

Add standard divide and conquer matrix multiplication test for comparison.

parent 92ee564c
Pipeline #1502 passed with stages
in 4 minutes 13 seconds
//#include "pls/pls.h" #include "pls/pls.h"
//using namespace pls; using namespace pls;
#include "benchmark_runner.h" #include "benchmark_runner.h"
#include "benchmark_base/matrix_div_conquer.h"
using namespace comparison_benchmarks::base;
#include <memory> #include <memory>
#include <array> #include <array>
#include <math.h> #include <vector>
// Helpers to directly index into blocked matrices
const size_t MAX_BLOCK_LOOKUP = 256;
std::array<std::array<size_t, MAX_BLOCK_LOOKUP>, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN

// Fills BLOCK_LOOKUP[row][column] with the linear offset of that element in the
// recursive quadrant ("Z-order") layout used by blocked_matrix_view:
// quadrant 1_1 first, then 1_2, 2_1, 2_2, each itself laid out the same way.
// Works bottom-up: the table for size/2 is the top-left quarter of the table
// for size, so the smaller table is built first and then mirrored with offsets.
void fill_block_lookup(size_t size) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }
  const size_t half = size / 2;
  fill_block_lookup(half);
  const size_t quarter_elements = half * half;
  for (size_t r = 0; r < half; r++) {
    for (size_t c = 0; c < half; c++) {
      const size_t base = BLOCK_LOOKUP[r][c];
      BLOCK_LOOKUP[r][half + c] = base + quarter_elements;          // quadrant 1_2
      BLOCK_LOOKUP[half + r][c] = base + 2 * quarter_elements;      // quadrant 2_1
      BLOCK_LOOKUP[half + r][half + c] = base + 3 * quarter_elements; // quadrant 2_2
    }
  }
}
// Non-owning view over a square matrix stored in the recursive quadrant
// ("Z-order") layout described by the global BLOCK_LOOKUP table.
// The four quadrants of a size x size matrix are stored contiguously in the
// order 1_1, 1_2, 2_1, 2_2, each (size/2)*(size/2) doubles long.
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  blocked_matrix_view quadrant_1_1() { return quadrant(0); }
  blocked_matrix_view quadrant_1_2() { return quadrant(1); }
  blocked_matrix_view quadrant_2_1() { return quadrant(2); }
  blocked_matrix_view quadrant_2_2() { return quadrant(3); }

  // Element access via the precomputed layout table. Note: indexes the global
  // BLOCK_LOOKUP, so fill_block_lookup must have been run for at least size_.
  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  // Raw pointer to the underlying (externally owned) storage.
  double *get_data() {
    return data_;
  }

 private:
  // Shared implementation of the four quadrant accessors; index is the
  // quadrant's position (0..3) in the contiguous layout.
  blocked_matrix_view quadrant(size_t index) {
    const size_t quarter_elements = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + index * quarter_elements, size_ / 2);
  }

  double *data_;
  size_t size_;
};
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
for (size_t i = 0; i < size; i++) { pls::strain_local_resource &local_indices,
for (size_t j = 0; j < size; j++) { size_t size,
result.at(i, j) = 0; size_t depth,
} matrix_div_conquer::blocked_matrix_view &result,
for (size_t j = 0; j < size; j++) { matrix_div_conquer::blocked_matrix_view &a,
for (size_t k = 0; k < size; k++) { matrix_div_conquer::blocked_matrix_view &b) {
result.at(i, j) += a.at(i, k) * b.at(k, j);
}
}
}
}
void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
if (size <= 8) { if (size <= 8) {
multiply_naive(size, result, a, b); multiply_naive(size, result, a, b);
return; return;
} }
// Temporary storage required for the intermediate results // Temporary storage required for the intermediate results
std::unique_ptr<double[]> data_1_1_a{new double[(size / 2) * (size / 2)]}; auto strain_local_index = local_indices.get_item(depth);
std::unique_ptr<double[]> data_1_1_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
std::unique_ptr<double[]> data_1_2_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
std::unique_ptr<double[]> data_1_2_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
std::unique_ptr<double[]> data_2_1_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
std::unique_ptr<double[]> data_2_1_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
std::unique_ptr<double[]> data_2_2_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
std::unique_ptr<double[]> data_2_2_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
// Handles to sub-matrices used // Handles to sub-matrices used
blocked_matrix_view result_1_1 = result.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
blocked_matrix_view result_1_2 = result.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2();
blocked_matrix_view result_2_1 = result.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1();
blocked_matrix_view result_2_2 = result.quadrant_2_2(); matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2();
blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};
blocked_matrix_view a_1_1 = a.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1();
blocked_matrix_view a_1_2 = a.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2();
blocked_matrix_view a_2_1 = a.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1();
blocked_matrix_view a_2_2 = a.quadrant_2_2(); matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2();
blocked_matrix_view b_1_1 = b.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1();
blocked_matrix_view b_1_2 = b.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2();
blocked_matrix_view b_2_1 = b.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1();
blocked_matrix_view b_2_2 = b.quadrant_2_2(); matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2();
// Divide Work Into Sub-Calls // Divide Work Into Sub-Calls
multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1); pls::spawn(
multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1); [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
);
multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2); pls::spawn(
multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2); [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
);
multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1);
multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1); pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2); );
multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2); pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
);
pls::sync();
// Combine results // Combine results
for (size_t row = 0; row < size / 2; row++) { for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
for (size_t column = 0; column < size / 2; column++) { // The layout is not important here, ass all have the same order, so just sum element wise
result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column); result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i];
result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column); result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i];
result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column); result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i];
result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column); result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i];
}
} }
} }
constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 4096 * 1; constexpr int MAX_STACK_SIZE = 4096 * 2;
int main(int argc, char **argv) { int main(int argc, char **argv) {
fill_block_lookup(MAX_BLOCK_LOOKUP); const size_t size = matrix_div_conquer::MATRIX_SIZE;
size_t size = 64; int num_threads;
std::unique_ptr<double[]> result_data_naive{new double[size * size]}; string directory;
std::unique_ptr<double[]> result_data_div{new double[size * size]}; benchmark_runner::read_args(argc, argv, num_threads, directory);
string test_name = to_string(num_threads) + ".csv";
string full_directory = directory + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name};
// Only run on one version to avoid copy
std::unique_ptr<double[]> result_data{new double[size * size]};
std::unique_ptr<double[]> a_data{new double[size * size]}; std::unique_ptr<double[]> a_data{new double[size * size]};
std::unique_ptr<double[]> b_data{new double[size * size]}; std::unique_ptr<double[]> b_data{new double[size * size]};
blocked_matrix_view result_naive{result_data_naive.get(), size}; matrix_div_conquer::blocked_matrix_view a{a_data.get(), size};
blocked_matrix_view result_div{result_data_div.get(), size}; matrix_div_conquer::blocked_matrix_view b{b_data.get(), size};
blocked_matrix_view a{a_data.get(), size}; matrix_div_conquer::blocked_matrix_view result{result_data.get(), size};
blocked_matrix_view b{b_data.get(), size};
for (size_t row = 0; row < size; row++) {
for (size_t column = 0; column < size; column++) {
a.at(row, column) = row;
b.at(row, column) = column;
}
}
multiply_div_conquer(size, result_div, a, b); // Fill data arrays as needed
multiply_naive(size, result_naive, a, b); a.fill_default_data();
b.fill_default_data();
matrix_div_conquer::fill_block_lookup(size);
size_t misses = 0; // Strain local data
for (size_t row = 0; row < size; row++) { std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
for (size_t column = 0; column < size; column++) { size_t max_depth = 0;
if (result_div.at(row, column) != result_naive.at(row, column)) { size_t remaining_size = size;
misses++; while (remaining_size > 1) {
printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column)); auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
for (int thread_id = 0; thread_id < 8; thread_id++) {
auto &depth_thread_buffers = depth_buffers.emplace_back();
for (int i = 0; i < 8; i++) {
depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
} }
} }
max_depth++;
remaining_size = remaining_size / 2;
} }
printf("\n%d", misses); pls::strain_local_resource local_indices{(unsigned) num_threads, (unsigned) max_depth};
// int num_threads; scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
// string directory;
// benchmark_runner::read_args(argc, argv, num_threads, directory); runner.run_iterations(1, [&]() {
// scheduler.perform_work([&]() {
// string test_name = to_string(num_threads) + ".csv"; multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
// string full_directory = directory + "/PLS_v3/"; });
// benchmark_runner runner{full_directory, test_name}; }, 0);
// runner.commit_results(true);
// scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
// scheduler.terminate();
// runner.run_iterations(1000, [&]() {
// scheduler.perform_work([&]() {
// });
// }, 100);
// runner.commit_results(true);
} }
...@@ -8,7 +8,8 @@ add_library(benchmark_base STATIC ...@@ -8,7 +8,8 @@ add_library(benchmark_base STATIC
include/benchmark_base/matrix.h include/benchmark_base/matrix.h
include/benchmark_base/unbalanced.h src/unbalanced.cpp include/benchmark_base/unbalanced.h src/unbalanced.cpp
include/benchmark_base/range.h include/benchmark_base/range.h
include/benchmark_base/fib.h) include/benchmark_base/fib.h
include/benchmark_base/matrix_div_conquer.h)
target_include_directories(benchmark_base target_include_directories(benchmark_base
PUBLIC PUBLIC
......
#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
#define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
#include <array>
namespace comparison_benchmarks {
namespace base {
namespace matrix_div_conquer {
// Benchmark configuration shared by all implementations.
constexpr int MATRIX_SIZE = 128;
constexpr int CUTOFF_SIZE = 8;
constexpr int NUM_ITERATIONS = 100;
constexpr int WARMUP_ITERATIONS = 10;

// Helpers to directly index into blocked matrices.
constexpr size_t MAX_SIZE = 128;
// 'inline' (C++17) is required: this header defines (not just declares) the
// table and the function below, and without inline every translation unit
// including the header would emit its own definition -> ODR / link errors.
inline std::array<std::array<size_t, MAX_SIZE>, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN

// Fills BLOCK_LOOKUP[row][column] with the linear offset of that element in
// the recursive quadrant ("Z-order") layout used by blocked_matrix_view.
// Built bottom-up: the table for size/2 is the top-left quarter of the table
// for size, so it is filled first and then offset into the other quadrants.
inline void fill_block_lookup(size_t size = MAX_SIZE) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }
  fill_block_lookup(size / 2);

  size_t elements_per_quarter = (size / 2) * (size / 2);
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
    }
  }
}
// Non-owning view over a square matrix stored in the recursive quadrant
// ("Z-order") layout described by BLOCK_LOOKUP. The four quadrants of a
// size x size matrix sit contiguously in the order 1_1, 1_2, 2_1, 2_2,
// each (size/2)*(size/2) doubles long.
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  // Seeds every element with its row index (deterministic benchmark input).
  void fill_default_data() {
    for (size_t r = 0; r < size_; r++) {
      for (size_t c = 0; c < size_; c++) {
        at(r, c) = r;
      }
    }
  }

  blocked_matrix_view quadrant_1_1() { return quadrant(0); }
  blocked_matrix_view quadrant_1_2() { return quadrant(1); }
  blocked_matrix_view quadrant_2_1() { return quadrant(2); }
  blocked_matrix_view quadrant_2_2() { return quadrant(3); }

  // Element access via the precomputed layout table. Indexes the shared
  // BLOCK_LOOKUP, so fill_block_lookup must have been run for at least size_.
  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  // Raw pointer to the underlying (externally owned) storage.
  double *get_data() {
    return data_;
  }

 private:
  // Shared implementation of the four quadrant accessors; index is the
  // quadrant's position (0..3) in the contiguous layout.
  blocked_matrix_view quadrant(size_t index) {
    const size_t quarter_elements = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + index * quarter_elements, size_ / 2);
  }

  double *data_;
  size_t size_;
};
// Reference O(n^3) schoolbook matrix multiplication on blocked views:
// result = a * b. Used as the recursion cutoff and correctness baseline.
// 'inline' added: this function is *defined* in a header, so without inline
// every translation unit including it would emit its own definition,
// causing multiple-definition link errors (ODR).
inline void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t i = 0; i < size; i++) {
    // Zero the output row before accumulating into it.
    for (size_t j = 0; j < size; j++) {
      result.at(i, j) = 0;
    }
    for (size_t j = 0; j < size; j++) {
      for (size_t k = 0; k < size; k++) {
        result.at(i, j) += a.at(i, k) * b.at(k, j);
      }
    }
  }
}
}
}
}
#endif // COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
...@@ -154,7 +154,6 @@ template<typename Function> ...@@ -154,7 +154,6 @@ template<typename Function>
void scheduler::spawn_internal(Function &&lambda) { void scheduler::spawn_internal(Function &&lambda) {
if (thread_state::is_scheduler_active()) { if (thread_state::is_scheduler_active()) {
thread_state &spawning_state = thread_state::get(); thread_state &spawning_state = thread_state::get();
scheduler &scheduler = spawning_state.get_scheduler();
base_task *last_task = spawning_state.get_active_task(); base_task *last_task = spawning_state.get_active_task();
base_task *spawned_task = last_task->next_; base_task *spawned_task = last_task->next_;
......
...@@ -60,8 +60,7 @@ class strain_local_resource { ...@@ -60,8 +60,7 @@ class strain_local_resource {
}; };
strain_local_resource(unsigned num_threads, strain_local_resource(unsigned num_threads,
unsigned depth) : local_items_() { unsigned depth) : local_items_(num_threads) {
local_items_.reserve(num_threads);
for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) { for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
local_items_[thread_id].reserve(depth); local_items_[thread_id].reserve(depth);
for (unsigned i = 0; i < depth; i++) { for (unsigned i = 0; i < depth; i++) {
......
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include "pls/algorithms/reduce.h" #include "pls/algorithms/reduce.h"
#include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/strain_local_resource.h"
#include "pls/internal/helpers/range.h" #include "pls/internal/helpers/range.h"
#include "pls/internal/helpers/member_function.h" #include "pls/internal/helpers/member_function.h"
...@@ -28,6 +29,9 @@ static void serial(Function &&function) { ...@@ -28,6 +29,9 @@ static void serial(Function &&function) {
scheduler::serial(std::forward<Function>(function)); scheduler::serial(std::forward<Function>(function));
} }
// strain local resource support (rather low-level)
using internal::scheduling::strain_local_resource;
// general helpers that can be handy when using PLS // general helpers that can be handy when using PLS
template<class C, typename R, typename ...ARGS> template<class C, typename R, typename ...ARGS>
using member_function = internal::helpers::member_function<C, R, ARGS...>; using member_function = internal::helpers::member_function<C, R, ARGS...>;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment