Commit 8f47876d by FritzFlorian

Add standard divide and conquer matrix multiplication test for comparison.

parent 92ee564c
Pipeline #1502 passed with stages
in 4 minutes 13 seconds
//#include "pls/pls.h"
//using namespace pls;
#include "pls/pls.h"
using namespace pls;
#include "benchmark_runner.h"
#include "benchmark_base/matrix_div_conquer.h"
using namespace comparison_benchmarks::base;
#include <memory>
#include <array>
#include <math.h>
// Helpers to directly index into blocked matrices
// Lookup table mapping (row, column) to an element offset inside the
// quadrant-recursive blocked matrix layout. Indexed [ROW][COLUMN].
const size_t MAX_BLOCK_LOOKUP = 256;
std::array<std::array<size_t, MAX_BLOCK_LOOKUP>, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN

// Recursively fills BLOCK_LOOKUP for a blocked matrix of dimension `size`.
// Sizes are halved down to 1, so after the call the table is also valid for
// every sub-size reached by repeated halving.
void fill_block_lookup(size_t size) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }

  // Ensure the upper-left quadrant offsets are available first; the three
  // remaining quadrants are that quadrant shifted by whole-quarter strides.
  fill_block_lookup(size / 2);

  const size_t half = size / 2;
  const size_t quarter_elements = half * half;
  for (size_t r = 0; r < half; r++) {
    for (size_t c = 0; c < half; c++) {
      const size_t base = BLOCK_LOOKUP[r][c];
      BLOCK_LOOKUP[r][half + c] = base + quarter_elements;              // upper-right quadrant
      BLOCK_LOOKUP[half + r][c] = base + 2 * quarter_elements;          // lower-left quadrant
      BLOCK_LOOKUP[half + r][half + c] = base + 3 * quarter_elements;   // lower-right quadrant
    }
  }
}
// Non-owning view onto a square matrix stored in the recursive block layout
// described by BLOCK_LOOKUP. The four (size/2 x size/2) quadrants of a
// matrix are stored contiguously one after another, so a quadrant view is
// just the base pointer shifted by whole quarters.
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  blocked_matrix_view quadrant_1_1() { return quadrant(0); }
  blocked_matrix_view quadrant_1_2() { return quadrant(1); }
  blocked_matrix_view quadrant_2_1() { return quadrant(2); }
  blocked_matrix_view quadrant_2_2() { return quadrant(3); }

  // Element access through the precomputed block-layout lookup table.
  // Requires fill_block_lookup to have been run for at least this size.
  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  // Raw pointer to the underlying storage; ownership stays with the caller.
  double *get_data() {
    return data_;
  }

 private:
  // View onto the n-th quadrant (0 = upper-left, 1 = upper-right,
  // 2 = lower-left, 3 = lower-right).
  blocked_matrix_view quadrant(size_t index) {
    const size_t quarter_elements = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + index * quarter_elements, size_ / 2);
  }

  double *data_;
  size_t size_;
};
#include <vector>
// Straightforward O(n^3) schoolbook matrix multiplication on blocked matrix
// views: result = a * b. Serves as the recursion cutoff kernel. `result`
// must not alias `a` or `b`.
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t row = 0; row < size; row++) {
    for (size_t column = 0; column < size; column++) {
      // Accumulate the dot product of row `row` of a and column `column`
      // of b in a local before storing it.
      double accumulator = 0.0;
      for (size_t k = 0; k < size; k++) {
        accumulator += a.at(row, k) * b.at(k, column);
      }
      result.at(row, column) = accumulator;
    }
  }
}
void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
pls::strain_local_resource &local_indices,
size_t size,
size_t depth,
matrix_div_conquer::blocked_matrix_view &result,
matrix_div_conquer::blocked_matrix_view &a,
matrix_div_conquer::blocked_matrix_view &b) {
void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
if (size <= 8) {
multiply_naive(size, result, a, b);
return;
}
// Temporary storage required for the intermediate results
std::unique_ptr<double[]> data_1_1_a{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_1_1_b{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_1_2_a{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_1_2_b{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_2_1_a{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_2_1_b{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_2_2_a{new double[(size / 2) * (size / 2)]};
std::unique_ptr<double[]> data_2_2_b{new double[(size / 2) * (size / 2)]};
auto strain_local_index = local_indices.get_item(depth);
std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
// Handles to sub-matrices used
blocked_matrix_view result_1_1 = result.quadrant_1_1();
blocked_matrix_view result_1_2 = result.quadrant_1_2();
blocked_matrix_view result_2_1 = result.quadrant_2_1();
blocked_matrix_view result_2_2 = result.quadrant_2_2();
blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};
blocked_matrix_view a_1_1 = a.quadrant_1_1();
blocked_matrix_view a_1_2 = a.quadrant_1_2();
blocked_matrix_view a_2_1 = a.quadrant_2_1();
blocked_matrix_view a_2_2 = a.quadrant_2_2();
blocked_matrix_view b_1_1 = b.quadrant_1_1();
blocked_matrix_view b_1_2 = b.quadrant_1_2();
blocked_matrix_view b_2_1 = b.quadrant_2_1();
blocked_matrix_view b_2_2 = b.quadrant_2_2();
matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2();
matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1();
matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2();
matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};
matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1();
matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2();
matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1();
matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2();
matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1();
matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2();
matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1();
matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2();
// Divide Work Into Sub-Calls
multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1);
multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1);
multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2);
multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2);
multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1);
multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1);
multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2);
multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
);
pls::sync();
// Combine results
for (size_t row = 0; row < size / 2; row++) {
for (size_t column = 0; column < size / 2; column++) {
result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column);
result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column);
result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column);
result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column);
}
for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
// The layout is not important here, ass all have the same order, so just sum element wise
result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i];
result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i];
result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i];
result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i];
}
}
// Scheduler configuration: upper bound on concurrently live tasks and the
// stack size (bytes) reserved per task. The diff artifact defined
// MAX_STACK_SIZE twice; the newer value (two pages) is kept.
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 4096 * 2;
// Benchmark entry point: runs the parallel divide-and-conquer matrix
// multiplication once under the PLS scheduler and records the timing via
// benchmark_runner. Usage (parsed by read_args): <num_threads> <directory>.
int main(int argc, char **argv) {
  const size_t size = matrix_div_conquer::MATRIX_SIZE;

  // Benchmark harness setup.
  int num_threads;
  string directory;
  benchmark_runner::read_args(argc, argv, num_threads, directory);

  string test_name = to_string(num_threads) + ".csv";
  string full_directory = directory + "/PLS_v3/";
  benchmark_runner runner{full_directory, test_name};

  // Only run on one version to avoid copy.
  std::unique_ptr<double[]> result_data{new double[size * size]};
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

  matrix_div_conquer::blocked_matrix_view a{a_data.get(), size};
  matrix_div_conquer::blocked_matrix_view b{b_data.get(), size};
  matrix_div_conquer::blocked_matrix_view result{result_data.get(), size};

  // Fill data arrays as needed and prepare the block-layout lookup table.
  a.fill_default_data();
  b.fill_default_data();
  matrix_div_conquer::fill_block_lookup(size);

  // Preallocate the strain-local scratch buffers: at every recursion depth,
  // one slot per thread, eight buffers of (remaining_size/2)^2 doubles each.
  // NOTE(review): was hard-coded to 8 slots; it must cover every thread that
  // strain_local_resource can hand out, so num_threads is used instead.
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
  size_t max_depth = 0;
  size_t remaining_size = size;
  while (remaining_size > 1) {
    auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
    for (int thread_id = 0; thread_id < num_threads; thread_id++) {
      auto &depth_thread_buffers = depth_buffers.emplace_back();
      for (int i = 0; i < 8; i++) {
        depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
      }
    }
    max_depth++;
    remaining_size = remaining_size / 2;
  }

  pls::strain_local_resource local_indices{(unsigned) num_threads, (unsigned) max_depth};
  scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};

  // Single measured iteration, no warmup.
  runner.run_iterations(1, [&]() {
    scheduler.perform_work([&]() {
      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
    });
  }, 0);
  runner.commit_results(true);

  scheduler.terminate();
}
......@@ -8,7 +8,8 @@ add_library(benchmark_base STATIC
include/benchmark_base/matrix.h
include/benchmark_base/unbalanced.h src/unbalanced.cpp
include/benchmark_base/range.h
include/benchmark_base/fib.h)
include/benchmark_base/fib.h
include/benchmark_base/matrix_div_conquer.h)
target_include_directories(benchmark_base
PUBLIC
......
#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
#define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H

#include <array>
#include <cstddef>

namespace comparison_benchmarks {
namespace base {
namespace matrix_div_conquer {

// Benchmark configuration shared by all framework implementations.
constexpr int MATRIX_SIZE = 128;        // square matrix dimension
constexpr int CUTOFF_SIZE = 8;          // recursion cutoff for divide-and-conquer
constexpr int NUM_ITERATIONS = 100;
constexpr int WARMUP_ITERATIONS = 10;

// Helpers to directly index into blocked matrices.
constexpr size_t MAX_SIZE = 128;

// (row, column) -> element offset in the quadrant-recursive block layout.
// `inline` (C++17) so this header can be included from multiple translation
// units without one-definition-rule violations. Indexed [ROW][COLUMN].
inline std::array<std::array<size_t, MAX_SIZE>, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN

// Recursively fills BLOCK_LOOKUP for a blocked matrix of dimension `size`
// (and, via the recursion, for all sizes reached by repeated halving).
// Must be called once before blocked_matrix_view::at is used.
inline void fill_block_lookup(size_t size = MAX_SIZE) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }

  // Upper-left quadrant first; the other three quadrants are copies of it
  // shifted by whole quarter strides.
  fill_block_lookup(size / 2);
  size_t elements_per_quarter = (size / 2) * (size / 2);
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
    }
  }
}

// Non-owning view onto a square matrix stored in the recursive block layout
// described by BLOCK_LOOKUP. Quadrants are stored contiguously one after
// another, so quadrant views are simple pointer offsets.
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  // Deterministic test fill: every element of row r is set to r.
  void fill_default_data() {
    for (size_t row = 0; row < size_; row++) {
      for (size_t column = 0; column < size_; column++) {
        at(row, column) = row;
      }
    }
  }

  // Views onto the four (size/2 x size/2) quadrants.
  blocked_matrix_view quadrant_1_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_1_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2);
  }

  // Element access through the precomputed lookup table; requires
  // fill_block_lookup to have been run for at least this size.
  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  // Raw pointer to the underlying storage; ownership stays with the caller.
  double *get_data() {
    return data_;
  }

 private:
  double *data_;
  size_t size_;
};

// Straightforward O(n^3) schoolbook multiplication on blocked views:
// result = a * b. Used as the recursion cutoff kernel and as a reference.
// `inline` so the header stays ODR-safe across translation units.
inline void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < size; j++) {
      result.at(i, j) = 0;
    }
    for (size_t j = 0; j < size; j++) {
      for (size_t k = 0; k < size; k++) {
        result.at(i, j) += a.at(i, k) * b.at(k, j);
      }
    }
  }
}

}
}
}

#endif // COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
......@@ -154,7 +154,6 @@ template<typename Function>
void scheduler::spawn_internal(Function &&lambda) {
if (thread_state::is_scheduler_active()) {
thread_state &spawning_state = thread_state::get();
scheduler &scheduler = spawning_state.get_scheduler();
base_task *last_task = spawning_state.get_active_task();
base_task *spawned_task = last_task->next_;
......
......@@ -60,8 +60,7 @@ class strain_local_resource {
};
strain_local_resource(unsigned num_threads,
unsigned depth) : local_items_() {
local_items_.reserve(num_threads);
unsigned depth) : local_items_(num_threads) {
for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
local_items_[thread_id].reserve(depth);
for (unsigned i = 0; i < depth; i++) {
......
......@@ -8,6 +8,7 @@
#include "pls/algorithms/reduce.h"
#include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/strain_local_resource.h"
#include "pls/internal/helpers/range.h"
#include "pls/internal/helpers/member_function.h"
......@@ -28,6 +29,9 @@ static void serial(Function &&function) {
scheduler::serial(std::forward<Function>(function));
}
// strain local resource support (rather low-level)
using internal::scheduling::strain_local_resource;
// general helpers that can be handy when using PLS
template<class C, typename R, typename ...ARGS>
using member_function = internal::helpers::member_function<C, R, ARGS...>;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment