//#include "pls/pls.h" //using namespace pls; #include "benchmark_runner.h" #include #include #include // Helpers to directly index into blocked matrices const size_t MAX_BLOCK_LOOKUP = 256; std::array, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN void fill_block_lookup(size_t size) { if (size <= 1) { BLOCK_LOOKUP[0][0] = 0; return; } fill_block_lookup(size / 2); size_t elements_per_quarter = (size / 2) * (size / 2); for (size_t row = 0; row < size / 2; row++) { for (size_t column = 0; column < size / 2; column++) { BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter; BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter; BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter; } } } class blocked_matrix_view { public: blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} blocked_matrix_view quadrant_1_1() { size_t elements_per_quarter = (size_ / 2) * (size_ / 2); return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); } blocked_matrix_view quadrant_1_2() { size_t elements_per_quarter = (size_ / 2) * (size_ / 2); return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2); } blocked_matrix_view quadrant_2_1() { size_t elements_per_quarter = (size_ / 2) * (size_ / 2); return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2); } blocked_matrix_view quadrant_2_2() { size_t elements_per_quarter = (size_ / 2) * (size_ / 2); return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2); } double &at(size_t row, size_t column) { return data_[BLOCK_LOOKUP[row][column]]; } double *get_data() { return data_; } private: double *data_; size_t size_; }; void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { for (size_t i = 0; i < size; i++) { for (size_t j = 0; j < size; j++) { result.at(i, j) = 0; } for (size_t j = 0; j < size; j++) { for (size_t k = 0; k < size; k++) { result.at(i, j) += a.at(i, k) * b.at(k, j); } } } } void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { if (size <= 8) { multiply_naive(size, result, a, b); return; } // Temporary storage required for the intermediate results std::unique_ptr data_1_1_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_1_1_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_1_2_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_1_2_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_2_1_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_2_1_b{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_2_2_a{new double[(size / 2) * (size / 2)]}; std::unique_ptr data_2_2_b{new double[(size / 2) * (size / 2)]}; // Handles to sub-matrices used blocked_matrix_view result_1_1 = result.quadrant_1_1(); blocked_matrix_view result_1_2 = result.quadrant_1_2(); blocked_matrix_view result_2_1 = result.quadrant_2_1(); blocked_matrix_view result_2_2 = result.quadrant_2_2(); blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; blocked_matrix_view a_1_1 = a.quadrant_1_1(); blocked_matrix_view a_1_2 = a.quadrant_1_2(); blocked_matrix_view a_2_1 = a.quadrant_2_1(); blocked_matrix_view a_2_2 = a.quadrant_2_2(); blocked_matrix_view b_1_1 = b.quadrant_1_1(); blocked_matrix_view b_1_2 = b.quadrant_1_2(); blocked_matrix_view b_2_1 = b.quadrant_2_1(); blocked_matrix_view b_2_2 = b.quadrant_2_2(); // Divide Work Into Sub-Calls multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1); multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1); multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2); multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2); multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1); multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1); multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2); multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2); // Combine results for (size_t row = 0; row < size / 2; row++) { for (size_t column = 0; column < size / 2; column++) { result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column); result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column); result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column); result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column); } } } constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_STACK_SIZE = 4096 * 1; int main(int argc, char **argv) { fill_block_lookup(MAX_BLOCK_LOOKUP); size_t size = 64; std::unique_ptr result_data_naive{new double[size * size]}; std::unique_ptr result_data_div{new double[size * size]}; std::unique_ptr a_data{new double[size * size]}; std::unique_ptr b_data{new double[size * size]}; blocked_matrix_view result_naive{result_data_naive.get(), size}; blocked_matrix_view result_div{result_data_div.get(), size}; blocked_matrix_view a{a_data.get(), size}; blocked_matrix_view b{b_data.get(), size}; for (size_t row = 0; row < size; row++) { for (size_t column = 0; column < size; column++) { a.at(row, column) = row; b.at(row, column) = column; } } multiply_div_conquer(size, result_div, a, b); multiply_naive(size, result_naive, a, b); size_t misses = 0; for (size_t row = 0; row < size; row++) { for (size_t column = 0; column < size; column++) { if (result_div.at(row, column) != result_naive.at(row, column)) { misses++; printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column)); } } } printf("\n%d", misses); // int num_threads; // string directory; // benchmark_runner::read_args(argc, argv, num_threads, directory); // // string test_name = to_string(num_threads) + ".csv"; // string full_directory = directory + "/PLS_v3/"; // benchmark_runner runner{full_directory, test_name}; // // scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE}; // // runner.run_iterations(1000, [&]() { // scheduler.perform_work([&]() { // }); // }, 100); // runner.commit_results(true); }