#include "pls/pls.h" using namespace pls; #include "benchmark_runner.h" #include "benchmark_base/matrix_div_conquer.h" using namespace comparison_benchmarks::base; #include #include #include #include void multiply_div_conquer(const std::vector>>> &tmp_arrays, pls::strain_local_resource &local_indices, size_t size, size_t depth, size_t branch, matrix_div_conquer::blocked_matrix_view &result, matrix_div_conquer::blocked_matrix_view &a, matrix_div_conquer::blocked_matrix_view &b) { if (size <= matrix_div_conquer::CUTOFF_SIZE) { multiply_naive(size, result, a, b); return; } // Temporary storage required for the intermediate results auto strain_local_index = local_indices.get_item(depth); size_t index; if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) { index = branch; } else { index = strain_local_index.get_strain_index(); } std::unique_ptr const &data_1_1_a = tmp_arrays[depth][index][0]; std::unique_ptr const &data_1_1_b = tmp_arrays[depth][index][1]; std::unique_ptr const &data_1_2_a = tmp_arrays[depth][index][2]; std::unique_ptr const &data_1_2_b = tmp_arrays[depth][index][3]; std::unique_ptr const &data_2_1_a = tmp_arrays[depth][index][4]; std::unique_ptr const &data_2_1_b = tmp_arrays[depth][index][5]; std::unique_ptr const &data_2_2_a = tmp_arrays[depth][index][6]; std::unique_ptr const &data_2_2_b = tmp_arrays[depth][index][7]; // Handles to sub-matrices used matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2(); matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2}; matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2}; matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2(); matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1(); matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2(); matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1(); matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2(); // Divide Work Into Sub-Calls pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); } ); pls::spawn( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); } ); pls::spawn_and_sync( [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); } ); // Combine results for (size_t i = 0; i < (size / 2) * (size / 2); i++) { // The layout is not important here, ass all have the same order, so just sum element wise result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i]; result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i]; result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i]; result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i]; } } constexpr int MAX_STACK_SIZE = 4096 * 1; int main(int argc, char **argv) { auto settings = benchmark_runner::parse_parameters(argc, argv); const size_t size = settings.size_; // Only run on one version to avoid copy std::unique_ptr result_data{new double[size * size]}; std::unique_ptr a_data{new double[size * size]}; std::unique_ptr b_data{new double[size * size]}; matrix_div_conquer::blocked_matrix_view a{a_data.get(), size}; matrix_div_conquer::blocked_matrix_view b{b_data.get(), size}; matrix_div_conquer::blocked_matrix_view result{result_data.get(), size}; // Fill data arrays as needed a.fill_default_data(); b.fill_default_data(); result.fill_default_data(); // Strain local data std::vector>>> div_conquer_temp_arrays; size_t max_depth = 0; size_t buffers_needed = 1; size_t remaining_size = size; while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) { auto &depth_buffers = div_conquer_temp_arrays.emplace_back(); buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_); for (size_t thread_id = 0; thread_id < buffers_needed; thread_id++) { auto &depth_thread_buffers = depth_buffers.emplace_back(); for (int i = 0; i < 8; i++) { size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2); depth_thread_buffers.emplace_back(new double[matrix_elements]); for (size_t j = 0; j < matrix_elements; j += 32) { depth_thread_buffers[i][j] = 1.0; // Touch memory } } } max_depth++; buffers_needed *= 8; remaining_size = remaining_size / 2; } pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth}; string test_name = to_string(settings.num_threads_) + ".csv"; string full_directory = settings.output_directory_ + "/PLS_v3/"; benchmark_runner runner{full_directory, test_name}; pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE}; if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { #if PLS_PROFILING_ENABLED scheduler.get_profiler().disable_memory_measure(); runner.add_custom_stats_field("T_1"); runner.add_custom_stats_field("T_inf"); #endif printf("Running isolated measurement...\n"); runner.enable_memory_stats(); runner.pre_allocate_stats(); runner.run_iterations(settings.iterations_, [&]() { scheduler.perform_work([&]() { multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b); }); }, [&]() {}, [&]() { #if PLS_PROFILING_ENABLED runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_); runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_); #endif }); runner.commit_results(true); } else { printf("Running periodic measurement...\n"); runner.enable_wall_time_stats(); runner.pre_allocate_stats(); runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() { scheduler.perform_work([&]() { multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b); }); }); runner.commit_results(true); } }