main.cpp 6.89 KB
Newer Older
1 2
#include "pls/pls.h"
using namespace pls;
3 4

#include "benchmark_runner.h"
5 6 7
#include "benchmark_base/matrix_div_conquer.h"

using namespace comparison_benchmarks::base;
8 9 10

#include <memory>
#include <array>
11
#include <vector>
12

13 14 15 16 17 18 19
void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                          pls::strain_local_resource &local_indices,
                          size_t size,
                          size_t depth,
                          matrix_div_conquer::blocked_matrix_view &result,
                          matrix_div_conquer::blocked_matrix_view &a,
                          matrix_div_conquer::blocked_matrix_view &b) {
20 21 22 23 24 25

  if (size <= 8) {
    multiply_naive(size, result, a, b);
    return;
  }
  // Temporary storage required for the intermediate results
26 27 28 29 30 31 32 33 34
  auto strain_local_index = local_indices.get_item(depth);
  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
35 36

  // Handles to sub-matrices used
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
  matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};

  matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2();
60 61

  // Divide Work Into Sub-Calls
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
  );
  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
  );

  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
  );
  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
  );

  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
  );
  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
  );

  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
  );
  pls::spawn(
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
  );

  pls::sync();
91 92

  // Combine results
93 94 95 96 97 98
  for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
    // The layout is not important here, ass all have the same order, so just sum element wise
    result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i];
    result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i];
    result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i];
    result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i];
99 100 101 102
  }
}

constexpr int MAX_NUM_TASKS = 32;
103
constexpr int MAX_STACK_SIZE = 4096 * 2;
104 105

int main(int argc, char **argv) {
106
  const size_t size = matrix_div_conquer::MATRIX_SIZE;
107

108 109 110 111 112 113 114
  int num_threads;
  string directory;
  benchmark_runner::read_args(argc, argv, num_threads, directory);

  string test_name = to_string(num_threads) + ".csv";
  string full_directory = directory + "/PLS_v3/";
  benchmark_runner runner{full_directory, test_name};
115
  runner.enable_memory_stats();
116
  runner.pre_allocate_stats();
117 118 119

  // Only run on one version to avoid copy
  std::unique_ptr<double[]> result_data{new double[size * size]};
120 121 122
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
  matrix_div_conquer::blocked_matrix_view a{a_data.get(), size};
  matrix_div_conquer::blocked_matrix_view b{b_data.get(), size};
  matrix_div_conquer::blocked_matrix_view result{result_data.get(), size};

  // Fill data arrays as needed
  a.fill_default_data();
  b.fill_default_data();
  matrix_div_conquer::fill_block_lookup(size);

  // Strain local data
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
  size_t max_depth = 0;
  size_t remaining_size = size;
  while (remaining_size > 1) {
    auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
    for (int thread_id = 0; thread_id < 8; thread_id++) {
      auto &depth_thread_buffers = depth_buffers.emplace_back();
      for (int i = 0; i < 8; i++) {
        depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
      }
143
    }
144 145 146

    max_depth++;
    remaining_size = remaining_size / 2;
147
  }
148
  pls::strain_local_resource local_indices{(unsigned) num_threads, (unsigned) max_depth};
149

150
  scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
151

152 153 154 155 156 157 158 159
  runner.run_iterations(1, [&]() {
    scheduler.perform_work([&]() {
      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
    });
  }, 0);
  runner.commit_results(true);

  scheduler.terminate();
160
}