main.cpp 7.82 KB
Newer Older
1 2
#include "pls/pls.h"
using namespace pls;
3 4

#include "benchmark_runner.h"
5 6 7
#include "benchmark_base/matrix_div_conquer.h"

using namespace comparison_benchmarks::base;
8 9 10

#include <memory>
#include <array>
11
#include <algorithm>
12
#include <vector>
13

14 15 16 17
void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                          pls::strain_local_resource &local_indices,
                          size_t size,
                          size_t depth,
18
                          size_t branch,
19 20 21
                          matrix_div_conquer::blocked_matrix_view &result,
                          matrix_div_conquer::blocked_matrix_view &a,
                          matrix_div_conquer::blocked_matrix_view &b) {
22

23
  if (size <= matrix_div_conquer::CUTOFF_SIZE) {
24 25 26 27
    multiply_naive(size, result, a, b);
    return;
  }
  // Temporary storage required for the intermediate results
28
  auto strain_local_index = local_indices.get_item(depth);
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
  size_t index;
  if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
    index = branch;
  } else {
    index = strain_local_index.get_strain_index();
  }

  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];
44 45

  // Handles to sub-matrices used
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
  matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};

  matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2();
69 70

  // Divide Work Into Sub-Calls
71
  pls::spawn(
72
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); }
73 74
  );
  pls::spawn(
75
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); }
76 77 78
  );

  pls::spawn(
79
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); }
80 81
  );
  pls::spawn(
82
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); }
83 84 85
  );

  pls::spawn(
86
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); }
87 88
  );
  pls::spawn(
89
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); }
90 91 92
  );

  pls::spawn(
93
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
94 95
  );
  pls::spawn(
96
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
97 98 99
  );

  pls::sync();
100 101

  // Combine results
102 103 104 105 106 107
  for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
    // The layout is not important here, ass all have the same order, so just sum element wise
    result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i];
    result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i];
    result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i];
    result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i];
108 109 110
  }
}

111 112
// Second constructor argument of the pls::scheduler created in main().
constexpr int MAX_NUM_TASKS{10};
// Third constructor argument of the pls::scheduler created in main().
constexpr int MAX_STACK_SIZE{4096 * 1};
113 114

int main(int argc, char **argv) {
115 116
  auto settings = benchmark_runner::parse_parameters(argc, argv);
  const size_t size = settings.size_;
117

118 119 120

  // Only run on one version to avoid copy
  std::unique_ptr<double[]> result_data{new double[size * size]};
121 122 123
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

124 125 126 127 128 129 130 131 132 133 134 135
  matrix_div_conquer::blocked_matrix_view a{a_data.get(), size};
  matrix_div_conquer::blocked_matrix_view b{b_data.get(), size};
  matrix_div_conquer::blocked_matrix_view result{result_data.get(), size};

  // Fill data arrays as needed
  a.fill_default_data();
  b.fill_default_data();
  matrix_div_conquer::fill_block_lookup(size);

  // Strain local data
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
  size_t max_depth = 0;
136
  size_t buffers_needed = 1;
137
  size_t remaining_size = size;
138
  while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
139
    auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
140 141
    buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
    for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
142 143
      auto &depth_thread_buffers = depth_buffers.emplace_back();
      for (int i = 0; i < 8; i++) {
144 145 146 147 148
        size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
        depth_thread_buffers.emplace_back(new double[matrix_elements]);
        for (size_t j = 0; j < matrix_elements; j += 32) {
          depth_thread_buffers[i][j] = 1.0; // Touch memory
        }
149
      }
150
    }
151 152

    max_depth++;
153
    buffers_needed *= 8;
154
    remaining_size = remaining_size / 2;
155
  }
156 157 158 159 160
  pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};

  string test_name = to_string(settings.num_threads_) + ".csv";
  string full_directory = settings.output_directory_ + "/PLS_v3/";
  benchmark_runner runner{full_directory, test_name};
161

162
  pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};
163

164 165 166 167 168 169 170
  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();

    runner.run_iterations(settings.iterations_, [&]() {
      scheduler.perform_work([&]() {
171
        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
172 173 174 175 176 177 178 179 180 181
      });
    });
    runner.commit_results(true);
  } else {
    printf("Running periodic measurement...\n");
    runner.enable_wall_time_stats();
    runner.pre_allocate_stats();

    runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {
      scheduler.perform_work([&]() {
182
        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
183
      });
184
    });
185 186
    runner.commit_results(true);
  }
187

188
}