#include "pls/pls.h"
using namespace pls;

#include "benchmark_runner.h"
#include "benchmark_base/matrix_div_conquer.h"

using namespace comparison_benchmarks::base;

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                          pls::strain_local_resource &local_indices,
                          size_t size,
                          size_t depth,
18
                          size_t branch,
19 20 21
                          matrix_div_conquer::blocked_matrix_view &result,
                          matrix_div_conquer::blocked_matrix_view &a,
                          matrix_div_conquer::blocked_matrix_view &b) {
22

23
  if (size <= matrix_div_conquer::CUTOFF_SIZE) {
24 25 26 27
    multiply_naive(size, result, a, b);
    return;
  }
  // Temporary storage required for the intermediate results
28
  auto strain_local_index = local_indices.get_item(depth);
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
  size_t index;
  if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
    index = branch;
  } else {
    index = strain_local_index.get_strain_index();
  }

  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];
44 45

  // Handles to sub-matrices used
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
  matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view result_1_2 = result.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view result_2_1 = result.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view result_2_2 = result.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
  matrix_div_conquer::blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};

  matrix_div_conquer::blocked_matrix_view a_1_1 = a.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view a_1_2 = a.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view a_2_1 = a.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view a_2_2 = a.quadrant_2_2();

  matrix_div_conquer::blocked_matrix_view b_1_1 = b.quadrant_1_1();
  matrix_div_conquer::blocked_matrix_view b_1_2 = b.quadrant_1_2();
  matrix_div_conquer::blocked_matrix_view b_2_1 = b.quadrant_2_1();
  matrix_div_conquer::blocked_matrix_view b_2_2 = b.quadrant_2_2();
69 70

  // Divide Work Into Sub-Calls
71
  pls::spawn(
72
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); }
73 74
  );
  pls::spawn(
75
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); }
76 77 78
  );

  pls::spawn(
79
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); }
80 81
  );
  pls::spawn(
82
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); }
83 84 85
  );

  pls::spawn(
86
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); }
87 88
  );
  pls::spawn(
89
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); }
90 91 92
  );

  pls::spawn(
93
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
94
  );
95
  pls::spawn_and_sync(
96
      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
97 98
  );

99
  // Combine results
100 101 102 103 104 105
  for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
    // The layout is not important here, ass all have the same order, so just sum element wise
    result_1_1.get_data()[i] = result_1_1_a.get_data()[i] + result_1_1_b.get_data()[i];
    result_1_2.get_data()[i] = result_1_2_a.get_data()[i] + result_1_2_b.get_data()[i];
    result_2_1.get_data()[i] = result_2_1_a.get_data()[i] + result_2_1_b.get_data()[i];
    result_2_2.get_data()[i] = result_2_2_a.get_data()[i] + result_2_2_b.get_data()[i];
106 107 108
  }
}

// Passed as the third constructor argument of pls::scheduler in main().
// NOTE(review): presumably the stack size in bytes reserved per task - confirm against pls::scheduler.
constexpr int MAX_STACK_SIZE = 4096 * 1;

int main(int argc, char **argv) {
112 113
  auto settings = benchmark_runner::parse_parameters(argc, argv);
  const size_t size = settings.size_;
114

115 116 117

  // Only run on one version to avoid copy
  std::unique_ptr<double[]> result_data{new double[size * size]};
118 119 120
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

121 122 123 124 125 126 127
  matrix_div_conquer::blocked_matrix_view a{a_data.get(), size};
  matrix_div_conquer::blocked_matrix_view b{b_data.get(), size};
  matrix_div_conquer::blocked_matrix_view result{result_data.get(), size};

  // Fill data arrays as needed
  a.fill_default_data();
  b.fill_default_data();
128
  result.fill_default_data();
129 130 131 132

  // Strain local data
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
  size_t max_depth = 0;
133
  size_t buffers_needed = 1;
134
  size_t remaining_size = size;
135
  while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
136
    auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
137
    buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
138
    for (size_t thread_id = 0; thread_id < buffers_needed; thread_id++) {
139 140
      auto &depth_thread_buffers = depth_buffers.emplace_back();
      for (int i = 0; i < 8; i++) {
141 142 143 144 145
        size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
        depth_thread_buffers.emplace_back(new double[matrix_elements]);
        for (size_t j = 0; j < matrix_elements; j += 32) {
          depth_thread_buffers[i][j] = 1.0; // Touch memory
        }
146
      }
147
    }
148 149

    max_depth++;
150
    buffers_needed *= 8;
151
    remaining_size = remaining_size / 2;
152
  }
153 154 155 156 157
  pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};

  string test_name = to_string(settings.num_threads_) + ".csv";
  string full_directory = settings.output_directory_ + "/PLS_v3/";
  benchmark_runner runner{full_directory, test_name};
158

159
  pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE};
160

161
  if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
162 163 164 165 166
#if PLS_PROFILING_ENABLED
    scheduler.get_profiler().disable_memory_measure();
    runner.add_custom_stats_field("T_1");
    runner.add_custom_stats_field("T_inf");
#endif
167 168 169 170 171 172
    printf("Running isolated measurement...\n");
    runner.enable_memory_stats();
    runner.pre_allocate_stats();

    runner.run_iterations(settings.iterations_, [&]() {
      scheduler.perform_work([&]() {
173
        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
174
      });
175 176 177 178 179
    }, [&]() {}, [&]() {
#if PLS_PROFILING_ENABLED
      runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
      runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
#endif
180 181 182 183 184 185 186 187 188
    });
    runner.commit_results(true);
  } else {
    printf("Running periodic measurement...\n");
    runner.enable_wall_time_stats();
    runner.pre_allocate_stats();

    runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {
      scheduler.perform_work([&]() {
189
        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
190
      });
191
    });
192 193
    runner.commit_results(true);
  }
194

195
}