Final preparations for single-app benchmark runs.

86333a60 · FritzFlorian · e2f584c4
Commit 86333a60 authored Jun 13, 2020 by FritzFlorian
4 changed files
--- a/app/benchmark_matrix_div_conquer/main.cpp
+++ b/app/benchmark_matrix_div_conquer/main.cpp
@@ -8,30 +8,39 @@ using namespace comparison_benchmarks::base;

 #include <memory>
 #include <array>
+#include <algorithm>
 #include <vector>

 void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                          pls::strain_local_resource &local_indices,
                          size_t size,
                          size_t depth,
+                          size_t branch,
                          matrix_div_conquer::blocked_matrix_view &result,
                          matrix_div_conquer::blocked_matrix_view &a,
                          matrix_div_conquer::blocked_matrix_view &b) {

-  if (size <= 8) {
+  if (size <= matrix_div_conquer::CUTOFF_SIZE) {
    multiply_naive(size, result, a, b);
    return;
  }
  // Temporary storage required for the intermediate results
  auto strain_local_index = local_indices.get_item(depth);
-  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
-  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
-  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
-  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
-  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
-  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
-  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
-  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
+  size_t index;
+  if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
+    index = branch;
+  } else {
+    index = strain_local_index.get_strain_index();
+  }
+
+  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
+  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
+  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
+  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
+  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
+  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
+  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
+  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];

  // Handles to sub-matrices used
  matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
@@ -60,31 +69,31 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_

  // Divide Work Into Sub-Calls
  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); }
  );
  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); }
  );

  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); }
  );
  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); }
  );

  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); }
  );
  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); }
  );

  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
  );
  pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
  );

  pls::sync();
@@ -99,8 +108,8 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
  }
 }

-constexpr int MAX_NUM_TASKS = 16;
-constexpr int MAX_STACK_SIZE = 4096 * 2;
+constexpr int MAX_NUM_TASKS = 10;
+constexpr int MAX_STACK_SIZE = 4096 * 1;

 int main(int argc, char **argv) {
  auto settings = benchmark_runner::parse_parameters(argc, argv);
@@ -124,17 +133,24 @@ int main(int argc, char **argv) {
  // Strain local data
  std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
  size_t max_depth = 0;
+  size_t buffers_needed = 1;
  size_t remaining_size = size;
-  while (remaining_size > 1) {
+  while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
    auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
-    for (int thread_id = 0; thread_id < 8; thread_id++) {
+    buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
+    for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
      auto &depth_thread_buffers = depth_buffers.emplace_back();
      for (int i = 0; i < 8; i++) {
-        depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
+        size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
+        depth_thread_buffers.emplace_back(new double[matrix_elements]);
+        for (size_t j = 0; j < matrix_elements; j += 32) {
+          depth_thread_buffers[i][j] = 1.0; // Touch memory
+        }
      }
    }

    max_depth++;
+    buffers_needed *= 8;
    remaining_size = remaining_size / 2;
  }
  pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};
@@ -152,7 +168,7 @@ int main(int argc, char **argv) {

    runner.run_iterations(settings.iterations_, [&]() {
      scheduler.perform_work([&]() {
-        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
      });
    });
    runner.commit_results(true);
@@ -163,7 +179,7 @@ int main(int argc, char **argv) {

    runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {
      scheduler.perform_work([&]() {
-        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+        multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
      });
    });
    runner.commit_results(true);

--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -45,7 +45,7 @@ class scheduler {
                     size_t computation_depth,
                     size_t stack_size,
                     bool reuse_thread = true,
-                     size_t serial_stack_size = 4096 * 8);
+                     size_t serial_stack_size = 4096 * 1);

  template<typename ALLOC>
  explicit scheduler(unsigned int num_threads,

--- a/lib/pls/include/pls/internal/scheduling/strain_local_resource.h
+++ b/lib/pls/include/pls/internal/scheduling/strain_local_resource.h
@@ -60,7 +60,7 @@ class strain_local_resource {
  };

  strain_local_resource(unsigned num_threads,
-                        unsigned depth) : local_items_(num_threads) {
+                        unsigned depth) : num_threads_{num_threads}, local_items_(num_threads) {
    for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
      local_items_[thread_id].reserve(depth);
      for (unsigned i = 0; i < depth; i++) {
@@ -70,11 +70,13 @@ class strain_local_resource {
    }
  }

+  [[nodiscard]] unsigned get_num_threads() const { return num_threads_; }
  item_handle get_item(unsigned depth);
  static strain_resource *get_local_copy(strain_resource *other_resources, unsigned thread_id);
  static void acquire_locally(strain_resource *other_resources, unsigned thread_id);

 private:
+  const unsigned num_threads_;
  std::vector<std::vector<local_item>> local_items_;
 };


--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -69,6 +69,9 @@ struct PLS_CACHE_ALIGN thread_state {
      stack_allocator_{stack_allocator},
      serial_call_stack_size_{serial_call_stack_size} {
    serial_call_stack_ = stack_allocator->allocate_stack(serial_call_stack_size_);
+    for (size_t i = 0; i < serial_call_stack_size; i += base::system_details::CACHE_LINE_SIZE) {
+      serial_call_stack_[i] = 'a'; // Touch the stack
+    }
  };

  ~thread_state() {