main.cpp 7.03 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
//#include "pls/pls.h"
//using namespace pls;

#include "benchmark_runner.h"

#include <memory>
#include <array>
#include <math.h>

// Helpers to directly index into blocked matrices
const size_t MAX_BLOCK_LOOKUP = 256;
std::array<std::array<size_t, MAX_BLOCK_LOOKUP>, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN
void fill_block_lookup(size_t size) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }

  fill_block_lookup(size / 2);

  size_t elements_per_quarter = (size / 2) * (size / 2);
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
    }
  }
}
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  blocked_matrix_view quadrant_1_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_1_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2);
  }

  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  double *get_data() {
    return data_;
  }

 private:
  double *data_;
  size_t size_;
};

void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < size; j++) {
      result.at(i, j) = 0;
    }
    for (size_t j = 0; j < size; j++) {
      for (size_t k = 0; k < size; k++) {
        result.at(i, j) += a.at(i, k) * b.at(k, j);
      }
    }
  }
}

void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  if (size <= 8) {
    multiply_naive(size, result, a, b);
    return;
  }
  // Temporary storage required for the intermediate results
  std::unique_ptr<double[]> data_1_1_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_1_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_2_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_2_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_1_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_1_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_2_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_2_b{new double[(size / 2) * (size / 2)]};

  // Handles to sub-matrices used
  blocked_matrix_view result_1_1 = result.quadrant_1_1();
  blocked_matrix_view result_1_2 = result.quadrant_1_2();
  blocked_matrix_view result_2_1 = result.quadrant_2_1();
  blocked_matrix_view result_2_2 = result.quadrant_2_2();

  blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
  blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
  blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
  blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
  blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
  blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
  blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
  blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};

  blocked_matrix_view a_1_1 = a.quadrant_1_1();
  blocked_matrix_view a_1_2 = a.quadrant_1_2();
  blocked_matrix_view a_2_1 = a.quadrant_2_1();
  blocked_matrix_view a_2_2 = a.quadrant_2_2();

  blocked_matrix_view b_1_1 = b.quadrant_1_1();
  blocked_matrix_view b_1_2 = b.quadrant_1_2();
  blocked_matrix_view b_2_1 = b.quadrant_2_1();
  blocked_matrix_view b_2_2 = b.quadrant_2_2();

  // Divide Work Into Sub-Calls
  multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1);
  multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1);

  multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2);
  multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2);

  multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1);
  multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1);

  multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2);
  multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2);

  // Combine results
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column);
      result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column);
      result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column);
      result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column);
    }
  }
}

constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 4096 * 1;

int main(int argc, char **argv) {
  fill_block_lookup(MAX_BLOCK_LOOKUP);

  size_t size = 64;
  std::unique_ptr<double[]> result_data_naive{new double[size * size]};
  std::unique_ptr<double[]> result_data_div{new double[size * size]};
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

  blocked_matrix_view result_naive{result_data_naive.get(), size};
  blocked_matrix_view result_div{result_data_div.get(), size};
  blocked_matrix_view a{a_data.get(), size};
  blocked_matrix_view b{b_data.get(), size};

  for (size_t row = 0; row < size; row++) {
    for (size_t column = 0; column < size; column++) {
      a.at(row, column) = row;
      b.at(row, column) = column;
    }
  }

  multiply_div_conquer(size, result_div, a, b);
  multiply_naive(size, result_naive, a, b);

  size_t misses = 0;
  for (size_t row = 0; row < size; row++) {
    for (size_t column = 0; column < size; column++) {
      if (result_div.at(row, column) != result_naive.at(row, column)) {
        misses++;
        printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column));
      }
    }
  }
  printf("\n%d", misses);

//  int num_threads;
//  string directory;
//  benchmark_runner::read_args(argc, argv, num_threads, directory);
//
//  string test_name = to_string(num_threads) + ".csv";
//  string full_directory = directory + "/PLS_v3/";
//  benchmark_runner runner{full_directory, test_name};
//
//  scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
//
//  runner.run_iterations(1000, [&]() {
//    scheduler.perform_work([&]() {
//    });
//  }, 100);
//  runner.commit_results(true);
}