Commit 86333a60 by FritzFlorian

Final preparations for single-app benchmark runs.

parent e2f584c4
Pipeline #1515 passed with stages in 4 minutes 37 seconds
@@ -8,30 +8,39 @@ using namespace comparison_benchmarks::base;
 #include <memory>
 #include <array>
+#include <algorithm>
 #include <vector>
 
 void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
                           pls::strain_local_resource &local_indices,
                           size_t size,
                           size_t depth,
+                          size_t branch,
                           matrix_div_conquer::blocked_matrix_view &result,
                           matrix_div_conquer::blocked_matrix_view &a,
                           matrix_div_conquer::blocked_matrix_view &b) {
-  if (size <= 8) {
+  if (size <= matrix_div_conquer::CUTOFF_SIZE) {
     multiply_naive(size, result, a, b);
     return;
   }
 
   // Temporary storage required for the intermediate results
   auto strain_local_index = local_indices.get_item(depth);
-  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
-  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
-  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
-  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
-  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
-  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
-  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
-  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
+  size_t index;
+  if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
+    index = branch;
+  } else {
+    index = strain_local_index.get_strain_index();
+  }
+
+  std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
+  std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
+  std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
+  std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
+  std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
+  std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
+  std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
+  std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];
 
   // Handles to sub-matrices used
   matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
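Note on the hunk above: each recursive call now also receives its branch number (0-7). At the root, and whenever 8u << (depth - 1u) does not exceed the worker-thread count, the branch is used as the scratch-buffer index; otherwise the call falls back to its strain-local slot. A minimal standalone sketch of just this selection rule; pick_buffer_index and its parameter names are illustrative, not taken from the repository:

    #include <cstddef>
    #include <cstdio>

    // Same guard as in the diff: branch-indexed buffers near the root,
    // strain-local (per-thread) buffers deeper in the call tree.
    size_t pick_buffer_index(size_t depth, size_t branch,
                             size_t strain_index, size_t num_threads) {
      if (depth == 0 || (8u << (depth - 1u)) <= num_threads) {
        return branch;
      }
      return strain_index;
    }

    int main() {
      // With 8 worker threads, depths 0 and 1 resolve to the branch number,
      // deeper levels resolve to the strain-local slot.
      for (size_t depth = 0; depth < 4; depth++) {
        std::printf("depth %zu -> index %zu\n", depth,
                    pick_buffer_index(depth, 3, 5, 8));
      }
      return 0;
    }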
@@ -60,31 +69,31 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
 
   // Divide Work Into Sub-Calls
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
   );
   pls::spawn(
-      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
+      [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
   );
 
   pls::sync();
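For orientation, the eight spawns are the standard blocked decomposition of the product: each result quadrant is the sum of two half-size products, and the _a/_b scratch buffers hold those two partial products per quadrant until they are combined after pls::sync():

    result_1_1 = a_1_1 * b_1_1 + a_1_2 * b_2_1
    result_1_2 = a_1_1 * b_1_2 + a_1_2 * b_2_2
    result_2_1 = a_2_1 * b_1_1 + a_2_2 * b_2_1
    result_2_2 = a_2_1 * b_1_2 + a_2_2 * b_2_2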
@@ -99,8 +108,8 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
   }
 }
 
-constexpr int MAX_NUM_TASKS = 16;
-constexpr int MAX_STACK_SIZE = 4096 * 2;
+constexpr int MAX_NUM_TASKS = 10;
+constexpr int MAX_STACK_SIZE = 4096 * 1;
 
 int main(int argc, char **argv) {
   auto settings = benchmark_runner::parse_parameters(argc, argv);
@@ -124,17 +133,24 @@ int main(int argc, char **argv) {
 
   // Strain local data
   std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
   size_t max_depth = 0;
+  size_t buffers_needed = 1;
   size_t remaining_size = size;
-  while (remaining_size > 1) {
+  while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
     auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
-    for (int thread_id = 0; thread_id < 8; thread_id++) {
+    buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
+    for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
       auto &depth_thread_buffers = depth_buffers.emplace_back();
       for (int i = 0; i < 8; i++) {
-        depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
+        size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
+        depth_thread_buffers.emplace_back(new double[matrix_elements]);
+        for (size_t j = 0; j < matrix_elements; j += 32) {
+          depth_thread_buffers[i][j] = 1.0; // Touch memory
+        }
       }
     }
     max_depth++;
+    buffers_needed *= 8;
     remaining_size = remaining_size / 2;
   }
   pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};
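To make the new allocation pattern concrete: buffers_needed grows by a factor of eight per depth but is clamped to the worker-thread count, each buffer set holds eight half-size scratch matrices, and writing one double every 32 elements faults the backing pages in during setup rather than inside the timed runs. A small sketch of the resulting footprint under assumed parameters (size, cutoff and thread count are illustrative, not the benchmark's actual settings):

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    int main() {
      // Assumed values for illustration only.
      size_t size = 4096, cutoff = 32, num_threads = 8;
      size_t buffers_needed = 1, total_bytes = 0;
      for (size_t remaining = size; remaining > cutoff; remaining /= 2) {
        buffers_needed = std::min(buffers_needed, num_threads);
        size_t elements = (remaining / 2) * (remaining / 2);
        total_bytes += buffers_needed * 8 * elements * sizeof(double);
        buffers_needed *= 8;
      }
      // Prints roughly 938 MiB for the parameters above.
      std::printf("total scratch memory: %zu MiB\n", total_bytes >> 20);
      return 0;
    }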
@@ -152,7 +168,7 @@ int main(int argc, char **argv) {
 
   runner.run_iterations(settings.iterations_, [&]() {
     scheduler.perform_work([&]() {
-      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
     });
   });
   runner.commit_results(true);
@@ -163,7 +179,7 @@ int main(int argc, char **argv) {
 
   runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {
     scheduler.perform_work([&]() {
-      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
+      multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
     });
   });
   runner.commit_results(true);
...
@@ -45,7 +45,7 @@ class scheduler {
             size_t computation_depth,
             size_t stack_size,
             bool reuse_thread = true,
-            size_t serial_stack_size = 4096 * 8);
+            size_t serial_stack_size = 4096 * 1);
 
   template<typename ALLOC>
   explicit scheduler(unsigned int num_threads,
...
@@ -60,7 +60,7 @@ class strain_local_resource {
   };
 
   strain_local_resource(unsigned num_threads,
-                        unsigned depth) : local_items_(num_threads) {
+                        unsigned depth) : num_threads_{num_threads}, local_items_(num_threads) {
     for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
       local_items_[thread_id].reserve(depth);
       for (unsigned i = 0; i < depth; i++) {
@@ -70,11 +70,13 @@ class strain_local_resource {
     }
   }
 
+  [[nodiscard]] unsigned get_num_threads() const { return num_threads_; }
   item_handle get_item(unsigned depth);
   static strain_resource *get_local_copy(strain_resource *other_resources, unsigned thread_id);
   static void acquire_locally(strain_resource *other_resources, unsigned thread_id);
 
  private:
+  const unsigned num_threads_;
   std::vector<std::vector<local_item>> local_items_;
 };
...
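The new num_threads_ member and get_num_threads() accessor let callers query the thread count from the resource itself; the benchmark's buffer-index guard above uses exactly that. Illustrative usage (the concrete values are made up):

    pls::strain_local_resource local_indices{8u, 7u};    // 8 threads, max depth 7
    unsigned threads = local_indices.get_num_threads();  // == 8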
@@ -69,6 +69,9 @@ struct PLS_CACHE_ALIGN thread_state {
         stack_allocator_{stack_allocator},
         serial_call_stack_size_{serial_call_stack_size} {
     serial_call_stack_ = stack_allocator->allocate_stack(serial_call_stack_size_);
+    for (size_t i = 0; i < serial_call_stack_size; i += base::system_details::CACHE_LINE_SIZE) {
+      serial_call_stack_[i] = 'a'; // Touch the stack
+    }
   };
 
   ~thread_state() {
...
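The constructor change applies the same idea to the serial call stack: writing one byte per cache line right after allocation moves the page faults to scheduler construction instead of the first measured task. A generic sketch of the pattern, independent of the pls internals (touch_allocation and the 64-byte line size are assumptions for illustration):

    #include <cstddef>
    #include <vector>

    // Write one byte per (assumed) 64-byte cache line so the backing pages
    // are mapped eagerly instead of on first use in the hot path.
    void touch_allocation(char *memory, std::size_t bytes, std::size_t line_size = 64) {
      for (std::size_t i = 0; i < bytes; i += line_size) {
        memory[i] = 'a';
      }
    }

    int main() {
      std::vector<char> stack(4096);  // stand-in for the allocated serial stack
      touch_allocation(stack.data(), stack.size());
      return 0;
    }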