Commit 86333a60 by FritzFlorian

Final preparations for single-app benchmark runs.

parent e2f584c4
Pipeline #1515 passed in 4 minutes 37 seconds
@@ -8,30 +8,39 @@ using namespace comparison_benchmarks::base;
#include <memory>
#include <array>
#include <algorithm>
#include <vector>
void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> &tmp_arrays,
pls::strain_local_resource &local_indices,
size_t size,
size_t depth,
size_t branch,
matrix_div_conquer::blocked_matrix_view &result,
matrix_div_conquer::blocked_matrix_view &a,
matrix_div_conquer::blocked_matrix_view &b) {
if (size <= 8) {
if (size <= matrix_div_conquer::CUTOFF_SIZE) {
multiply_naive(size, result, a, b);
return;
}
// Temporary storage required for the intermediate results
auto strain_local_index = local_indices.get_item(depth);
std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][0];
std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][1];
std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][2];
std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][3];
std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][strain_local_index.get_strain_index()][4];
std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][strain_local_index.get_strain_index()][5];
std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][strain_local_index.get_strain_index()][6];
std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][strain_local_index.get_strain_index()][7];
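// Select the per-depth buffer set for this call: near the root of the recursion
// the static branch number is used as the index, deeper down the strain-local
// index is used instead.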
size_t index;
if (depth == 0 || (8u << (depth - 1u)) <= local_indices.get_num_threads()) {
index = branch;
} else {
index = strain_local_index.get_strain_index();
}
std::unique_ptr<double[]> const &data_1_1_a = tmp_arrays[depth][index][0];
std::unique_ptr<double[]> const &data_1_1_b = tmp_arrays[depth][index][1];
std::unique_ptr<double[]> const &data_1_2_a = tmp_arrays[depth][index][2];
std::unique_ptr<double[]> const &data_1_2_b = tmp_arrays[depth][index][3];
std::unique_ptr<double[]> const &data_2_1_a = tmp_arrays[depth][index][4];
std::unique_ptr<double[]> const &data_2_1_b = tmp_arrays[depth][index][5];
std::unique_ptr<double[]> const &data_2_2_a = tmp_arrays[depth][index][6];
std::unique_ptr<double[]> const &data_2_2_b = tmp_arrays[depth][index][7];
// Handles to the sub-matrix quadrants used below
matrix_div_conquer::blocked_matrix_view result_1_1 = result.quadrant_1_1();
@@ -60,31 +69,31 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
// Divide Work Into Sub-Calls
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_a, a_1_1, b_1_1); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 0, result_1_1_a, a_1_1, b_1_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_1_b, a_1_2, b_2_1); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 1, result_1_1_b, a_1_2, b_2_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_a, a_1_1, b_1_2); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 2, result_1_2_a, a_1_1, b_1_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_1_2_b, a_1_2, b_2_2); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 3, result_1_2_b, a_1_2, b_2_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_a, a_2_1, b_1_1); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 4, result_2_1_a, a_2_1, b_1_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_1_b, a_2_2, b_2_1); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 5, result_2_1_b, a_2_2, b_2_1); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_a, a_2_1, b_1_2); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
);
pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, result_2_2_b, a_2_2, b_2_2); }
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
);
pls::sync();
@@ -99,8 +108,8 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
}
}
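// Scheduler resource limits for this benchmark (bound on nested tasks and per-task stack size in bytes).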
constexpr int MAX_NUM_TASKS = 16;
constexpr int MAX_STACK_SIZE = 4096 * 2;
constexpr int MAX_NUM_TASKS = 10;
constexpr int MAX_STACK_SIZE = 4096 * 1;
int main(int argc, char **argv) {
auto settings = benchmark_runner::parse_parameters(argc, argv);
@@ -124,17 +133,24 @@ int main(int argc, char **argv) {
// Strain local data
std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
size_t max_depth = 0;
size_t buffers_needed = 1;
size_t remaining_size = size;
while (remaining_size > 1) {
while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
for (int thread_id = 0; thread_id < 8; thread_id++) {
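// Each level needs up to eight times the buffer sets of the level above,
// but never allocate more sets per depth than there are worker threads.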
buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
for (int thread_id = 0; thread_id < buffers_needed; thread_id++) {
auto &depth_thread_buffers = depth_buffers.emplace_back();
for (int i = 0; i < 8; i++) {
depth_thread_buffers.emplace_back(new double[(remaining_size / 2) * (remaining_size / 2)]);
size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
depth_thread_buffers.emplace_back(new double[matrix_elements]);
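// Write to the new buffer at a fixed stride so its pages are actually mapped
// (first touch) before the timed benchmark iterations run.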
for (size_t j = 0; j < matrix_elements; j += 32) {
depth_thread_buffers[i][j] = 1.0; // Touch memory
}
}
}
max_depth++;
buffers_needed *= 8;
remaining_size = remaining_size / 2;
}
pls::strain_local_resource local_indices{(unsigned) settings.num_threads_, (unsigned) max_depth};
@@ -152,7 +168,7 @@ int main(int argc, char **argv) {
runner.run_iterations(settings.iterations_, [&]() {
scheduler.perform_work([&]() {
multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
});
});
runner.commit_results(true);
@@ -163,7 +179,7 @@ int main(int argc, char **argv) {
runner.run_periodic(settings.iterations_, settings.interval_period_, settings.interval_deadline_, [&]() {
scheduler.perform_work([&]() {
multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, result, a, b);
multiply_div_conquer(div_conquer_temp_arrays, local_indices, size, 0, 0, result, a, b);
});
});
runner.commit_results(true);
......
@@ -45,7 +45,7 @@ class scheduler {
size_t computation_depth,
size_t stack_size,
bool reuse_thread = true,
size_t serial_stack_size = 4096 * 8);
size_t serial_stack_size = 4096 * 1);
template<typename ALLOC>
explicit scheduler(unsigned int num_threads,
......
@@ -60,7 +60,7 @@ class strain_local_resource {
};
strain_local_resource(unsigned num_threads,
unsigned depth) : local_items_(num_threads) {
unsigned depth) : num_threads_{num_threads}, local_items_(num_threads) {
for (unsigned thread_id = 0; thread_id < num_threads; thread_id++) {
local_items_[thread_id].reserve(depth);
for (unsigned i = 0; i < depth; i++) {
@@ -70,11 +70,13 @@ }
}
}
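// Expose the thread count so callers can choose between branch-indexed and strain-local buffer selection.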
[[nodiscard]] unsigned get_num_threads() const { return num_threads_; }
item_handle get_item(unsigned depth);
static strain_resource *get_local_copy(strain_resource *other_resources, unsigned thread_id);
static void acquire_locally(strain_resource *other_resources, unsigned thread_id);
private:
const unsigned num_threads_;
std::vector<std::vector<local_item>> local_items_;
};
......
@@ -69,6 +69,9 @@ struct PLS_CACHE_ALIGN thread_state {
stack_allocator_{stack_allocator},
serial_call_stack_size_{serial_call_stack_size} {
serial_call_stack_ = stack_allocator->allocate_stack(serial_call_stack_size_);
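// Walk the freshly allocated stack at cache-line granularity so its pages are
// mapped before any timed work runs on it.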
for (size_t i = 0; i < serial_call_stack_size; i += base::system_details::CACHE_LINE_SIZE) {
serial_call_stack_[i] = 'a'; // Touch the stack
}
};
~thread_state() {
......