Commit 92ee564c by FritzFlorian

WIP: Start work on divide and conquer example with temporary buffers.

parent a81e082d
Pipeline #1500 passed with stages in 4 minutes 9 seconds
@@ -48,6 +48,7 @@ add_subdirectory(app/playground)
 add_subdirectory(app/benchmark_fft)
 add_subdirectory(app/benchmark_unbalanced)
 add_subdirectory(app/benchmark_matrix)
+add_subdirectory(app/benchmark_matrix_div_conquer)
 add_subdirectory(app/benchmark_fib)
 add_subdirectory(app/context_switch)
......
#include "pls/pls.h"
#include <iostream>
#include "benchmark_runner.h"
#include "benchmark_base/fib.h"
@@ -10,7 +8,7 @@ using namespace comparison_benchmarks::base;
 constexpr int MAX_NUM_TASKS = 32;
 constexpr int MAX_STACK_SIZE = 4096 * 1;
-int pls_fib(int n) {
+int pls_fib(int n, int d) {
   if (n == 0) {
     return 0;
   }
@@ -19,11 +17,11 @@ int pls_fib(int n) {
   }
   int a, b;
-  pls::spawn([n, &a]() {
-    a = pls_fib(n - 1);
+  pls::spawn([n, d, &a]() {
+    a = pls_fib(n - 1, d + 1);
   });
-  pls::spawn([n, &b]() {
-    b = pls_fib(n - 2);
+  pls::spawn([n, d, &b]() {
+    b = pls_fib(n - 2, d + 1);
   });
   pls::sync();
@@ -45,7 +43,7 @@ int main(int argc, char **argv) {
   // scheduler.get_profiler().disable_memory_measure();
   runner.run_iterations(fib::NUM_ITERATIONS, [&]() {
     scheduler.perform_work([&]() {
-      res = pls_fib(fib::INPUT_N);
+      res = pls_fib(fib::INPUT_N, 0);
     });
   }, fib::NUM_WARMUP_ITERATIONS);
   // scheduler.get_profiler().current_run().print_dag(std::cout);
......
add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp)
target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER)
    target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler)
endif ()
//#include "pls/pls.h"
//using namespace pls;

#include "benchmark_runner.h"

#include <memory>
#include <array>
#include <cstdio>
#include <math.h>
// Helpers to directly index into blocked matrices.
// BLOCK_LOOKUP[row][column] holds the offset of an element in the
// recursive, quadrant-wise (Z-order) memory layout.
const size_t MAX_BLOCK_LOOKUP = 256;
std::array<std::array<size_t, MAX_BLOCK_LOOKUP>, MAX_BLOCK_LOOKUP> BLOCK_LOOKUP; // ROW, COLUMN

void fill_block_lookup(size_t size) {
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }

  // Fill the top-left quadrant first, then derive the other three
  // quadrants by shifting its offsets by whole quadrant sizes.
  fill_block_lookup(size / 2);
  size_t elements_per_quarter = (size / 2) * (size / 2);
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
    }
  }
}
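
To make the layout concrete: fill_block_lookup produces a recursive Z-order numbering in which every quadrant (and sub-quadrant) is contiguous in memory. A small standalone check, added here purely for illustration and not part of the commit, prints the offsets for a 4x4 matrix:

// Illustration only (not part of the commit): offsets produced by
// fill_block_lookup(4). Each 2x2 quadrant occupies a contiguous range.
#include <cstdio>

void illustrate_layout() {
  fill_block_lookup(4);
  for (size_t row = 0; row < 4; row++) {
    for (size_t column = 0; column < 4; column++) {
      printf("%2zu ", BLOCK_LOOKUP[row][column]);
    }
    printf("\n");
  }
  // Prints:
  //  0  1  4  5
  //  2  3  6  7
  //  8  9 12 13
  // 10 11 14 15
}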
// Non-owning view onto a square matrix stored in the blocked layout above.
// quadrant_x_y() returns the view of the quadrant at block position (x, y),
// e.g. quadrant_1_2 is the top-right quadrant. at() also works on sub-views,
// since the lookup table for a smaller size is a prefix of the larger one.
class blocked_matrix_view {
 public:
  blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {}

  blocked_matrix_view quadrant_1_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_1_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 1 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_1() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 2 * elements_per_quarter, size_ / 2);
  }
  blocked_matrix_view quadrant_2_2() {
    size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
    return blocked_matrix_view(data_ + 3 * elements_per_quarter, size_ / 2);
  }

  double &at(size_t row, size_t column) {
    return data_[BLOCK_LOOKUP[row][column]];
  }

  double *get_data() {
    return data_;
  }

 private:
  double *data_;
  size_t size_;
};
// Straightforward triple-loop multiplication on the blocked layout. Serves
// as the recursion base case and as the reference result for verification.
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < size; j++) {
      result.at(i, j) = 0;
    }
    for (size_t j = 0; j < size; j++) {
      for (size_t k = 0; k < size; k++) {
        result.at(i, j) += a.at(i, k) * b.at(k, j);
      }
    }
  }
}
// Divide-and-conquer multiplication based on the 2x2 block decomposition:
//   R_11 = A_11 * B_11 + A_12 * B_21    R_12 = A_11 * B_12 + A_12 * B_22
//   R_21 = A_21 * B_11 + A_22 * B_21    R_22 = A_21 * B_12 + A_22 * B_22
void multiply_div_conquer(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  if (size <= 8) {
    multiply_naive(size, result, a, b);
    return;
  }

  // Temporary storage required for the intermediate results:
  // each result quadrant is the sum of two sub-products.
  std::unique_ptr<double[]> data_1_1_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_1_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_2_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_1_2_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_1_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_1_b{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_2_a{new double[(size / 2) * (size / 2)]};
  std::unique_ptr<double[]> data_2_2_b{new double[(size / 2) * (size / 2)]};

  // Handles to sub-matrices used
  blocked_matrix_view result_1_1 = result.quadrant_1_1();
  blocked_matrix_view result_1_2 = result.quadrant_1_2();
  blocked_matrix_view result_2_1 = result.quadrant_2_1();
  blocked_matrix_view result_2_2 = result.quadrant_2_2();

  blocked_matrix_view result_1_1_a{data_1_1_a.get(), size / 2};
  blocked_matrix_view result_1_1_b{data_1_1_b.get(), size / 2};
  blocked_matrix_view result_1_2_a{data_1_2_a.get(), size / 2};
  blocked_matrix_view result_1_2_b{data_1_2_b.get(), size / 2};
  blocked_matrix_view result_2_1_a{data_2_1_a.get(), size / 2};
  blocked_matrix_view result_2_1_b{data_2_1_b.get(), size / 2};
  blocked_matrix_view result_2_2_a{data_2_2_a.get(), size / 2};
  blocked_matrix_view result_2_2_b{data_2_2_b.get(), size / 2};

  blocked_matrix_view a_1_1 = a.quadrant_1_1();
  blocked_matrix_view a_1_2 = a.quadrant_1_2();
  blocked_matrix_view a_2_1 = a.quadrant_2_1();
  blocked_matrix_view a_2_2 = a.quadrant_2_2();
  blocked_matrix_view b_1_1 = b.quadrant_1_1();
  blocked_matrix_view b_1_2 = b.quadrant_1_2();
  blocked_matrix_view b_2_1 = b.quadrant_2_1();
  blocked_matrix_view b_2_2 = b.quadrant_2_2();

  // Divide work into the eight independent sub-multiplications
  multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1);
  multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1);
  multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2);
  multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2);
  multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1);
  multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1);
  multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2);
  multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2);

  // Combine results
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      result_1_1.at(row, column) = result_1_1_a.at(row, column) + result_1_1_b.at(row, column);
      result_1_2.at(row, column) = result_1_2_a.at(row, column) + result_1_2_b.at(row, column);
      result_2_1.at(row, column) = result_2_1_a.at(row, column) + result_2_1_b.at(row, column);
      result_2_2.at(row, column) = result_2_2_a.at(row, column) + result_2_2_b.at(row, column);
    }
  }
}
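
The eight sub-products are independent of one another, which is what makes this a natural fit for the task scheduler. A hypothetical parallel version of the recursive step, not in this commit (which runs the calls sequentially and keeps the pls include commented out), would follow the same spawn/sync pattern as the fib benchmark above:

// Hypothetical parallel recursive step (not in this commit): the eight
// sub-products are independent, so they can run as tasks; only the
// combine loop needs all results, hence the sync before it.
pls::spawn([&]() { multiply_div_conquer(size / 2, result_1_1_a, a_1_1, b_1_1); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_1_1_b, a_1_2, b_2_1); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_1_2_a, a_1_1, b_1_2); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_1_2_b, a_1_2, b_2_2); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_2_1_a, a_2_1, b_1_1); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_2_1_b, a_2_2, b_2_1); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_2_2_a, a_2_1, b_1_2); });
pls::spawn([&]() { multiply_div_conquer(size / 2, result_2_2_b, a_2_2, b_2_2); });
pls::sync();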
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 4096 * 1;

int main(int argc, char **argv) {
  fill_block_lookup(MAX_BLOCK_LOOKUP);

  size_t size = 64;
  std::unique_ptr<double[]> result_data_naive{new double[size * size]};
  std::unique_ptr<double[]> result_data_div{new double[size * size]};
  std::unique_ptr<double[]> a_data{new double[size * size]};
  std::unique_ptr<double[]> b_data{new double[size * size]};

  blocked_matrix_view result_naive{result_data_naive.get(), size};
  blocked_matrix_view result_div{result_data_div.get(), size};
  blocked_matrix_view a{a_data.get(), size};
  blocked_matrix_view b{b_data.get(), size};

  for (size_t row = 0; row < size; row++) {
    for (size_t column = 0; column < size; column++) {
      a.at(row, column) = row;
      b.at(row, column) = column;
    }
  }

  multiply_div_conquer(size, result_div, a, b);
  multiply_naive(size, result_naive, a, b);
  // Verify the divide-and-conquer result against the naive multiplication.
  size_t misses = 0;
  for (size_t row = 0; row < size; row++) {
    for (size_t column = 0; column < size; column++) {
      if (result_div.at(row, column) != result_naive.at(row, column)) {
        misses++;
        printf("%5.5f\t\t", result_div.at(row, column) - result_naive.at(row, column));
      }
    }
  }
  printf("\n%zu\n", misses);

  // int num_threads;
  // string directory;
  // benchmark_runner::read_args(argc, argv, num_threads, directory);
  //
  // string test_name = to_string(num_threads) + ".csv";
  // string full_directory = directory + "/PLS_v3/";
  // benchmark_runner runner{full_directory, test_name};
  //
  // scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
  //
  // runner.run_iterations(1000, [&]() {
  //   scheduler.perform_work([&]() {
  //   });
  // }, 100);
  // runner.commit_results(true);
}
@@ -48,7 +48,7 @@ add_library(pls STATIC
         include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp
         include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp
-        include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp)
+        include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp include/pls/algorithms/divide_and_conquer_buffers.h)

# Dependencies for pls
target_link_libraries(pls Threads::Threads)
......
#ifndef PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
#define PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_

#endif //PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
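
This header is still an empty stub. One plausible direction, sketched here purely as an assumption (none of these names exist in the library; the sketch simply mirrors the per-call new double[] temporaries of the matrix example above as a pre-allocated, depth-indexed pool):

// Hypothetical sketch only -- the commit itself adds nothing but the guard.
// Pre-allocated scratch buffers, indexed by recursion depth, that could
// replace the per-call heap allocations in divide-and-conquer algorithms.
#include <cstddef>
#include <memory>
#include <vector>

class scratch_buffer_pool {
 public:
  scratch_buffer_pool(size_t max_depth, size_t buffers_per_depth, size_t max_buffer_size)
      : buffers_(max_depth) {
    for (auto &depth_buffers : buffers_) {
      for (size_t i = 0; i < buffers_per_depth; i++) {
        depth_buffers.emplace_back(new double[max_buffer_size]);
      }
    }
  }

  // Calls at the same recursion depth reuse the same storage; a parallel
  // version would additionally need to key buffers by task/strain.
  double *get(size_t depth, size_t index) { return buffers_[depth][index].get(); }

 private:
  std::vector<std::vector<std::unique_ptr<double[]>>> buffers_;
};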
@@ -47,7 +47,7 @@ constexpr size_t CACHE_LINE_SIZE = 64;
 /**
  * Helper to align types/values on cache lines.
  */
-#define PLS_CACHE_ALIGN alignas(base::system_details::CACHE_LINE_SIZE)
+#define PLS_CACHE_ALIGN alignas(::pls::internal::base::system_details::CACHE_LINE_SIZE)
 /**
  * Helper to find mmap page size. Either set constant or rely on system specific getter function.
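
The only change here is fully qualifying the alignas argument, so the macro also expands correctly when used from other namespaces (as strain_resource below does). Illustrative usage, not taken from the commit:

// Illustrative usage (not from the commit): pad a struct to a cache line
// so that neighboring instances in an array cannot false-share.
#include <atomic>

struct PLS_CACHE_ALIGN aligned_counter {
  std::atomic<long> value_{0};
};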
......
@@ -24,7 +24,7 @@ struct PLS_CACHE_ALIGN strain_resource {
   unsigned const index_;
   unsigned const depth_;
-  bool used_{false};
+  std::atomic<int> used_{0};
   std::atomic<strain_resource *> next_{nullptr};
 };
@@ -48,7 +48,7 @@ class strain_local_resource {
     item_handle(item_handle &&) = delete;
     item_handle &operator=(item_handle &&) = delete;
-    explicit item_handle(strain_resource *resource) : resource_{resource} {}
+    explicit item_handle(strain_resource *resource);
     ~item_handle();
     unsigned get_strain_index() {
@@ -56,7 +56,7 @@ class strain_local_resource {
     }
    private:
-    strain_resource *resource_;
+    strain_resource *const resource_;
   };

   strain_local_resource(unsigned num_threads,
......
@@ -23,11 +23,14 @@ strain_local_resource::item_handle strain_local_resource::get_item(unsigned dept
   active_task->attached_resources_.store(result, std::memory_order_relaxed);
   // Wrap it for RAII usage on stack
-  PLS_ASSERT(!result->used_, "Must not try to allocate an already used resource!");
-  result->used_ = true;
   return strain_local_resource::item_handle{result};
 }
+strain_local_resource::item_handle::item_handle(strain_resource *resource) : resource_{resource} {
+  PLS_ASSERT(resource_->used_.fetch_add(1, std::memory_order_relaxed) == 0,
+             "Must not create a handle of an already used resource!");
+}
 // Return item to locally owned items
 strain_local_resource::item_handle::~item_handle() {
   // Only change our resource usage when synced.
@@ -54,8 +57,8 @@ strain_local_resource::item_handle::~item_handle() {
   // Give the resource handle back to our local resource array
   auto &local_resource = resource_->strain_local_resource_->local_items_[my_state.get_thread_id()][resource_->depth_];
   local_resource.resource_ = resource_;
-  PLS_ASSERT(resource_->used_, "Must only release used resources!");
-  resource_->used_ = false;
+  PLS_ASSERT(resource_->used_.fetch_sub(1, std::memory_order_relaxed) == 1,
+             "Accidentally freed resource that was accessed multiple times!");
 }
 strain_resource *strain_local_resource::get_local_copy(strain_resource *other_resources, unsigned thread_id) {
......
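
The net effect of this diff: the used flag becomes an atomic counter, and acquisition moves into the item_handle constructor, so both directions of the RAII pair are checked. A minimal standalone model of the pattern (illustrative, not the library's code):

// Standalone model (illustrative, not the library's code) of the checked
// single-ownership pattern: acquire must observe a use count of 0,
// release must observe exactly 1.
#include <atomic>
#include <cassert>

struct checked_resource {
  std::atomic<int> used_{0};
};

class checked_handle {
 public:
  explicit checked_handle(checked_resource *r) : r_{r} {
    int previous = r_->used_.fetch_add(1, std::memory_order_relaxed);
    assert(previous == 0 && "must not wrap an already used resource");
    (void) previous;
  }
  ~checked_handle() {
    int previous = r_->used_.fetch_sub(1, std::memory_order_relaxed);
    assert(previous == 1 && "resource was accessed while the handle held it");
    (void) previous;
  }
  checked_handle(const checked_handle &) = delete;
  checked_handle &operator=(const checked_handle &) = delete;

 private:
  checked_resource *const r_;
};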