Commit 9fa9296a by FritzFlorian

Final version ready for benchmarking.

parent 7d090b3c
Pipeline #1542 failed with stages
in 61 minutes 12 seconds
...@@ -43,6 +43,9 @@ ADD_CUSTOM_TARGET(install.pls ...@@ -43,6 +43,9 @@ ADD_CUSTOM_TARGET(install.pls
-P ${CMAKE_BINARY_DIR}/cmake_install.cmake) -P ${CMAKE_BINARY_DIR}/cmake_install.cmake)
ADD_DEPENDENCIES(install.pls context_switcher pls) ADD_DEPENDENCIES(install.pls context_switcher pls)
# ... second custom target to only build the benchmarks.
ADD_CUSTOM_TARGET(benchmark.pls)
# Include examples # Include examples
add_subdirectory(app/playground) add_subdirectory(app/playground)
add_subdirectory(app/benchmark_fft) add_subdirectory(app/benchmark_fft)
......
add_executable(benchmark_fft_pls_v3 main.cpp) add_executable(benchmark_fft_pls_v3 main.cpp)
target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
ADD_DEPENDENCIES(benchmark.pls benchmark_fft_pls_v3)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_fft_pls_v3 easy_profiler) target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
endif () endif ()
add_executable(benchmark_fib_pls_v3 main.cpp) add_executable(benchmark_fib_pls_v3 main.cpp)
target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base)
ADD_DEPENDENCIES(benchmark.pls benchmark_fib_pls_v3)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_fib_pls_v3 easy_profiler) target_link_libraries(benchmark_fib_pls_v3 easy_profiler)
endif () endif ()
add_executable(benchmark_matrix_pls_v3 main.cpp) add_executable(benchmark_matrix_pls_v3 main.cpp)
target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base)
ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_pls_v3)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_matrix_pls_v3 easy_profiler) target_link_libraries(benchmark_matrix_pls_v3 easy_profiler)
endif () endif ()
...@@ -37,6 +37,12 @@ int main(int argc, char **argv) { ...@@ -37,6 +37,12 @@ int main(int argc, char **argv) {
pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE};
if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
#if PLS_PROFILING_ENABLED
scheduler.get_profiler().disable_memory_measure();
runner.add_custom_stats_field("T_1");
runner.add_custom_stats_field("T_inf");
#endif
printf("Running isolated measurement...\n"); printf("Running isolated measurement...\n");
runner.enable_memory_stats(); runner.enable_memory_stats();
runner.pre_allocate_stats(); runner.pre_allocate_stats();
...@@ -45,6 +51,11 @@ int main(int argc, char **argv) { ...@@ -45,6 +51,11 @@ int main(int argc, char **argv) {
scheduler.perform_work([&]() { scheduler.perform_work([&]() {
result.multiply(a, b); result.multiply(a, b);
}); });
}, [&]() {}, [&]() {
#if PLS_PROFILING_ENABLED
runner.store_custom_stat("T_1", scheduler.get_profiler().current_run().t_1_);
runner.store_custom_stat("T_inf", scheduler.get_profiler().current_run().t_inf_);
#endif
}); });
runner.commit_results(true); runner.commit_results(true);
} else { } else {
......
add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp) add_executable(benchmark_matrix_div_conquer_pls_v3 main.cpp)
target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_matrix_div_conquer_pls_v3 pls benchmark_runner benchmark_base)
ADD_DEPENDENCIES(benchmark.pls benchmark_matrix_div_conquer_pls_v3)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler) target_link_libraries(benchmark_matrix_div_conquer_pls_v3 easy_profiler)
endif () endif ()
...@@ -92,12 +92,10 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ ...@@ -92,12 +92,10 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
pls::spawn( pls::spawn(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); } [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 6, result_2_2_a, a_2_1, b_1_2); }
); );
pls::spawn( pls::spawn_and_sync(
[&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); } [&]() { multiply_div_conquer(tmp_arrays, local_indices, size / 2, depth + 1, 7, result_2_2_b, a_2_2, b_2_2); }
); );
pls::sync();
// Combine results // Combine results
for (size_t i = 0; i < (size / 2) * (size / 2); i++) { for (size_t i = 0; i < (size / 2) * (size / 2); i++) {
// The layout is not important here, as all have the same order, so just sum element wise // The layout is not important here, as all have the same order, so just sum element wise
...@@ -108,7 +106,6 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_ ...@@ -108,7 +106,6 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
} }
} }
constexpr int MAX_NUM_TASKS = 10;
constexpr int MAX_STACK_SIZE = 4096 * 1; constexpr int MAX_STACK_SIZE = 4096 * 1;
int main(int argc, char **argv) { int main(int argc, char **argv) {
...@@ -128,7 +125,6 @@ int main(int argc, char **argv) { ...@@ -128,7 +125,6 @@ int main(int argc, char **argv) {
// Fill data arrays as needed // Fill data arrays as needed
a.fill_default_data(); a.fill_default_data();
b.fill_default_data(); b.fill_default_data();
matrix_div_conquer::fill_block_lookup(size);
// Strain local data // Strain local data
std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays; std::vector<std::vector<std::vector<std::unique_ptr<double[]>>>> div_conquer_temp_arrays;
...@@ -138,7 +134,7 @@ int main(int argc, char **argv) { ...@@ -138,7 +134,7 @@ int main(int argc, char **argv) {
while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) { while (remaining_size > matrix_div_conquer::CUTOFF_SIZE) {
auto &depth_buffers = div_conquer_temp_arrays.emplace_back(); auto &depth_buffers = div_conquer_temp_arrays.emplace_back();
buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_); buffers_needed = std::min(buffers_needed, (size_t) settings.num_threads_);
for (int thread_id = 0; thread_id < buffers_needed; thread_id++) { for (size_t thread_id = 0; thread_id < buffers_needed; thread_id++) {
auto &depth_thread_buffers = depth_buffers.emplace_back(); auto &depth_thread_buffers = depth_buffers.emplace_back();
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2); size_t matrix_elements = (remaining_size / 2) * (remaining_size / 2);
...@@ -159,7 +155,7 @@ int main(int argc, char **argv) { ...@@ -159,7 +155,7 @@ int main(int argc, char **argv) {
string full_directory = settings.output_directory_ + "/PLS_v3/"; string full_directory = settings.output_directory_ + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name}; benchmark_runner runner{full_directory, test_name};
pls::scheduler scheduler{(unsigned) settings.num_threads_, MAX_NUM_TASKS, MAX_STACK_SIZE}; pls::scheduler scheduler{(unsigned) settings.num_threads_, max_depth + 2, MAX_STACK_SIZE};
if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) { if (settings.type_ == benchmark_runner::benchmark_settings::ISOLATED) {
printf("Running isolated measurement...\n"); printf("Running isolated measurement...\n");
......
add_executable(benchmark_unbalanced_pls_v3 main.cpp) add_executable(benchmark_unbalanced_pls_v3 main.cpp)
target_link_libraries(benchmark_unbalanced_pls_v3 benchmark_runner benchmark_base pls) target_link_libraries(benchmark_unbalanced_pls_v3 benchmark_runner benchmark_base pls)
ADD_DEPENDENCIES(benchmark.pls benchmark_unbalanced_pls_v3)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_unbalanced_pls_v3 easy_profiler) target_link_libraries(benchmark_unbalanced_pls_v3 easy_profiler)
endif () endif ()
...@@ -31,7 +31,7 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi ...@@ -31,7 +31,7 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
return count_child_nodes(root); return count_child_nodes(root);
} }
constexpr int MAX_NUM_TASKS = 256; constexpr int MAX_NUM_TASKS = 180;
constexpr int MAX_STACK_SIZE = 4096 * 1; constexpr int MAX_STACK_SIZE = 4096 * 1;
int main(int argc, char **argv) { int main(int argc, char **argv) {
......
...@@ -9,7 +9,7 @@ add_library(benchmark_base STATIC ...@@ -9,7 +9,7 @@ add_library(benchmark_base STATIC
include/benchmark_base/unbalanced.h src/unbalanced.cpp include/benchmark_base/unbalanced.h src/unbalanced.cpp
include/benchmark_base/range.h include/benchmark_base/range.h
include/benchmark_base/fib.h include/benchmark_base/fib.h
include/benchmark_base/matrix_div_conquer.h) include/benchmark_base/matrix_div_conquer.h src/matrix_div_conquer.cpp)
target_include_directories(benchmark_base target_include_directories(benchmark_base
PUBLIC PUBLIC
......
#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H #ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
#define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H #define COMPARISON_BENCHMARKS_BASE_MATRIX_DIV_CONQUER_H
#include <array> #include <vector>
#include <cstdio>
#include <cstdlib>
namespace comparison_benchmarks { namespace comparison_benchmarks {
namespace base { namespace base {
namespace matrix_div_conquer { namespace matrix_div_conquer {
const int MATRIX_SIZE = 128;
const int CUTOFF_SIZE = 8; const int CUTOFF_SIZE = 8;
const int NUM_ITERATIONS = 100;
const int WARMUP_ITERATIONS = 10;
// Helpers to directly index into blocked matrices
const size_t MAX_SIZE = 128;
std::array<std::array<size_t, MAX_SIZE>, MAX_SIZE> BLOCK_LOOKUP; // ROW, COLUMN
// Recursively fills the global BLOCK_LOOKUP table with the flat-array offset
// of each (row, column) element in the recursive "blocked" matrix layout:
// the top-left quadrant is computed first, then the other three quadrants
// reuse those offsets shifted by whole quadrant sizes.
// Assumes `size` is a power of two (it is halved down to 1) and
// size <= MAX_SIZE — TODO confirm callers guarantee this.
void fill_block_lookup(size_t size = MAX_SIZE) {
  // Base case: a 1x1 block — its single element lives at offset 0.
  if (size <= 1) {
    BLOCK_LOOKUP[0][0] = 0;
    return;
  }
  // First fill the lookup for the top-left quadrant ...
  fill_block_lookup(size / 2);
  size_t elements_per_quarter = (size / 2) * (size / 2);
  // ... then derive the remaining quadrants by adding whole-quadrant offsets:
  // top-right (+1 quarter), bottom-left (+2), bottom-right (+3).
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[row][size / 2 + column] = BLOCK_LOOKUP[row][column] + elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][column] = BLOCK_LOOKUP[row][column] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[size / 2 + row][size / 2 + column] = BLOCK_LOOKUP[row][column] + 3 * elements_per_quarter;
    }
  }
}
// NOTE(review): defining this non-inline function (and the BLOCK_LOOKUP global
// above it) in a header violates the ODR once the header is included from more
// than one translation unit — presumably why this commit moves it to a .cpp.
class blocked_matrix_view { class blocked_matrix_view {
public: public:
blocked_matrix_view(double *data, size_t size) : data_{data}, size_{size} {} blocked_matrix_view(double *data, size_t size) : data_{data},
size_{size} {
void fill_default_data() { if (size > BLOCK_LOOKUPS_SIZE) {
for (size_t row = 0; row < size_; row++) { init_block_lookup(size);
for (size_t column = 0; column < size_; column++) {
at(row, column) = row;
}
} }
} }
void fill_default_data();
blocked_matrix_view quadrant_1_1() { blocked_matrix_view quadrant_1_1() {
size_t elements_per_quarter = (size_ / 2) * (size_ / 2); size_t elements_per_quarter = (size_ / 2) * (size_ / 2);
return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2); return blocked_matrix_view(data_ + 0 * elements_per_quarter, size_ / 2);
...@@ -64,7 +40,7 @@ class blocked_matrix_view { ...@@ -64,7 +40,7 @@ class blocked_matrix_view {
} }
double &at(size_t row, size_t column) { double &at(size_t row, size_t column) {
return data_[BLOCK_LOOKUP[row][column]]; return data_[BLOCK_LOOKUPS[block_lookup_at(row, column)]];
} }
double *get_data() { double *get_data() {
...@@ -72,22 +48,21 @@ class blocked_matrix_view { ...@@ -72,22 +48,21 @@ class blocked_matrix_view {
} }
private: private:
double *data_; double *const data_;
size_t size_; const size_t size_;
};
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) { // Lookup indices for non divide-conquer block lookups
for (size_t i = 0; i < size; i++) { static std::vector<size_t> BLOCK_LOOKUPS;
for (size_t j = 0; j < size; j++) { static size_t BLOCK_LOOKUPS_SIZE;
result.at(i, j) = 0;
} static void fill_block_lookup(size_t size, std::vector<size_t> &BLOCK_LOOKUP);
for (size_t j = 0; j < size; j++) { static void init_block_lookup(size_t max_size);
for (size_t k = 0; k < size; k++) { static size_t block_lookup_at(size_t row, size_t column) {
result.at(i, j) += a.at(i, k) * b.at(k, j); return row * BLOCK_LOOKUPS_SIZE + column;
}
}
} }
} };
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b);
} }
} }
......
#include <cstdio>
#include <cstdlib>
#include "benchmark_base/matrix_div_conquer.h"
namespace comparison_benchmarks {
namespace base {
namespace matrix_div_conquer {
// Plain O(size^3) multiplication of two blocked matrices: result = a * b.
// Each output row is cleared first and then filled with the row/column
// dot products, one output element at a time.
void multiply_naive(size_t size, blocked_matrix_view &result, blocked_matrix_view &a, blocked_matrix_view &b) {
  for (size_t row = 0; row != size; ++row) {
    // Zero the output row before accumulating into it.
    for (size_t col = 0; col != size; ++col) {
      result.at(row, col) = 0;
    }
    // Accumulate the dot product of a's row with each column of b.
    for (size_t col = 0; col != size; ++col) {
      for (size_t inner = 0; inner != size; ++inner) {
        result.at(row, col) += a.at(row, inner) * b.at(inner, col);
      }
    }
  }
}
// Fills the matrix with a deterministic test pattern: every element of a
// row is set to that row's index.
void blocked_matrix_view::fill_default_data() {
  for (size_t r = 0; r != size_; ++r) {
    for (size_t c = 0; c != size_; ++c) {
      at(r, c) = r;
    }
  }
}
std::vector<size_t> blocked_matrix_view::BLOCK_LOOKUPS;
size_t blocked_matrix_view::BLOCK_LOOKUPS_SIZE;
// Fills BLOCK_LOOKUP (a flat table indexed via block_lookup_at, i.e. with
// stride BLOCK_LOOKUPS_SIZE) with the data-array offset of each
// (row, column) element in the recursive "blocked" layout. The top-left
// quadrant is computed first; the other three quadrants reuse it shifted
// by one, two and three whole quadrants.
// Assumes `size` is a power of two (it is halved down to 1), that the
// vector holds at least BLOCK_LOOKUPS_SIZE * BLOCK_LOOKUPS_SIZE entries,
// and that BLOCK_LOOKUPS_SIZE is already set — TODO confirm call sites
// (init_block_lookup sets it before calling).
void blocked_matrix_view::fill_block_lookup(size_t size, std::vector<size_t> &BLOCK_LOOKUP) {
  // Base case: single element at offset 0.
  if (size <= 1) {
    BLOCK_LOOKUP[block_lookup_at(0, 0)] = 0;
    return;
  }
  // First fill the top-left quadrant ...
  fill_block_lookup(size / 2, BLOCK_LOOKUP);
  size_t elements_per_quarter = (size / 2) * (size / 2);
  // ... then derive the remaining quadrants by adding quadrant offsets:
  // top-right (+1 quarter), bottom-left (+2), bottom-right (+3).
  for (size_t row = 0; row < size / 2; row++) {
    for (size_t column = 0; column < size / 2; column++) {
      BLOCK_LOOKUP[block_lookup_at(row, size / 2 + column)] =
          BLOCK_LOOKUP[block_lookup_at(row, column)] + elements_per_quarter;
      BLOCK_LOOKUP[block_lookup_at(size / 2 + row, column)] =
          BLOCK_LOOKUP[block_lookup_at(row, column)] + 2 * elements_per_quarter;
      BLOCK_LOOKUP[block_lookup_at(size / 2 + row, size / 2 + column)] =
          BLOCK_LOOKUP[block_lookup_at(row, column)] + 3 * elements_per_quarter;
    }
  }
}
// (Re)builds the shared block-offset lookup table if the cached one is too
// small for a matrix of side length `max_size`.
//
// Bug fix: BLOCK_LOOKUPS stores side * side entries, so the old guard
// `BLOCK_LOOKUPS.size() < max_size` compared an element count against a
// side length. A table built for side N (N*N entries) was then wrongly
// considered large enough for any side up to N*N, the table was never
// regrown, and block_lookup_at kept indexing with the stale
// BLOCK_LOOKUPS_SIZE stride. Compare side lengths instead.
void blocked_matrix_view::init_block_lookup(size_t max_size) {
  if (BLOCK_LOOKUPS_SIZE < max_size) {
    BLOCK_LOOKUPS = std::vector<size_t>(max_size * max_size);
    // Set the stride before filling: fill_block_lookup indexes through
    // block_lookup_at, which reads BLOCK_LOOKUPS_SIZE.
    BLOCK_LOOKUPS_SIZE = max_size;
    fill_block_lookup(max_size, BLOCK_LOOKUPS);
  }
}
}
}
}
...@@ -299,7 +299,7 @@ class benchmark_runner { ...@@ -299,7 +299,7 @@ class benchmark_runner {
long wall_time_us = 0; long wall_time_us = 0;
wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l; wall_time_us += (finish_time.tv_sec - iteration_start.tv_sec) * 1000l * 1000l;
wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l; wall_time_us += ((long) finish_time.tv_nsec - (long) iteration_start.tv_nsec) / 1000l;
printf("Difference: %d\n", wall_time_us - times_[current_iteration]); printf("Difference: %ld\n", wall_time_us - times_[current_iteration]);
times_[current_iteration] = wall_time_us; times_[current_iteration] = wall_time_us;
if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) { if (finish_time.tv_sec >= deadline_end.tv_sec && finish_time.tv_nsec > deadline_end.tv_nsec) {
......
...@@ -49,7 +49,7 @@ add_library(pls STATIC ...@@ -49,7 +49,7 @@ add_library(pls STATIC
include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp include/pls/internal/profiling/dag_node.h src/internal/profiling/dag_node.cpp
include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp include/pls/internal/profiling/profiler.h src/internal/profiling/profiler.cpp
include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp include/pls/algorithms/divide_and_conquer_buffers.h) include/pls/internal/profiling/thread_stats.h src/internal/profiling/thread_stats.cpp)
# Dependencies for pls # Dependencies for pls
target_link_libraries(pls Threads::Threads) target_link_libraries(pls Threads::Threads)
......
#ifndef PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
#define PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
#endif //PLS_ALGORITHMS_DIVIDE_AND_CONQUER_BUFFERS_H_
...@@ -7,14 +7,14 @@ ...@@ -7,14 +7,14 @@
namespace pls::algorithm { namespace pls::algorithm {
template<typename Function, typename ExecutionStrategy> template<typename Function, typename ExecutionStrategy>
static void for_each_range(unsigned long first, static void for_each_range(long first,
unsigned long last, long last,
const Function &function, const Function &function,
ExecutionStrategy &execution_strategy); ExecutionStrategy &execution_strategy);
template<typename Function> template<typename Function>
static void for_each_range(unsigned long first, static void for_each_range(long first,
unsigned long last, long last,
const Function &function); const Function &function);
template<typename RandomIt, typename Function, typename ExecutionStrategy> template<typename RandomIt, typename Function, typename ExecutionStrategy>
......
...@@ -10,10 +10,10 @@ namespace pls::algorithm { ...@@ -10,10 +10,10 @@ namespace pls::algorithm {
namespace internal { namespace internal {
template<typename RandomIt, typename Function> template<typename RandomIt, typename Function>
static void for_each(const RandomIt first, static void for_each_iterator(const RandomIt first,
const RandomIt last, const RandomIt last,
const Function &function, const Function &function,
const long min_elements) { const size_t min_elements) {
using namespace ::pls::internal::scheduling; using namespace ::pls::internal::scheduling;
const long num_elements = std::distance(first, last); const long num_elements = std::distance(first, last);
...@@ -27,16 +27,48 @@ static void for_each(const RandomIt first, ...@@ -27,16 +27,48 @@ static void for_each(const RandomIt first,
const long middle_index = num_elements / 2; const long middle_index = num_elements / 2;
scheduler::spawn([first, middle_index, last, &function, min_elements] { scheduler::spawn([first, middle_index, last, &function, min_elements] {
internal::for_each(first, internal::for_each_iterator(first,
first + middle_index, first + middle_index,
function, function,
min_elements); min_elements);
});
scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] {
internal::for_each_iterator(first + middle_index,
last,
function,
min_elements);
});
}
}
template<typename Function>
static void for_each_range(const long first,
const long last,
const Function &function,
const size_t min_elements) {
using namespace ::pls::internal::scheduling;
const long num_elements = last - first;
if (num_elements <= min_elements) {
// calculate last elements in loop to avoid overhead
for (auto current = first; current != last; current++) {
function(current);
}
} else {
// Cut in half recursively
const long middle_index = num_elements / 2;
scheduler::spawn([first, middle_index, last, &function, min_elements] {
internal::for_each_range(first,
first + middle_index,
function,
min_elements);
}); });
scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] { scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] {
internal::for_each(first + middle_index, internal::for_each_range(first + middle_index,
last, last,
function, function,
min_elements); min_elements);
}); });
} }
} }
...@@ -44,15 +76,14 @@ static void for_each(const RandomIt first, ...@@ -44,15 +76,14 @@ static void for_each(const RandomIt first,
} }
template<typename RandomIt, typename Function, typename ExecutionStrategy> template<typename RandomIt, typename Function, typename ExecutionStrategy>
static void for_each(RandomIt static void for_each(RandomIt first,
first,
RandomIt last, RandomIt last,
const Function &function, const Function &function,
ExecutionStrategy ExecutionStrategy
execution_strategy) { execution_strategy) {
long num_elements = std::distance(first, last); long num_elements = std::distance(first, last);
return return
internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements)); internal::for_each_iterator(first, last, function, execution_strategy.calculate_min_elements(num_elements));
} }
template<typename RandomIt, typename Function> template<typename RandomIt, typename Function>
...@@ -61,20 +92,19 @@ static void for_each(RandomIt first, RandomIt last, const Function &function) { ...@@ -61,20 +92,19 @@ static void for_each(RandomIt first, RandomIt last, const Function &function) {
} }
template<typename Function, typename ExecutionStrategy> template<typename Function, typename ExecutionStrategy>
static void for_each_range(unsigned long first, static void for_each_range(long first,
unsigned long last, long last,
const Function &function, const Function &function,
ExecutionStrategy execution_strategy) { ExecutionStrategy execution_strategy) {
auto range = pls::internal::helpers::range(first, last); long num_elements = last - first;
return for_each(range.begin(), range.end(), function, execution_strategy); return internal::for_each_range(first, last, function, execution_strategy.calculate_min_elements(num_elements));
} }
template<typename Function> template<typename Function>
static void for_each_range(unsigned long first, static void for_each_range(long first,
unsigned long last, long last,
const Function &function) { const Function &function) {
auto range = pls::internal::helpers::range(first, last); return for_each_range(first, last, function, dynamic_strategy{4});
return for_each(range.begin(), range.end(), function);
} }
} }
......
...@@ -18,4 +18,9 @@ void pls_error(const char *msg); ...@@ -18,4 +18,9 @@ void pls_error(const char *msg);
// TODO: Distinguish between debug/internal asserts and production asserts. // TODO: Distinguish between debug/internal asserts and production asserts.
#define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); } #define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
// Enable/Disable more expensive asserts.
// On very small workloads also the 'normal' asserts can be disabled for more performance.
//#define PLS_ASSERT_EXPENSIVE(cond, msg) if (!(cond)) { pls_error(msg); }
#define PLS_ASSERT_EXPENSIVE(cond, msg)
#endif //PLS_ERROR_HANDLING_H #endif //PLS_ERROR_HANDLING_H
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include <cstddef> #include <cstddef>
namespace pls::internal::base { namespace pls::internal::base {
class stack_allocator { class stack_allocator {
public: public:
virtual char *allocate_stack(size_t size) = 0; virtual char *allocate_stack(size_t size) = 0;
......
...@@ -56,12 +56,13 @@ struct base_task { ...@@ -56,12 +56,13 @@ struct base_task {
} }
// General task information // General task information
unsigned depth_; const unsigned depth_;
unsigned thread_id_; const unsigned thread_id_;
// Stack/continuation management // Stack/continuation management
char *stack_memory_; char * const stack_memory_;
size_t stack_size_; const size_t stack_size_;
context_switcher::continuation continuation_; context_switcher::continuation continuation_;
bool is_synchronized_; bool is_synchronized_;
bool is_serial_section_; bool is_serial_section_;
......
...@@ -30,11 +30,9 @@ struct task : public base_task { ...@@ -30,11 +30,9 @@ struct task : public base_task {
static task *find_task(unsigned id, unsigned depth); static task *find_task(unsigned id, unsigned depth);
private: private:
std::atomic<int> num_resources_{};
// STAMP = thread id of 'owning' thread before task was inserted into stack. // STAMP = thread id of 'owning' thread before task was inserted into stack.
// VALUE = next item in stack, indicated by thread ID. // VALUE = next item in stack, indicated by thread ID.
std::atomic<data_structures::stamped_integer> resource_stack_next_{{0, 0}}; PLS_CACHE_ALIGN std::atomic<data_structures::stamped_integer> resource_stack_next_{{0, 0}};
// STAMP = CAS stamp, half CAS length (16 or 32 Bit) // STAMP = CAS stamp, half CAS length (16 or 32 Bit)
// VALUE = Root of the actual stack, indicated by thread ID (16 or 32 Bit) // VALUE = Root of the actual stack, indicated by thread ID (16 or 32 Bit)
......
...@@ -63,6 +63,20 @@ scheduler::scheduler(unsigned int num_threads, ...@@ -63,6 +63,20 @@ scheduler::scheduler(unsigned int num_threads,
work_thread_main_loop(); work_thread_main_loop();
}); });
} }
// Make sure all threads are created and touched their stacks.
// Executing a work section ensures one wakeup/sleep cycle of all workers
// and explicitly forcing one task per worker forces them to initialize their stacks.
std::atomic<unsigned> num_spawned;
this->perform_work([&]() {
for (unsigned i = 0; i < num_threads; i++) {
spawn([&]() {
num_spawned++;
while (num_spawned < num_threads) std::this_thread::yield();
});
}
sync();
});
} }
class scheduler::init_function { class scheduler::init_function {
...@@ -195,7 +209,7 @@ void scheduler::spawn_internal(Function &&lambda) { ...@@ -195,7 +209,7 @@ void scheduler::spawn_internal(Function &&lambda) {
#if PLS_SLEEP_WORKERS_ON_EMPTY #if PLS_SLEEP_WORKERS_ON_EMPTY
// TODO: relax atomic operations on empty flag // TODO: relax atomic operations on empty flag
data_structures::stamped_integer queue_empty_flag = spawning_state.get_queue_empty_flag().load(); data_structures::stamped_integer queue_empty_flag = spawning_state.get_queue_empty_flag().load();
switch (queue_empty_flag.value) { switch (queue_empty_flag.value_) {
case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: { case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: {
// The queue was not found empty, ignore it. // The queue was not found empty, ignore it.
break; break;
...@@ -203,9 +217,9 @@ void scheduler::spawn_internal(Function &&lambda) { ...@@ -203,9 +217,9 @@ void scheduler::spawn_internal(Function &&lambda) {
case EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY: { case EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY: {
// Someone tries to mark us empty and might be re-stealing right now. // Someone tries to mark us empty and might be re-stealing right now.
data_structures::stamped_integer data_structures::stamped_integer
queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
auto actual_empty_flag = spawning_state.get_queue_empty_flag().exchange(queue_non_empty_flag); auto actual_empty_flag = spawning_state.get_queue_empty_flag().exchange(queue_non_empty_flag);
if (actual_empty_flag.value == EMPTY_QUEUE_STATE::QUEUE_EMPTY) { if (actual_empty_flag.value_ == EMPTY_QUEUE_STATE::QUEUE_EMPTY) {
spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake(); spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake();
} }
break; break;
...@@ -213,7 +227,7 @@ void scheduler::spawn_internal(Function &&lambda) { ...@@ -213,7 +227,7 @@ void scheduler::spawn_internal(Function &&lambda) {
case EMPTY_QUEUE_STATE::QUEUE_EMPTY: { case EMPTY_QUEUE_STATE::QUEUE_EMPTY: {
// Someone already marked the queue empty, we must revert its action on the central queue. // Someone already marked the queue empty, we must revert its action on the central queue.
data_structures::stamped_integer data_structures::stamped_integer
queue_non_empty_flag{queue_empty_flag.stamp++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY}; queue_non_empty_flag{queue_empty_flag.stamp_++, EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY};
spawning_state.get_queue_empty_flag().store(queue_non_empty_flag); spawning_state.get_queue_empty_flag().store(queue_non_empty_flag);
spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake(); spawning_state.get_scheduler().empty_queue_decrease_counter_and_wake();
break; break;
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/strain_local_resource.h" #include "pls/internal/scheduling/strain_local_resource.h"
#include "pls/internal/base/stack_allocator.h"
#include "pls/internal/helpers/range.h" #include "pls/internal/helpers/range.h"
#include "pls/internal/helpers/member_function.h" #include "pls/internal/helpers/member_function.h"
...@@ -18,6 +19,8 @@ namespace pls { ...@@ -18,6 +19,8 @@ namespace pls {
// 'basic' for-join APIs // 'basic' for-join APIs
using internal::scheduling::scheduler; using internal::scheduling::scheduler;
using internal::base::heap_stack_allocator;
using internal::base::mmap_stack_allocator;
template<typename Function> template<typename Function>
static void spawn(Function &&function) { static void spawn(Function &&function) {
scheduler::spawn(std::forward<Function>(function)); scheduler::spawn(std::forward<Function>(function));
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
namespace pls::internal::scheduling::lock_free { namespace pls::internal::scheduling::lock_free {
traded_cas_field external_trading_deque::peek_traded_object(task *target_task) { traded_cas_field external_trading_deque::peek_traded_object(task *target_task) {
traded_cas_field current_cas = target_task->external_trading_deque_cas_.load(); traded_cas_field current_cas = target_task->external_trading_deque_cas_.load(std::memory_order_relaxed);
return current_cas; return current_cas;
} }
...@@ -17,7 +17,9 @@ task *external_trading_deque::get_trade_object(task *target_task, ...@@ -17,7 +17,9 @@ task *external_trading_deque::get_trade_object(task *target_task,
traded_cas_field empty_cas = peeked_cas; traded_cas_field empty_cas = peeked_cas;
empty_cas.make_empty(); empty_cas.make_empty();
if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas, empty_cas)) { if (target_task->external_trading_deque_cas_.compare_exchange_strong(current_cas,
empty_cas,
std::memory_order_acq_rel)) {
task *result = task::find_task(result_id, target_task->depth_); task *result = task::find_task(result_id, target_task->depth_);
return result; return result;
} }
...@@ -50,8 +52,8 @@ void external_trading_deque::reset_bot_and_top() { ...@@ -50,8 +52,8 @@ void external_trading_deque::reset_bot_and_top() {
bot_internal_.value_ = 0; bot_internal_.value_ = 0;
bot_internal_.stamp_++; bot_internal_.stamp_++;
bot_.store(0); bot_.store(0, std::memory_order_release);
top_.store({bot_internal_.stamp_, 0}); top_.store({bot_internal_.stamp_, 0}, std::memory_order_release);
} }
task *external_trading_deque::pop_bot() { task *external_trading_deque::pop_bot() {
...@@ -83,11 +85,11 @@ task *external_trading_deque::pop_bot() { ...@@ -83,11 +85,11 @@ task *external_trading_deque::pop_bot() {
} }
external_trading_deque::peek_result external_trading_deque::peek_top() { external_trading_deque::peek_result external_trading_deque::peek_top() {
auto local_top = top_.load(); auto local_top = top_.load(std::memory_order_acquire);
auto local_bot = bot_.load(); auto local_bot = bot_.load(std::memory_order_acquire);
if (local_top.value_ < local_bot) { if (local_top.value_ < local_bot) {
return peek_result{entries_[local_top.value_].traded_task_, local_top}; return peek_result{entries_[local_top.value_].traded_task_.load(std::memory_order_relaxed), local_top};
} else { } else {
return peek_result{nullptr, local_top}; return peek_result{nullptr, local_top};
} }
...@@ -95,7 +97,7 @@ external_trading_deque::peek_result external_trading_deque::peek_top() { ...@@ -95,7 +97,7 @@ external_trading_deque::peek_result external_trading_deque::peek_top() {
task *external_trading_deque::pop_top(task *offered_task, peek_result peek_result) { task *external_trading_deque::pop_top(task *offered_task, peek_result peek_result) {
stamped_integer expected_top = peek_result.top_pointer_; stamped_integer expected_top = peek_result.top_pointer_;
auto local_bot = bot_.load(); auto local_bot = bot_.load(std::memory_order_acquire);
if (expected_top.value_ >= local_bot) { if (expected_top.value_ >= local_bot) {
return nullptr; return nullptr;
} }
...@@ -103,8 +105,8 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul ...@@ -103,8 +105,8 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
auto &target_entry = entries_[expected_top.value_]; auto &target_entry = entries_[expected_top.value_];
// Read our potential result // Read our potential result
task *result = target_entry.traded_task_.load(); task *result = target_entry.traded_task_.load(std::memory_order_relaxed);
unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load(); unsigned long forwarding_stamp = target_entry.forwarding_stamp_.load(std::memory_order_relaxed);
if (result == nullptr) { if (result == nullptr) {
return nullptr; return nullptr;
...@@ -112,7 +114,7 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul ...@@ -112,7 +114,7 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
if (forwarding_stamp != expected_top.stamp_) { if (forwarding_stamp != expected_top.stamp_) {
// ...we failed because the top tag lags behind...try to fix it. // ...we failed because the top tag lags behind...try to fix it.
// This means only updating the tag, as this location can still hold data we need. // This means only updating the tag, as this location can still hold data we need.
top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_}); top_.compare_exchange_strong(expected_top, {forwarding_stamp, expected_top.value_}, std::memory_order_relaxed);
return nullptr; return nullptr;
} }
...@@ -123,16 +125,20 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul ...@@ -123,16 +125,20 @@ task *external_trading_deque::pop_top(task *offered_task, peek_result peek_resul
traded_cas_field offered_field = expected_sync_cas_field; traded_cas_field offered_field = expected_sync_cas_field;
offered_field.fill_with_task(offered_task->thread_id_); offered_field.fill_with_task(offered_task->thread_id_);
if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, offered_field)) { if (result->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
offered_field,
std::memory_order_acq_rel)) {
// We got it, for sure move the top pointer forward. // We got it, for sure move the top pointer forward.
top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1}); top_.compare_exchange_strong(expected_top,
{expected_top.stamp_ + 1, expected_top.value_ + 1},
std::memory_order_acq_rel);
return result; return result;
} else { } else {
// TODO: Re-Check this condition for forwarding the stamp! Should only happen if another top-stealer took the
// slot that we where interested in!
if (expected_sync_cas_field.is_filled_with_object() && expected_sync_cas_field.get_stamp() == expected_top.stamp_ if (expected_sync_cas_field.is_filled_with_object() && expected_sync_cas_field.get_stamp() == expected_top.stamp_
&& expected_sync_cas_field.get_trade_request_thread_id() == thread_id_) { && expected_sync_cas_field.get_trade_request_thread_id() == thread_id_) {
top_.compare_exchange_strong(expected_top, {expected_top.stamp_ + 1, expected_top.value_ + 1}); top_.compare_exchange_strong(expected_top,
{expected_top.stamp_ + 1, expected_top.value_ + 1},
std::memory_order_relaxed);
} }
return nullptr; return nullptr;
} }
......
...@@ -15,14 +15,12 @@ void task::prepare_for_push(unsigned int pushing_thread_id) { ...@@ -15,14 +15,12 @@ void task::prepare_for_push(unsigned int pushing_thread_id) {
} }
bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
num_resources_++;
PLS_ASSERT(this->thread_id_ != spare_task_chain->thread_id_, PLS_ASSERT(this->thread_id_ != spare_task_chain->thread_id_,
"Makes no sense to push task onto itself, as it is not clean by definition."); "Makes no sense to push task onto itself, as it is not clean by definition.");
PLS_ASSERT(this->depth_ == spare_task_chain->depth_, PLS_ASSERT(this->depth_ == spare_task_chain->depth_,
"Must only push tasks with correct depth."); "Must only push tasks with correct depth.");
data_structures::stamped_integer current_root; data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
data_structures::stamped_integer target_root; data_structures::stamped_integer target_root;
data_structures::stamped_integer expected_next_field; data_structures::stamped_integer expected_next_field;
...@@ -30,10 +28,8 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { ...@@ -30,10 +28,8 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
expected_next_field.stamp_ = pushing_thread_id + 1; expected_next_field.stamp_ = pushing_thread_id + 1;
expected_next_field.value_ = 0; expected_next_field.value_ = 0;
int iteration = 0;
do { do {
iteration++; // current_root implicitly re-loaded by CAS in loop
current_root = this->resource_stack_root_.load();
target_root.stamp_ = current_root.stamp_ + 1; target_root.stamp_ = current_root.stamp_ + 1;
target_root.value_ = spare_task_chain->thread_id_ + 1; target_root.value_ = spare_task_chain->thread_id_ + 1;
...@@ -50,53 +46,49 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) { ...@@ -50,53 +46,49 @@ bool task::push_task_chain(task *spare_task_chain, unsigned pushing_thread_id) {
target_next_field.value_ = current_root_task->thread_id_ + 1; target_next_field.value_ = current_root_task->thread_id_ + 1;
} }
if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field, target_next_field)) { if (!spare_task_chain->resource_stack_next_.compare_exchange_strong(expected_next_field,
num_resources_--; target_next_field,
std::memory_order_relaxed)) {
return false; return false;
} else { } else {
expected_next_field = target_next_field; expected_next_field = target_next_field;
} }
} while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root)); } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel));
return true; return true;
} }
void task::reset_task_chain(task *expected_content) { void task::reset_task_chain(task *expected_content) {
num_resources_--; data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
data_structures::stamped_integer current_root = this->resource_stack_root_.load();
PLS_ASSERT(current_root.value_ == expected_content->thread_id_ + 1, PLS_ASSERT(current_root.value_ == expected_content->thread_id_ + 1,
"Must only reset the task chain if we exactly know its state! (current_root.value_)"); "Must only reset the task chain if we exactly know its state! (current_root.value_)");
data_structures::stamped_integer target_root; data_structures::stamped_integer target_root;
target_root.stamp_ = current_root.stamp_ + 1; target_root.stamp_ = current_root.stamp_ + 1;
bool success = this->resource_stack_root_.compare_exchange_strong(current_root, target_root); this->resource_stack_root_.store(target_root, std::memory_order_relaxed);
PLS_ASSERT(success, "Must always succeed in resetting the chain, as we must be the sole one operating on it!");
} }
task *task::pop_task_chain() { task *task::pop_task_chain() {
data_structures::stamped_integer current_root; data_structures::stamped_integer current_root = this->resource_stack_root_.load(std::memory_order_relaxed);
data_structures::stamped_integer target_root; data_structures::stamped_integer target_root;
task *output_task; task *output_task;
do { do {
current_root = this->resource_stack_root_.load(); // current_root implicitly re-loaded by CAS in loop
if (current_root.value_ == 0) { if (current_root.value_ == 0) {
// Empty... // Empty...
return nullptr; return nullptr;
} else { } else {
// Found something, try to pop it // Found something, try to pop it
auto *current_root_task = find_task(current_root.value_ - 1, this->depth_); auto *current_root_task = find_task(current_root.value_ - 1, this->depth_);
auto next_stack_cas = current_root_task->resource_stack_next_.load(); auto next_stack_cas = current_root_task->resource_stack_next_.load(std::memory_order_relaxed);
target_root.stamp_ = current_root.stamp_ + 1; target_root.stamp_ = current_root.stamp_ + 1;
target_root.value_ = next_stack_cas.value_; target_root.value_ = next_stack_cas.value_;
output_task = current_root_task; output_task = current_root_task;
} }
} while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root)); } while (!this->resource_stack_root_.compare_exchange_strong(current_root, target_root, std::memory_order_acq_rel));
PLS_ASSERT(num_resources_.fetch_add(-1) > 0, "Must only return an task from the chain if there are items!");
output_task->resource_stack_next_.store({0, 0}); output_task->resource_stack_next_.store({0, 0});
return output_task; return output_task;
......
...@@ -44,7 +44,7 @@ base_task *task_manager::pop_local_task() { ...@@ -44,7 +44,7 @@ base_task *task_manager::pop_local_task() {
std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state &stealing_state) { std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state &stealing_state) {
PLS_ASSERT(stealing_state.get_active_task()->depth_ == 0, "Must only steal with clean task chain."); PLS_ASSERT(stealing_state.get_active_task()->depth_ == 0, "Must only steal with clean task chain.");
PLS_ASSERT(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain."); PLS_ASSERT_EXPENSIVE(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain.");
auto peek = deque_.peek_top(); auto peek = deque_.peek_top();
if (peek.top_task_) { if (peek.top_task_) {
...@@ -83,7 +83,6 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state ...@@ -83,7 +83,6 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state
return std::tuple{stolen_task, chain_after_stolen_task, true}; return std::tuple{stolen_task, chain_after_stolen_task, true};
} else { } else {
// TODO: traded task resource_stack_next_ field is de-marked from being mine
return std::tuple{nullptr, nullptr, false}; return std::tuple{nullptr, nullptr, false};
} }
} else { } else {
...@@ -94,15 +93,17 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state ...@@ -94,15 +93,17 @@ std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state
base_task *task_manager::pop_clean_task_chain(base_task *base_task) { base_task *task_manager::pop_clean_task_chain(base_task *base_task) {
task *target_task = static_cast<task *>(base_task); task *target_task = static_cast<task *>(base_task);
traded_cas_field peeked_task_cas_before, peeked_task_cas_after;
peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task);
while (true) { while (true) {
// Try to get a clean resource chain to go back to the main stealing loop // Try to get a clean resource chain to go back to the main stealing loop
auto peeked_task_cas_before = external_trading_deque::peek_traded_object(target_task); peeked_task_cas_before = peeked_task_cas_after;
task *pop_result = target_task->pop_task_chain(); task *pop_result = target_task->pop_task_chain();
if (pop_result) { if (pop_result) {
PLS_ASSERT(scheduler::check_task_chain_backward(*pop_result), "Must only pop proper task chains."); PLS_ASSERT(scheduler::check_task_chain_backward(*pop_result), "Must only pop proper task chains.");
return pop_result; // Got something, so we are simply done here return pop_result; // Got something, so we are simply done here
} }
auto peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task); peeked_task_cas_after = external_trading_deque::peek_traded_object(target_task);
if (peeked_task_cas_before != peeked_task_cas_after) { if (peeked_task_cas_before != peeked_task_cas_after) {
continue; continue;
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
#include "pls/internal/scheduling/strain_local_resource.h" #include "pls/internal/scheduling/strain_local_resource.h"
#include "pls/internal/build_flavour.h" #include "pls/internal/build_flavour.h"
#include "pls/internal/base/error_handling.h" #include "pls/internal/base/error_handling.h"
#include "pls/internal/base/futex_wrapper.h"
#include <thread> #include <thread>
...@@ -57,7 +58,7 @@ void scheduler::work_thread_work_section() { ...@@ -57,7 +58,7 @@ void scheduler::work_thread_work_section() {
#if PLS_PROFILING_ENABLED #if PLS_PROFILING_ENABLED
my_state.get_scheduler().profiler_.stealing_start(my_state.get_thread_id()); my_state.get_scheduler().profiler_.stealing_start(my_state.get_thread_id());
#endif #endif
PLS_ASSERT(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain."); PLS_ASSERT_EXPENSIVE(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain.");
size_t target; size_t target;
do { do {
...@@ -91,7 +92,7 @@ void scheduler::work_thread_work_section() { ...@@ -91,7 +92,7 @@ void scheduler::work_thread_work_section() {
auto *stolen_resources = stolen_task->attached_resources_.load(std::memory_order_relaxed); auto *stolen_resources = stolen_task->attached_resources_.load(std::memory_order_relaxed);
strain_local_resource::acquire_locally(stolen_resources, my_state.get_thread_id()); strain_local_resource::acquire_locally(stolen_resources, my_state.get_thread_id());
PLS_ASSERT(check_task_chain_forward(*my_state.get_active_task()), PLS_ASSERT_EXPENSIVE(check_task_chain_forward(*my_state.get_active_task()),
"We are sole owner of this chain, it has to be valid!"); "We are sole owner of this chain, it has to be valid!");
// Execute the stolen task by jumping to it's continuation. // Execute the stolen task by jumping to it's continuation.
...@@ -117,12 +118,12 @@ void scheduler::work_thread_work_section() { ...@@ -117,12 +118,12 @@ void scheduler::work_thread_work_section() {
my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), false); my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), false);
#endif #endif
#if PLS_SLEEP_WORKERS_ON_EMPTY #if PLS_SLEEP_WORKERS_ON_EMPTY
switch (target_queue_empty_flag.value) { switch (target_queue_empty_flag.value_) {
case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: { case EMPTY_QUEUE_STATE::QUEUE_NON_EMPTY: {
// We found the queue empty, but the flag says it should still be full. // We found the queue empty, but the flag says it should still be full.
// We want to declare it empty, bet we need to re-check the queue in a sub-step to avoid races. // We want to declare it empty, bet we need to re-check the queue in a sub-step to avoid races.
data_structures::stamped_integer data_structures::stamped_integer
maybe_empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY}; maybe_empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_MAYBE_EMPTY};
if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag,
maybe_empty_flag)) { maybe_empty_flag)) {
goto queue_empty_flag_retry_steal; goto queue_empty_flag_retry_steal;
...@@ -133,7 +134,7 @@ void scheduler::work_thread_work_section() { ...@@ -133,7 +134,7 @@ void scheduler::work_thread_work_section() {
// We found the queue empty and it was already marked as maybe empty. // We found the queue empty and it was already marked as maybe empty.
// We can safely mark it empty and increment the central counter. // We can safely mark it empty and increment the central counter.
data_structures::stamped_integer data_structures::stamped_integer
empty_flag{target_queue_empty_flag.stamp + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY}; empty_flag{target_queue_empty_flag.stamp_ + 1, EMPTY_QUEUE_STATE::QUEUE_EMPTY};
if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, empty_flag)) { if (target_state.get_queue_empty_flag().compare_exchange_strong(target_queue_empty_flag, empty_flag)) {
// We marked it empty, now its our duty to modify the central counter // We marked it empty, now its our duty to modify the central counter
my_state.get_scheduler().empty_queue_increase_counter(); my_state.get_scheduler().empty_queue_increase_counter();
...@@ -216,8 +217,8 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat ...@@ -216,8 +217,8 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat
"Resources must only reside in the correct depth!"); "Resources must only reside in the correct depth!");
PLS_ASSERT(last_task != clean_chain, PLS_ASSERT(last_task != clean_chain,
"We want to swap out the last task and its chain to use a clean one, thus they must differ."); "We want to swap out the last task and its chain to use a clean one, thus they must differ.");
PLS_ASSERT(check_task_chain_backward(*clean_chain), PLS_ASSERT_EXPENSIVE(check_task_chain_backward(*clean_chain),
"Can only acquire clean chains for clean returns!"); "Can only acquire clean chains for clean returns!");
// Acquire it/merge it with our task chain. // Acquire it/merge it with our task chain.
this_task->prev_ = clean_chain; this_task->prev_ = clean_chain;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment