Commit 3c60e8d7 by FritzFlorian

First working version on both ARM and x86.

parent 731b47c5
Pipeline #1403 failed with stages
in 39 seconds
add_executable(benchmark_fft_pls_v2 main.cpp) add_executable(benchmark_fft_pls_v3 main.cpp)
target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_fft_pls_v2 easy_profiler) target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
endif () endif ()
...@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) { ...@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
constexpr int MAX_NUM_THREADS = 8; constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 4; constexpr int MAX_STACK_SIZE = 1024 * 64;
static_scheduler_memory<MAX_NUM_THREADS, static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS, MAX_NUM_TASKS,
...@@ -49,7 +49,7 @@ int main(int argc, char **argv) { ...@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
benchmark_runner::read_args(argc, argv, num_threads, directory); benchmark_runner::read_args(argc, argv, num_threads, directory);
string test_name = to_string(num_threads) + ".csv"; string test_name = to_string(num_threads) + ".csv";
string full_directory = directory + "/PLS_v2/"; string full_directory = directory + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name}; benchmark_runner runner{full_directory, test_name};
fft::complex_vector data = fft::generate_input(); fft::complex_vector data = fft::generate_input();
......
add_executable(benchmark_fib_pls_v2 main.cpp) add_executable(benchmark_fib_pls_v3 main.cpp)
target_link_libraries(benchmark_fib_pls_v2 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_fib_pls_v2 easy_profiler) target_link_libraries(benchmark_fib_pls_v3 easy_profiler)
endif () endif ()
...@@ -33,7 +33,7 @@ int pls_fib(int n) { ...@@ -33,7 +33,7 @@ int pls_fib(int n) {
constexpr int MAX_NUM_THREADS = 8; constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 1; constexpr int MAX_STACK_SIZE = 1024 * 4;
static_scheduler_memory<MAX_NUM_THREADS, static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS, MAX_NUM_TASKS,
......
add_executable(benchmark_matrix_pls_v2 main.cpp) add_executable(benchmark_matrix_pls_v3 main.cpp)
target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base) target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER) if (EASY_PROFILER)
target_link_libraries(benchmark_matrix_pls_v2 easy_profiler) target_link_libraries(benchmark_matrix_pls_v3 easy_profiler)
endif () endif ()
...@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> { ...@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
constexpr int MAX_NUM_THREADS = 8; constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32; constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 1; constexpr int MAX_STACK_SIZE = 1024 * 4;
static_scheduler_memory<MAX_NUM_THREADS, static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS, MAX_NUM_TASKS,
...@@ -35,7 +35,7 @@ int main(int argc, char **argv) { ...@@ -35,7 +35,7 @@ int main(int argc, char **argv) {
benchmark_runner::read_args(argc, argv, num_threads, directory); benchmark_runner::read_args(argc, argv, num_threads, directory);
string test_name = to_string(num_threads) + ".csv"; string test_name = to_string(num_threads) + ".csv";
string full_directory = directory + "/PLS_v2/"; string full_directory = directory + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name}; benchmark_runner runner{full_directory, test_name};
pls_matrix<double, matrix::MATRIX_SIZE> a; pls_matrix<double, matrix::MATRIX_SIZE> a;
......
...@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") ...@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is # but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some # only enabled by -O3, thus it's way faster in some
# array calculations. # array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else () else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
...@@ -111,6 +111,9 @@ void lambda_capture_callback(fcontext::transfer_t transfer) { ...@@ -111,6 +111,9 @@ void lambda_capture_callback(fcontext::transfer_t transfer) {
lambda_capture->~T(); lambda_capture->~T();
continuation_t cont_pointer = cont.consume(); continuation_t cont_pointer = cont.consume();
if (cont_pointer == nullptr) {
printf("Error!!!\n");
}
fcontext::jump_fcontext(cont_pointer, (void *) 0); fcontext::jump_fcontext(cont_pointer, (void *) 0);
} }
......
...@@ -50,10 +50,6 @@ struct continuation { ...@@ -50,10 +50,6 @@ struct continuation {
} }
continuation_t consume() { continuation_t consume() {
if (cont_pointer_ == nullptr) {
printf("Error!\n");
}
auto tmp = cont_pointer_; auto tmp = cont_pointer_;
cont_pointer_ = nullptr; cont_pointer_ = nullptr;
return tmp; return tmp;
......
...@@ -40,7 +40,7 @@ add_library(pls STATIC ...@@ -40,7 +40,7 @@ add_library(pls STATIC
include/pls/internal/scheduling/task_manager_impl.h include/pls/internal/scheduling/task_manager_impl.h
include/pls/internal/scheduling/static_scheduler_memory.h include/pls/internal/scheduling/static_scheduler_memory.h
include/pls/internal/scheduling/heap_scheduler_memory.h include/pls/internal/scheduling/heap_scheduler_memory.h
src/internal/scheduling/task_manager.cpp) src/internal/scheduling/task_manager.cpp src/internal/scheduling/thread_state.cpp)
# Dependencies for pls # Dependencies for pls
target_link_libraries(pls Threads::Threads) target_link_libraries(pls Threads::Threads)
......
...@@ -16,6 +16,6 @@ ...@@ -16,6 +16,6 @@
void pls_error(const char *msg); void pls_error(const char *msg);
// TODO: Distinguish between debug/internal asserts and production asserts. // TODO: Distinguish between debug/internal asserts and production asserts.
#define PLS_ASSERT(cond, msg) // if (!(cond)) { pls_error(msg); } #define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
#endif //PLS_ERROR_HANDLING_H #endif //PLS_ERROR_HANDLING_H
...@@ -67,6 +67,24 @@ inline void relax_cpu() { ...@@ -67,6 +67,24 @@ inline void relax_cpu() {
#endif #endif
} }
/**
 * PLS_NOINLINE: marks a function so that the compiler never inlines it.
 *
 * The spelling of this request is compiler specific, so exactly one of the
 * branches below must match; building with an unknown compiler is a hard error.
 *
 * Rationale: certain functions in the codebase MUST be evaluated again after a
 * fiber switch. Keeping them out of line stops the compiler from caching
 * their results across such a switch.
 */
#if defined(_MSC_VER)
// Microsoft compiler spelling.
#define PLS_NOINLINE __declspec(noinline)
#elif defined(__GNUC__) && __GNUC__ > 3 && defined(__CUDACC__)
// GNU-compatible front end used together with CUDA: plain attribute name.
// NOTE(review): presumably needed because CUDA treats `__noinline__` specially - confirm.
#define PLS_NOINLINE __attribute__ ((noinline))
#elif defined(__GNUC__) && __GNUC__ > 3
// Any other GNU-compatible compiler: reserved (double underscore) attribute name.
#define PLS_NOINLINE __attribute__ ((__noinline__))
#else
#error "PLS requires inline prevention for certain functions."
#endif
} }
} }
} }
......
...@@ -78,20 +78,20 @@ class external_trading_deque { ...@@ -78,20 +78,20 @@ class external_trading_deque {
auto expected_stamp = bot_internal_.stamp; auto expected_stamp = bot_internal_.stamp;
auto &current_entry = entries_[bot_internal_.value]; auto &current_entry = entries_[bot_internal_.value];
// Publish the prepared task in the deque.
current_entry.forwarding_stamp_.store(expected_stamp, std::memory_order_relaxed);
current_entry.traded_task_.store(published_task, std::memory_order_relaxed);
// Field that all threads synchronize on. // Field that all threads synchronize on.
// This happens not in the deque itself, but in the published task. // This happens not in the deque itself, but in the published task.
traded_cas_field sync_cas_field; traded_cas_field sync_cas_field;
sync_cas_field.fill_with_stamp(expected_stamp, thread_id_); sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
published_task->external_trading_deque_cas_.store(sync_cas_field); published_task->external_trading_deque_cas_.store(sync_cas_field, std::memory_order_release);
// Publish the prepared task in the deque.
current_entry.forwarding_stamp_.store(expected_stamp);
current_entry.traded_task_.store(published_task);
// Advance the bot pointer. Linearization point for making the task public. // Advance the bot pointer. Linearization point for making the task public.
bot_internal_.stamp++; bot_internal_.stamp++;
bot_internal_.value++; bot_internal_.value++;
bot_.store(bot_internal_.value); bot_.store(bot_internal_.value, std::memory_order_release);
} }
void reset_bot_and_top() { void reset_bot_and_top() {
...@@ -104,7 +104,7 @@ class external_trading_deque { ...@@ -104,7 +104,7 @@ class external_trading_deque {
void decrease_bot() { void decrease_bot() {
bot_internal_.value--; bot_internal_.value--;
bot_.store(bot_internal_.value); bot_.store(bot_internal_.value, std::memory_order_relaxed);
} }
/** /**
...@@ -120,15 +120,17 @@ class external_trading_deque { ...@@ -120,15 +120,17 @@ class external_trading_deque {
decrease_bot(); decrease_bot();
auto &current_entry = entries_[bot_internal_.value]; auto &current_entry = entries_[bot_internal_.value];
auto *popped_task = current_entry.traded_task_.load(); auto *popped_task = current_entry.traded_task_.load(std::memory_order_relaxed);
auto expected_stamp = current_entry.forwarding_stamp_.load(); auto expected_stamp = current_entry.forwarding_stamp_.load(std::memory_order_relaxed);
// We know what value must be in the cas field if no other thread stole it. // We know what value must be in the cas field if no other thread stole it.
traded_cas_field expected_sync_cas_field; traded_cas_field expected_sync_cas_field;
expected_sync_cas_field.fill_with_stamp(expected_stamp, thread_id_); expected_sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
traded_cas_field empty_cas_field; traded_cas_field empty_cas_field;
if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, empty_cas_field)) { if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
empty_cas_field,
std::memory_order_acq_rel)) {
return optional<task *>{popped_task}; return optional<task *>{popped_task};
} else { } else {
reset_bot_and_top(); reset_bot_and_top();
......
...@@ -27,10 +27,11 @@ class scheduler::init_function_impl : public init_function { ...@@ -27,10 +27,11 @@ class scheduler::init_function_impl : public init_function {
void run() override { void run() override {
auto &root_task = thread_state::get().get_task_manager().get_active_task(); auto &root_task = thread_state::get().get_task_manager().get_active_task();
root_task.run_as_task([&](context_switcher::continuation cont) { root_task.run_as_task([&](context_switcher::continuation cont) {
thread_state::get().set_main_continuation(std::move(cont)); thread_state::get().main_continuation() = std::move(cont);
function_(); function_();
thread_state::get().get_scheduler().work_section_done_.store(true); thread_state::get().get_scheduler().work_section_done_.store(true);
return std::move(thread_state::get().get_main_continuation()); PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");
return std::move(thread_state::get().main_continuation());
}); });
} }
......
...@@ -52,7 +52,7 @@ class task_manager { ...@@ -52,7 +52,7 @@ class task_manager {
void spawn_child(F &&lambda); void spawn_child(F &&lambda);
void sync(); void sync();
task* steal_task(task_manager &stealing_task_manager); task *steal_task(task_manager &stealing_task_manager);
bool try_clean_return(context_switcher::continuation &result_cont); bool try_clean_return(context_switcher::continuation &result_cont);
......
...@@ -71,7 +71,7 @@ void task_manager::spawn_child(F &&lambda) { ...@@ -71,7 +71,7 @@ void task_manager::spawn_child(F &&lambda) {
if (continuation.valid()) { if (continuation.valid()) {
// We jumped in here from the main loop, keep track! // We jumped in here from the main loop, keep track!
thread_state::get().set_main_continuation(std::move(continuation)); thread_state::get().main_continuation() = std::move(continuation);
} }
} }
......
...@@ -6,6 +6,8 @@ ...@@ -6,6 +6,8 @@
#include <chrono> #include <chrono>
#include <utility> #include <utility>
#include "pls/internal/base/system_details.h"
#include "context_switcher/continuation.h" #include "context_switcher/continuation.h"
namespace pls { namespace pls {
...@@ -37,9 +39,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state { ...@@ -37,9 +39,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
* Must only be called on threads that are associated with a thread_state, * Must only be called on threads that are associated with a thread_state,
* this will most likely be threads created by the scheduler. * this will most likely be threads created by the scheduler.
* *
* Each call is guaranteed to be a new lookup, i.e. it is not cached after fiber context switches.
*
* @return The thread_state of this thread. * @return The thread_state of this thread.
*/ */
static thread_state &get() { return *base::this_thread::state<thread_state>(); } static thread_state &PLS_NOINLINE get();
unsigned get_id() { return id_; } unsigned get_id() { return id_; }
void set_id(unsigned id) { void set_id(unsigned id) {
...@@ -54,11 +58,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state { ...@@ -54,11 +58,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
return random_(); return random_();
} }
void set_main_continuation(context_switcher::continuation &&continuation) { context_switcher::continuation &main_continuation() {
main_loop_continuation_ = std::move(continuation); return main_loop_continuation_;
}
context_switcher::continuation get_main_continuation() {
return std::move(main_loop_continuation_);
} }
// Do not allow move/copy operations. // Do not allow move/copy operations.
......
#include <tuple>
#include "pls/internal/scheduling/task_manager.h" #include "pls/internal/scheduling/task_manager.h"
#include "pls/internal/scheduling/task.h" #include "pls/internal/scheduling/task.h"
...@@ -76,7 +74,6 @@ task *task_manager::steal_task(task_manager &stealing_task_manager) { ...@@ -76,7 +74,6 @@ task *task_manager::steal_task(task_manager &stealing_task_manager) {
} }
void task_manager::push_resource_on_task(task *target_task, task *spare_task_chain) { void task_manager::push_resource_on_task(task *target_task, task *spare_task_chain) {
PLS_ASSERT(check_task_chain_backward(spare_task_chain), "Must only push proper task chains.");
PLS_ASSERT(target_task->thread_id_ != spare_task_chain->thread_id_, PLS_ASSERT(target_task->thread_id_ != spare_task_chain->thread_id_,
"Makes no sense to push task onto itself, as it is not clean by definition."); "Makes no sense to push task onto itself, as it is not clean by definition.");
PLS_ASSERT(target_task->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth."); PLS_ASSERT(target_task->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth.");
...@@ -90,11 +87,11 @@ void task_manager::push_resource_on_task(task *target_task, task *spare_task_cha ...@@ -90,11 +87,11 @@ void task_manager::push_resource_on_task(task *target_task, task *spare_task_cha
if (current_root.value == 0) { if (current_root.value == 0) {
// Empty, simply push in with no successor // Empty, simply push in with no successor
spare_task_chain->resource_stack_next_.store(nullptr, std::memory_order_relaxed); spare_task_chain->resource_stack_next_.store(nullptr);
} else { } else {
// Already an entry. Find its corresponding task and set it as our successor. // Already an entry. Find its corresponding task and set it as our successor.
auto *current_root_task = find_task(current_root.value - 1, target_task->depth_); auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
spare_task_chain->resource_stack_next_.store(current_root_task, std::memory_order_relaxed); spare_task_chain->resource_stack_next_.store(current_root_task);
} }
} while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root)); } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
...@@ -112,7 +109,7 @@ task *task_manager::pop_resource_from_task(task *target_task) { ...@@ -112,7 +109,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
} else { } else {
// Found something, try to pop it // Found something, try to pop it
auto *current_root_task = find_task(current_root.value - 1, target_task->depth_); auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
auto *next_stack_task = current_root_task->resource_stack_next_.load(std::memory_order_relaxed); auto *next_stack_task = current_root_task->resource_stack_next_.load();
target_root.stamp = current_root.stamp + 1; target_root.stamp = current_root.stamp + 1;
target_root.value = next_stack_task != nullptr ? next_stack_task->thread_id_ + 1 : 0; target_root.value = next_stack_task != nullptr ? next_stack_task->thread_id_ + 1 : 0;
...@@ -122,7 +119,7 @@ task *task_manager::pop_resource_from_task(task *target_task) { ...@@ -122,7 +119,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
} while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root)); } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
PLS_ASSERT(check_task_chain_backward(output_task), "Must only pop proper task chains."); PLS_ASSERT(check_task_chain_backward(output_task), "Must only pop proper task chains.");
output_task->resource_stack_next_.store(nullptr, std::memory_order_relaxed); output_task->resource_stack_next_.store(nullptr);
return output_task; return output_task;
} }
...@@ -187,7 +184,8 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont) ...@@ -187,7 +184,8 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
} }
// jump back to the continuation in main scheduling loop, time to steal some work // jump back to the continuation in main scheduling loop, time to steal some work
result_cont = thread_state::get().get_main_continuation(); result_cont = std::move(thread_state::get().main_continuation());
PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
return true; return true;
} else { } else {
// Make sure that we are owner of this full continuation/task chain. // Make sure that we are owner of this full continuation/task chain.
...@@ -198,13 +196,16 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont) ...@@ -198,13 +196,16 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
active_task_ = last_task; active_task_ = last_task;
result_cont = std::move(last_task->continuation_); result_cont = std::move(last_task->continuation_);
PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
return false; return false;
} }
} }
bool task_manager::check_task_chain_forward(task *start_task) { bool task_manager::check_task_chain_forward(task *start_task) {
while (start_task->next_ != nullptr) { while (start_task->next_ != nullptr) {
PLS_ASSERT(start_task->next_->prev_ == start_task, "Chain must have correct prev/next fields for linked list!"); if (start_task->next_->prev_ != start_task) {
return false;
}
start_task = start_task->next_; start_task = start_task->next_;
} }
return true; return true;
...@@ -212,17 +213,16 @@ bool task_manager::check_task_chain_forward(task *start_task) { ...@@ -212,17 +213,16 @@ bool task_manager::check_task_chain_forward(task *start_task) {
bool task_manager::check_task_chain_backward(task *start_task) { bool task_manager::check_task_chain_backward(task *start_task) {
while (start_task->prev_ != nullptr) { while (start_task->prev_ != nullptr) {
PLS_ASSERT(start_task->prev_->next_ == start_task, "Chain must have correct prev/next fields for linked list!"); if (start_task->prev_->next_ != start_task) {
return false;
}
start_task = start_task->prev_; start_task = start_task->prev_;
} }
return true; return true;
} }
bool task_manager::check_task_chain() { bool task_manager::check_task_chain() {
check_task_chain_backward(active_task_); return check_task_chain_backward(active_task_) && check_task_chain_forward(active_task_);
check_task_chain_forward(active_task_);
return true;
} }
} }
......
#include "pls/internal/scheduling/thread_state.h"
#include "pls/internal/base/thread.h"
namespace pls {
namespace internal {
namespace scheduling {
// Out-of-line definition of thread_state::get().
// Asks base::this_thread for the state object registered for the calling
// thread and hands it back as a reference. The function lives in its own
// translation unit (it is declared PLS_NOINLINE in the header) so that every
// call performs this lookup again.
thread_state &thread_state::get() {
  auto current_state = base::this_thread::state<thread_state>();
  return *current_state;
}
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment