diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
index 4c7f3fe..cfef00b 100644
--- a/app/benchmark_fft/CMakeLists.txt
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -1,5 +1,5 @@
-add_executable(benchmark_fft_pls_v2 main.cpp)
-target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
+add_executable(benchmark_fft_pls_v3 main.cpp)
+target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
 if (EASY_PROFILER)
-    target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
+    target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
 endif ()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index 0a34b30..64f2915 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
 
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 4;
+constexpr int MAX_STACK_SIZE = 1024 * 64;
 
 static_scheduler_memory {
 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 1;
+constexpr int MAX_STACK_SIZE = 1024 * 4;
 
 static_scheduler_memory a;
diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake
index 5d22958..57fcd59 100644
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
     # but inlining functions and SIMD/Vectorization is
     # only enabled by -O3, thus it's way faster in some
     # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
     set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
diff --git a/lib/context_switcher/include/context_switcher/context_switcher.h b/lib/context_switcher/include/context_switcher/context_switcher.h
index 6bb4886..3bcc795 100644
--- a/lib/context_switcher/include/context_switcher/context_switcher.h
+++ b/lib/context_switcher/include/context_switcher/context_switcher.h
@@ -111,6 +111,9 @@ void lambda_capture_callback(fcontext::transfer_t transfer) {
   lambda_capture->~T();
 
   continuation_t cont_pointer = cont.consume();
+  if (cont_pointer == nullptr) {
+    printf("Error!!!\n");
+  }
   fcontext::jump_fcontext(cont_pointer, (void *) 0);
 }
diff --git a/lib/context_switcher/include/context_switcher/continuation.h b/lib/context_switcher/include/context_switcher/continuation.h
index ff6c31c..3c196c0 100644
--- a/lib/context_switcher/include/context_switcher/continuation.h
+++ b/lib/context_switcher/include/context_switcher/continuation.h
@@ -50,10 +50,6 @@ struct continuation {
   }
 
   continuation_t consume() {
-    if (cont_pointer_ == nullptr) {
-      printf("Error!\n");
-    }
-
     auto tmp = cont_pointer_;
     cont_pointer_ = nullptr;
     return tmp;
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index e120f2e..f7f8232 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(pls STATIC
         include/pls/internal/scheduling/task_manager_impl.h
         include/pls/internal/scheduling/static_scheduler_memory.h
         include/pls/internal/scheduling/heap_scheduler_memory.h
-        src/internal/scheduling/task_manager.cpp)
+        src/internal/scheduling/task_manager.cpp src/internal/scheduling/thread_state.cpp)
 
 # Dependencies for pls
 target_link_libraries(pls Threads::Threads)
diff --git a/lib/pls/include/pls/internal/base/error_handling.h b/lib/pls/include/pls/internal/base/error_handling.h
index bdb6466..8704cc8 100644
--- a/lib/pls/include/pls/internal/base/error_handling.h
+++ b/lib/pls/include/pls/internal/base/error_handling.h
@@ -16,6 +16,6 @@ void pls_error(const char *msg);
 
 // TODO: Distinguish between debug/internal asserts and production asserts.
-#define PLS_ASSERT(cond, msg) // if (!(cond)) { pls_error(msg); }
+#define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
 
 #endif //PLS_ERROR_HANDLING_H
diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h
index 165ed3e..222d127 100644
--- a/lib/pls/include/pls/internal/base/system_details.h
+++ b/lib/pls/include/pls/internal/base/system_details.h
@@ -67,6 +67,24 @@ inline void relax_cpu() {
 #endif
 }
 
+/**
+ * Prevent inlining of functions. This is a compiler-specific setting and
+ * it is treated as an error if we cannot declare it properly.
+ * (Some functions in the codebase MUST be re-evaluated after fiber switches;
+ * preventing inlining keeps the compiler from caching their results.)
+ */
+#if defined(_MSC_VER)
+#define PLS_NOINLINE __declspec(noinline)
+#elif defined(__GNUC__) && __GNUC__ > 3
+#if defined(__CUDACC__)
+#define PLS_NOINLINE __attribute__ ((noinline))
+#else
+#define PLS_NOINLINE __attribute__ ((__noinline__))
+#endif
+#else
+#error "PLS requires inline prevention for certain functions."
+#endif
+
 }
 }
 }
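A note on the PLS_NOINLINE rationale above: a function that resolves per-worker state must be re-evaluated after every fiber switch, because the fiber may resume on a different worker thread. The following minimal sketch is illustrative only and not part of this patch; worker_state, current_worker_state and fiber_yield are hypothetical stand-ins for thread_state, thread_state::get() and a context switch back to the scheduling loop.

    // Illustrative sketch: why a per-worker state lookup must stay out of line.
    #if !defined(PLS_NOINLINE)
    #define PLS_NOINLINE __attribute__ ((__noinline__))  // GCC/Clang branch of the macro above
    #endif

    #include <cstdio>

    struct worker_state { unsigned id; };

    // Hypothetical stand-in for thread_state::get(): reads a thread-local pointer.
    // Keeping it out of line (and opaque) forces the compiler to re-evaluate it
    // at every call site instead of caching the result in a register.
    PLS_NOINLINE worker_state &current_worker_state();

    // Hypothetical stand-in for a fiber context switch; the fiber may resume
    // on a different worker thread afterwards.
    void fiber_yield();

    void work() {
      unsigned before = current_worker_state().id;  // e.g. worker 0
      fiber_yield();                                // fiber migrates to another worker
      unsigned after = current_worker_state().id;   // must be re-read, may now differ
      std::printf("%u -> %u\n", before, after);
    }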
diff --git a/lib/pls/include/pls/internal/scheduling/external_trading_deque.h b/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
index 2f589c6..4fc6b1e 100644
--- a/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
+++ b/lib/pls/include/pls/internal/scheduling/external_trading_deque.h
@@ -78,20 +78,20 @@ class external_trading_deque {
     auto expected_stamp = bot_internal_.stamp;
     auto &current_entry = entries_[bot_internal_.value];
 
+    // Publish the prepared task in the deque.
+    current_entry.forwarding_stamp_.store(expected_stamp, std::memory_order_relaxed);
+    current_entry.traded_task_.store(published_task, std::memory_order_relaxed);
+
     // Field that all threads synchronize on.
     // This happens not in the deque itself, but in the published task.
     traded_cas_field sync_cas_field;
     sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
-    published_task->external_trading_deque_cas_.store(sync_cas_field);
-
-    // Publish the prepared task in the deque.
-    current_entry.forwarding_stamp_.store(expected_stamp);
-    current_entry.traded_task_.store(published_task);
+    published_task->external_trading_deque_cas_.store(sync_cas_field, std::memory_order_release);
 
     // Advance the bot pointer. Linearization point for making the task public.
     bot_internal_.stamp++;
     bot_internal_.value++;
-    bot_.store(bot_internal_.value);
+    bot_.store(bot_internal_.value, std::memory_order_release);
   }
 
   void reset_bot_and_top() {
@@ -104,7 +104,7 @@ class external_trading_deque {
 
   void decrease_bot() {
     bot_internal_.value--;
-    bot_.store(bot_internal_.value);
+    bot_.store(bot_internal_.value, std::memory_order_relaxed);
   }
 
   /**
@@ -120,15 +120,17 @@ class external_trading_deque {
     decrease_bot();
 
     auto &current_entry = entries_[bot_internal_.value];
-    auto *popped_task = current_entry.traded_task_.load();
-    auto expected_stamp = current_entry.forwarding_stamp_.load();
+    auto *popped_task = current_entry.traded_task_.load(std::memory_order_relaxed);
+    auto expected_stamp = current_entry.forwarding_stamp_.load(std::memory_order_relaxed);
 
     // We know what value must be in the cas field if no other thread stole it.
     traded_cas_field expected_sync_cas_field;
     expected_sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
     traded_cas_field empty_cas_field;
 
-    if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, empty_cas_field)) {
+    if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
+                                                                         empty_cas_field,
+                                                                         std::memory_order_acq_rel)) {
      return optional{popped_task};
     } else {
       reset_bot_and_top();
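The ordering change above follows the usual publish pattern: the deque entry is filled with relaxed stores, and the subsequent release store (on the CAS field and on bot_) is what makes those writes visible to a thief that reads with acquire semantics. A minimal standalone sketch of that pairing, illustrative only and using plain std::atomic instead of the pls deque types:

    // Illustrative publish/steal ordering sketch (not part of the patch).
    #include <atomic>
    #include <cstdio>
    #include <thread>

    struct payload { int value = 0; };

    payload slot;                        // analogous to the deque entry
    std::atomic<bool> published{false};  // analogous to the release-store on bot_ / the CAS field

    void owner() {
      slot.value = 42;                                   // plain write, like the relaxed entry stores
      published.store(true, std::memory_order_release);  // release: everything above becomes visible
    }

    void thief() {
      while (!published.load(std::memory_order_acquire)) { /* spin */ }
      // The acquire load pairs with the release store, so the entry write is visible here.
      std::printf("%d\n", slot.value);
    }

    int main() {
      std::thread a(owner), b(thief);
      a.join();
      b.join();
    }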
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index 45abc65..9510505 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -27,10 +27,11 @@ class scheduler::init_function_impl : public init_function {
   void run() override {
     auto &root_task = thread_state::get().get_task_manager().get_active_task();
     root_task.run_as_task([&](context_switcher::continuation cont) {
-      thread_state::get().set_main_continuation(std::move(cont));
+      thread_state::get().main_continuation() = std::move(cont);
       function_();
       thread_state::get().get_scheduler().work_section_done_.store(true);
-      return std::move(thread_state::get().get_main_continuation());
+      PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");
+      return std::move(thread_state::get().main_continuation());
     });
   }
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h
index 9670d82..8b473a6 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager.h
@@ -52,7 +52,7 @@ class task_manager {
   void spawn_child(F &&lambda);
   void sync();
 
-  task* steal_task(task_manager &stealing_task_manager);
+  task *steal_task(task_manager &stealing_task_manager);
 
   bool try_clean_return(context_switcher::continuation &result_cont);
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
index 31d06d2..4c4d740 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
@@ -71,7 +71,7 @@ void task_manager::spawn_child(F &&lambda) {
 
       if (continuation.valid()) {
         // We jumped in here from the main loop, keep track!
-        thread_state::get().set_main_continuation(std::move(continuation));
+        thread_state::get().main_continuation() = std::move(continuation);
       }
     }
diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h
index 74fe5d9..329ff91 100644
--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -6,6 +6,8 @@
 #include
 #include
 
+#include "pls/internal/base/system_details.h"
+
 #include "context_switcher/continuation.h"
 
 namespace pls {
@@ -37,9 +39,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
    * Must only be called on threads that are associated with a thread_state,
    * this will most likely be threads created by the scheduler.
    *
+   * Each call is guaranteed to be a new lookup, i.e. it is not cached after fiber context switches.
+   *
    * @return The thread_state of this thread.
    */
-  static thread_state &get() { return *base::this_thread::state(); }
+  static thread_state &PLS_NOINLINE get();
 
   unsigned get_id() { return id_; }
   void set_id(unsigned id) {
@@ -54,11 +58,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
     return random_();
   }
 
-  void set_main_continuation(context_switcher::continuation &&continuation) {
-    main_loop_continuation_ = std::move(continuation);
-  }
-  context_switcher::continuation get_main_continuation() {
-    return std::move(main_loop_continuation_);
+  context_switcher::continuation &main_continuation() {
+    return main_loop_continuation_;
   }
 
   // Do not allow move/copy operations.
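The thread_state change above replaces the move-out getter with a reference accessor, so call sites can check validity before explicitly moving the continuation out (which is what the new PLS_ASSERTs rely on). A minimal sketch of the two idioms, illustrative only and using a simplified continuation-like type rather than the real context_switcher::continuation:

    // Illustrative sketch of the accessor change (not part of the patch).
    #include <cassert>
    #include <utility>

    struct cont {
      void *ptr = nullptr;
      cont() = default;
      explicit cont(void *p) : ptr(p) {}
      cont(cont &&other) noexcept : ptr(other.ptr) { other.ptr = nullptr; }
      cont &operator=(cont &&other) noexcept { ptr = other.ptr; other.ptr = nullptr; return *this; }
      bool valid() const { return ptr != nullptr; }
    };

    struct state {
      cont main_loop_continuation_;

      // Old style: returns by value and moves the member out immediately,
      // so the caller cannot inspect it before consuming it.
      cont get_main_continuation() { return std::move(main_loop_continuation_); }

      // New style: hands out a reference; storing, asserting and moving out
      // become explicit, separate steps at the call site.
      cont &main_continuation() { return main_loop_continuation_; }
    };

    int main() {
      state s;
      s.main_continuation() = cont{&s};           // store
      assert(s.main_continuation().valid());      // check without consuming
      cont c = std::move(s.main_continuation());  // consume explicitly
      assert(c.valid());
    }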
diff --git a/lib/pls/src/internal/scheduling/task_manager.cpp b/lib/pls/src/internal/scheduling/task_manager.cpp
index 2c73fe1..eeff837 100644
--- a/lib/pls/src/internal/scheduling/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/task_manager.cpp
@@ -1,5 +1,3 @@
-#include
-
 #include "pls/internal/scheduling/task_manager.h"
 
 #include "pls/internal/scheduling/task.h"
@@ -76,7 +74,6 @@ task *task_manager::steal_task(task_manager &stealing_task_manager) {
 }
 
 void task_manager::push_resource_on_task(task *target_task, task *spare_task_chain) {
-  PLS_ASSERT(check_task_chain_backward(spare_task_chain), "Must only push proper task chains.");
   PLS_ASSERT(target_task->thread_id_ != spare_task_chain->thread_id_,
              "Makes no sense to push task onto itself, as it is not clean by definition.");
   PLS_ASSERT(target_task->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth.");
@@ -90,11 +87,11 @@ void task_manager::push_resource_on_task(task *target_task, task *spare_task_cha
 
     if (current_root.value == 0) {
       // Empty, simply push in with no successor
-      spare_task_chain->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
+      spare_task_chain->resource_stack_next_.store(nullptr);
     } else {
       // Already an entry. Find it's corresponding task and set it as our successor.
       auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
-      spare_task_chain->resource_stack_next_.store(current_root_task, std::memory_order_relaxed);
+      spare_task_chain->resource_stack_next_.store(current_root_task);
     }
 
   } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
@@ -112,7 +109,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
     } else {
       // Found something, try to pop it
       auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
-      auto *next_stack_task = current_root_task->resource_stack_next_.load(std::memory_order_relaxed);
+      auto *next_stack_task = current_root_task->resource_stack_next_.load();
 
       target_root.stamp = current_root.stamp + 1;
       target_root.value = next_stack_task != nullptr ? next_stack_task->thread_id_ + 1 : 0;
@@ -122,7 +119,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
   } while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
 
   PLS_ASSERT(check_task_chain_backward(output_task), "Must only pop proper task chains.");
-  output_task->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
+  output_task->resource_stack_next_.store(nullptr);
 
   return output_task;
 }
@@ -187,7 +184,8 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
     }
 
     // jump back to the continuation in main scheduling loop, time to steal some work
-    result_cont = thread_state::get().get_main_continuation();
+    result_cont = std::move(thread_state::get().main_continuation());
+    PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
     return true;
   } else {
     // Make sure that we are owner fo this full continuation/task chain.
@@ -198,13 +196,16 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
     active_task_ = last_task;
 
     result_cont = std::move(last_task->continuation_);
+    PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
     return false;
   }
 }
 
 bool task_manager::check_task_chain_forward(task *start_task) {
   while (start_task->next_ != nullptr) {
-    PLS_ASSERT(start_task->next_->prev_ == start_task, "Chain must have correct prev/next fields for linked list!");
+    if (start_task->next_->prev_ != start_task) {
+      return false;
+    }
     start_task = start_task->next_;
   }
   return true;
@@ -212,17 +213,16 @@ bool task_manager::check_task_chain_forward(task *start_task) {
 
 bool task_manager::check_task_chain_backward(task *start_task) {
   while (start_task->prev_ != nullptr) {
-    PLS_ASSERT(start_task->prev_->next_ == start_task, "Chain must have correct prev/next fields for linked list!");
+    if (start_task->prev_->next_ != start_task) {
+      return false;
+    }
     start_task = start_task->prev_;
   }
   return true;
 }
 
 bool task_manager::check_task_chain() {
-  check_task_chain_backward(active_task_);
-  check_task_chain_forward(active_task_);
-
-  return true;
+  return check_task_chain_backward(active_task_) && check_task_chain_forward(active_task_);
 }
 
 }
diff --git a/lib/pls/src/internal/scheduling/thread_state.cpp b/lib/pls/src/internal/scheduling/thread_state.cpp
new file mode 100644
index 0000000..fe70562
--- /dev/null
+++ b/lib/pls/src/internal/scheduling/thread_state.cpp
@@ -0,0 +1,12 @@
+#include "pls/internal/scheduling/thread_state.h"
+#include "pls/internal/base/thread.h"
+
+namespace pls {
+namespace internal {
+namespace scheduling {
+
+thread_state &thread_state::get() { return *base::this_thread::state(); }
+
+}
+}
+}
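With PLS_ASSERT active again (see error_handling.h above), the chain validators now report a result instead of asserting internally, so the caller decides where a violated invariant is fatal. A minimal standalone sketch of that pattern, illustrative only, with a simplified node type and a local assert-style macro standing in for the pls ones:

    // Illustrative sketch (not part of the patch): a validator that reports a broken
    // invariant via its return value, checked by an assert-style macro at the call
    // site, mirroring how check_task_chain_backward() is used inside PLS_ASSERT.
    #include <cstdio>
    #include <cstdlib>

    #define SKETCH_ASSERT(cond, msg) if (!(cond)) { std::fprintf(stderr, "%s\n", msg); std::abort(); }

    struct node {
      node *prev_ = nullptr;
      node *next_ = nullptr;
    };

    // Returns false instead of aborting, so it composes with asserts and with
    // regular control flow alike.
    bool chain_backward_ok(node *start) {
      while (start->prev_ != nullptr) {
        if (start->prev_->next_ != start) {
          return false;
        }
        start = start->prev_;
      }
      return true;
    }

    int main() {
      node a, b;
      a.next_ = &b;
      b.prev_ = &a;
      SKETCH_ASSERT(chain_backward_ok(&b), "Must only pop proper task chains.");
      std::printf("chain ok\n");
    }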