From 740ae66161ea56787df9902b95a51c606833455a Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Wed, 6 Nov 2019 13:03:06 +0100 Subject: [PATCH] WIP: Sketch continuation and task class. This first sketch of the classes captures what we think is needed in terms of general interface and is very much WIP. --- NOTES.md | 38 ++++++++++++++++++++++++++++++++++---- README.md | 14 +++++++++++--- cmake/SetupAssemblyOutput.cmake | 6 ++++++ cmake/SetupOptimizationLevel.cmake | 4 ++-- lib/pls/CMakeLists.txt | 7 ++++--- lib/pls/include/pls/internal/data_structures/aligned_stack.h | 14 ++++++++++---- lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h | 5 +++-- lib/pls/include/pls/internal/data_structures/delayed_initialization_wrapper.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/internal/scheduling/cont_manager.h | 17 ++++++++++++++--- lib/pls/include/pls/internal/scheduling/continuation.h | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/pls/include/pls/internal/scheduling/lambda_task.h | 47 ----------------------------------------------- lib/pls/include/pls/internal/scheduling/scheduler_memory.h | 8 ++++++-- lib/pls/include/pls/internal/scheduling/task.h | 79 ++++--------------------------------------------------------------------------- lib/pls/include/pls/internal/scheduling/task_manager.h | 40 +++++++++++++++++++++++++++++++++++++--- lib/pls/include/pls/internal/scheduling/thread_state.h | 12 ++++++++---- lib/pls/src/internal/data_structures/aligned_stack.cpp | 4 +++- lib/pls/src/internal/scheduling/task.cpp | 40 ---------------------------------------- 17 files changed, 292 insertions(+), 193 deletions(-) create mode 100644 cmake/SetupAssemblyOutput.cmake create mode 100644 lib/pls/include/pls/internal/data_structures/delayed_initialization_wrapper.h create mode 100644 lib/pls/include/pls/internal/scheduling/continuation.h delete mode 100644 
lib/pls/include/pls/internal/scheduling/lambda_task.h diff --git a/NOTES.md b/NOTES.md index 4ac9407..3a5e5e3 100644 --- a/NOTES.md +++ b/NOTES.md @@ -1,8 +1,38 @@ -# Notes +# Notes on Continuation/Parent Stealing Implementation -A collection of stuff that we noticed during development. -Useful later on two write a project report and to go back -in time to find out why certain decisions where made. +The new version of pls uses a more complicated/less user-friendly +API in favor of performance and memory guarantees. +For the old version refer to the second half of this document. + +# 05.11.2019 - Memory Allocation, 'Fat Objects' + +We change our memory allocation for all memory the scheduler requires +from allocating buffers (char* arrays) separate from the actual data +structures to 'fat data structures' that use templating to create +an object that actually holds all the data. This allows us to more +simply add fields to manage tasks and continuations, as we do not +need to change the scheduler_memory (adding additional buffers), but +only have to add the fields directly to the container objects. + +# 04.11.2019 - Memory Allocation and Initialization + +Our framework tries to be explicit on how and where memory is allocated. +In any production build of the framework we will only use fixed size +memory pools/blocks to manage all data structures required by the +scheduler, as this property is our main research goal. + +Nevertheless, we want to offer different options on where to allocate +these fixed pools. Some people might prefer to store them on the stack, +some to store them in heap memory, and others might want to place them +into memory managed by custom allocators. Currently we support a stack +based 'fat' object and a heap based memory object that stores each +thread's state in a vector (could be changed to lists in the future +to avoid the one big memory block allocated by the vector). 
+ +# Notes on Blocking/Child Stealing Implementation + +Notes on the child stealing implementation of pls. +This corresponds to tag v0.1. ## 02.08.2019 - Ideas for sleeping threads when no work is available diff --git a/README.md b/README.md index 122e457..4577263 100644 --- a/README.md +++ b/README.md @@ -76,12 +76,11 @@ long fib(long n) { ``` - ## Project Structure The project uses [CMAKE](https://cmake.org/) as it's build system, the recommended IDE is either a simple text editor or [CLion](https://www.jetbrains.com/clion/). -We divide the project into subtargets to separate for the library +We divide the project into sub-targets to separate for the library itself, testing and example code. The library itself can be found in `lib/pls`, testing related code is in `test`, example and playground apps are in `app`. @@ -114,11 +113,16 @@ Available Settings: - Enables thread/datarace sanitizer to be linked to the executable - Only one sanitizer can be active at once - Enabling has a performance hit (do not use in releases) -- `-DDEBUG_SYMBOLS=ON` +- `-DDEBUG_SYMBOLS=ON/OFF` - default OFF - Enables the build with debug symbols - Use for e.g. profiling the release build +Note that these settings are persistent for one CMake build folder. +If you e.g. set a flag in the debug build it will not influence +the release build, but it will persist in the debug build folder +until you explicitly change it back. + ### Testing Testing is done using [Catch2](https://github.com/catchorg/Catch2/) @@ -167,4 +171,8 @@ For detailed profiling of small performance hotspots we prefer to use [Intel's VTune Amplifier](https://software.intel.com/en-us/vtune). It gives insights in detailed microachitecture usage and performance hotspots. Follow the instructions by Intel for using it. +Make sure to enable debug symbols (`-DDEBUG_SYMBOLS=ON`) in the +analyzed build and that all optimizations are turned on +(by choosing the release build). 
+ diff --git a/cmake/SetupAssemblyOutput.cmake b/cmake/SetupAssemblyOutput.cmake new file mode 100644 index 0000000..916806d --- /dev/null +++ b/cmake/SetupAssemblyOutput.cmake @@ -0,0 +1,6 @@ +option(ASSEMBLY_OUTPUT "Enable output of assembly files when building" OFF) +if (ASSEMBLY_OUTPUT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -save-temps -Wa,-ahldn=assembly.asm -fverbose-asm -g") +endif () + +message("-- Assembly Output: ${ASSEMBLY_OUTPUT}") diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake index 1f4031a..05c18cc 100644 --- a/cmake/SetupOptimizationLevel.cmake +++ b/cmake/SetupOptimizationLevel.cmake @@ -13,12 +13,12 @@ message("-- Using Build Type: " ${CMAKE_BUILD_TYPE}) # Enable optimizations in release builds if (CMAKE_BUILD_TYPE STREQUAL "Release") # Link time optimization - set(CMAKE_CXX_FLAGS "-Wall -Wextra") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") # -O2 is often seen as 'the most speed', # but inlining functions and SIMD/Vectorization is # only enabled by -O3, thus it's way faster in some # array calculations. 
- set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native") + set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else () set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 7ae1366..4b70932 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -39,6 +39,7 @@ add_library(pls STATIC include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp include/pls/internal/data_structures/aligned_stack_impl.h include/pls/internal/data_structures/stamped_integer.h + include/pls/internal/data_structures/delayed_initialization_wrapper.h include/pls/internal/helpers/prohibit_new.h include/pls/internal/helpers/profiler.h @@ -51,11 +52,11 @@ add_library(pls STATIC include/pls/internal/scheduling/thread_state.h include/pls/internal/scheduling/scheduler.h src/internal/scheduling/scheduler.cpp include/pls/internal/scheduling/scheduler_impl.h - include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp include/pls/internal/scheduling/scheduler_memory.h - include/pls/internal/scheduling/lambda_task.h include/pls/internal/scheduling/task_manager.h - include/pls/internal/scheduling/cont_manager.h) + include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp + include/pls/internal/scheduling/cont_manager.h + include/pls/internal/scheduling/continuation.h) # Add everything in `./include` to be in the include path of this project target_include_directories(pls diff --git a/lib/pls/include/pls/internal/data_structures/aligned_stack.h b/lib/pls/include/pls/internal/data_structures/aligned_stack.h index 328b230..b4de92c 100644 --- a/lib/pls/include/pls/internal/data_structures/aligned_stack.h +++ b/lib/pls/include/pls/internal/data_structures/aligned_stack.h @@ -35,11 +35,9 @@ class aligned_stack { public: typedef size_t stack_offset; - protected: aligned_stack(char *memory_pointer, size_t size); aligned_stack(char *memory_pointer, 
size_t size, size_t original_size); - public: template T *push(ARGS &&... args); template @@ -64,18 +62,26 @@ class aligned_stack { }; template -class static_aligned_stack : public aligned_stack { +class static_aligned_stack { public: static_aligned_stack(); + aligned_stack &get_stack() { return aligned_stack_; } private: alignas(base::system_details::CACHE_LINE_SIZE) std::array memory_; + aligned_stack aligned_stack_; /* declared after memory_: members are constructed in declaration order, not initializer-list order */ }; -class heap_aligned_stack : public aligned_stack { +class heap_aligned_stack { public: explicit heap_aligned_stack(size_t size); ~heap_aligned_stack(); + + aligned_stack &get_stack() { return aligned_stack_; } + + private: + char *unaligned_memory_pointer_; /* must precede aligned_stack_, whose initializer reads this pointer */ + aligned_stack aligned_stack_; }; } diff --git a/lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h b/lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h index 78c198c..e5de0d6 100644 --- a/lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h +++ b/lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h @@ -28,10 +28,11 @@ void aligned_stack::pop() { } template -static_aligned_stack::static_aligned_stack(): aligned_stack{memory_.data()} {}; +static_aligned_stack::static_aligned_stack(): memory_{}, aligned_stack_{memory_.data()} {}; heap_aligned_stack::heap_aligned_stack(size_t size) : - aligned_stack{new char[base::alignment::next_alignment(size)], size, base::alignment::next_alignment(size)} {} + unaligned_memory_pointer_{new char[base::alignment::next_alignment(size)]}, + aligned_stack_{unaligned_memory_pointer_, size, base::alignment::next_alignment(size)} {} heap_aligned_stack::~heap_aligned_stack() { delete[] unaligned_memory_pointer_; diff --git a/lib/pls/include/pls/internal/data_structures/delayed_initialization_wrapper.h b/lib/pls/include/pls/internal/data_structures/delayed_initialization_wrapper.h new file mode 100644 index 0000000..1d93398 --- /dev/null +++ 
b/lib/pls/include/pls/internal/data_structures/delayed_initialization_wrapper.h @@ -0,0 +1,57 @@ + +#ifndef PLS_INTERNAL_DATA_STRUCTURES_DELAYED_INITIALIZATION_WRAPPER_H_ +#define PLS_INTERNAL_DATA_STRUCTURES_DELAYED_INITIALIZATION_WRAPPER_H_ + +#include <array> +#include <utility> + +#include "pls/internal/base/error_handling.h" + +namespace pls { +namespace internal { +namespace data_structures { + +/** + * Reserves suitably sized and aligned storage for a member of type T without constructing it. + * The member must be constructed exactly once before use, either through the + * perfect forwarding constructor or through initialize(). + * + * Calls the wrapped object's destructor on destruction if (and only if) an object was constructed. + */ +template<typename T> +class delayed_initialization_wrapper { + public: + delayed_initialization_wrapper() : memory_{}, initialized_{false} {} + template<typename ...ARGS> + explicit delayed_initialization_wrapper(ARGS &&...args): memory_{}, initialized_{true} { + new(memory_.data()) T(std::forward<ARGS>(args)...); + } + + ~delayed_initialization_wrapper() { + if (initialized_) { + reinterpret_cast<T *>(memory_.data())->~T(); + } + } + + template<typename ...ARGS> + void initialize(ARGS &&...args) { + PLS_ASSERT(!initialized_, "Can only initialize delayed wrapper object once!") + + new(memory_.data()) T(std::forward<ARGS>(args)...); + initialized_ = true; + } + T &object() { + PLS_ASSERT(initialized_, "Can not use an uninitialized delayed wrapper object!") + return *reinterpret_cast<T *>(memory_.data()); + } + + private: + alignas(T) std::array<char, sizeof(T)> memory_; + bool initialized_; +}; + +} +} +} + +#endif // PLS_INTERNAL_DATA_STRUCTURES_DELAYED_INITIALIZATION_WRAPPER_H_ diff --git a/lib/pls/include/pls/internal/scheduling/cont_manager.h b/lib/pls/include/pls/internal/scheduling/cont_manager.h index 5d567aa..104337f 100644 --- a/lib/pls/include/pls/internal/scheduling/cont_manager.h +++ b/lib/pls/include/pls/internal/scheduling/cont_manager.h @@ -5,6 +5,7 @@ #include #include "pls/internal/data_structures/aligned_stack.h" +#include "pls/internal/scheduling/continuation.h" namespace pls { namespace internal { @@ -12,15 +13,25 @@ namespace scheduling { 
class cont_manager { public: - explicit cont_manager() = default; + explicit cont_manager(size_t num_conts, size_t max_cont_size, data_structures::aligned_stack &cont_storage) + : num_conts_{num_conts}, max_cont_size_{max_cont_size}, cont_storage_{cont_storage} { + //TODO: Init linked list like structure + }; private: - // TODO: Add attributes + const size_t num_conts_, max_cont_size_; + data_structures::aligned_stack &cont_storage_; }; template -class static_cont_manager : public cont_manager { +class static_cont_manager { + public: + static_cont_manager() : static_cont_storage_{}, cont_manager_{NUM_CONTS, MAX_CONT_SIZE, static_cont_storage_.get_stack()} {} + cont_manager &get_cont_manager() { return cont_manager_; } + private: + data_structures::static_aligned_stack static_cont_storage_; + cont_manager cont_manager_; }; } diff --git a/lib/pls/include/pls/internal/scheduling/continuation.h b/lib/pls/include/pls/internal/scheduling/continuation.h new file mode 100644 index 0000000..f9a2420 --- /dev/null +++ b/lib/pls/include/pls/internal/scheduling/continuation.h @@ -0,0 +1,93 @@ + +#ifndef PLS_INTERNAL_SCHEDULING_CONTINUATION_H_ +#define PLS_INTERNAL_SCHEDULING_CONTINUATION_H_ + +#include +#include +#include + +#include "pls/internal/data_structures/delayed_initialization_wrapper.h" + +namespace pls { +namespace internal { +namespace scheduling { + +class base_continuation { + public: + virtual void run() = 0; + virtual ~base_continuation() = default; +}; + +class continuation_node { + public: + + private: + // Pointer to memory region reserved for the companion continuation. + // Must be a buffer big enough to hold any continuation encountered in the program. + base_continuation *continuation_; + + // Linked list property of continuations (continuation chains as memory management). + // Each continuation knows its chain start to allow stealing a whole chain in O(1) + // without the need to traverse back to the chain start. 
+ continuation_node *cont_chain_start_; + continuation_node *prev_, *next_; + + // When blocked on this continuation, we need to know what other chain we + // got offered by the stealing thread. + // For this we need only the head of the other chain (as each continuation is a + // self describing entity for its chain up to the given node). + continuation_node *offered_chain_; +}; + +template +class continuation : public base_continuation { + public: + void run() override { + // TODO: integrate this better into the runtime system. + // E.g. handle passing the result to the parent continuation + function_.object()(result_1_.object(), result_2_.object()); + } + ~continuation() override = default; + + template + void store_result_1_(R1ARG &&result_1) { + static_assert(std::is_same::type>::value, + "must only copy/move objects in, not construct them"); + result_1_.initialize(std::forward<R1ARG>(result_1)); + } + + template + void store_result_2(R2ARG &&result_2) { + static_assert(std::is_same::type>::value, + "must only copy/move objects in, not construct them"); + result_2_.initialize(std::forward<R2ARG>(result_2)); + } + + template + void store_function(FARG &&function) { + static_assert(std::is_same::type>::value, + "must only copy/move objects in, not construct them"); + function_.initialize(std::forward<FARG>(function)); + } + + private: + // We plan to only init the members for a continuation on the slow path. + // If we can execute everything inline we simply skip it saving runtime overhead. + template + using delayed_init = data_structures::delayed_initialization_wrapper; + + delayed_init result_1_; + delayed_init result_2_; + delayed_init function_; + + // Also uninitialized at first, only take the atomic write on the slow path. + // The stealer will init it to 2 while stealing, the 'stolen' sync will then make sure + // everyone sees the value in correct order. 
+ std::atomic results_missing_{}; +}; + +} +} +} + +#endif //PLS_INTERNAL_SCHEDULING_CONTINUATION_H_ diff --git a/lib/pls/include/pls/internal/scheduling/lambda_task.h b/lib/pls/include/pls/internal/scheduling/lambda_task.h deleted file mode 100644 index 1d3d4b1..0000000 --- a/lib/pls/include/pls/internal/scheduling/lambda_task.h +++ /dev/null @@ -1,47 +0,0 @@ - -#ifndef PLS_LAMBDA_TASK_H_ -#define PLS_LAMBDA_TASK_H_ - -#include "pls/internal/scheduling/task.h" - -namespace pls { -namespace internal { -namespace scheduling { - -template -class lambda_task_by_reference : public task { - const Function &function_; - - public: - explicit lambda_task_by_reference(const Function &function) : task{}, function_{function} {}; - - protected: - void execute_internal() override { - function_(); - - wait_for_all(); - this->~lambda_task_by_reference(); - } -}; - -template -class lambda_task_by_value : public task { - const Function function_; - - public: - explicit lambda_task_by_value(const Function &function) : task{}, function_{function} {}; - - protected: - void execute_internal() override { - function_(); - - wait_for_all(); - this->~lambda_task_by_value(); - } -}; - -} -} -} - -#endif //PLS_LAMBDA_TASK_H_ diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h index 4cccb68..f7e88d4 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h @@ -13,6 +13,10 @@ namespace scheduling { void worker_routine(); class scheduler_memory { + // Note: scheduler_memory is a pure interface and has no data. + // By not having an initialization routine we can do our 'static and heap specialization' + // without running into any ordering problems in the initialization sequence. + // We first worried about performance of this being virtual. 
// However, we decided that only thread_state_for is used during the // runtime and that only when stealing. As stealing is expensive anyways, @@ -35,7 +39,7 @@ class static_scheduler_memory : public scheduler_memory { } thread_state &thread_state_for(size_t id) const override { - return thread_states_[id]; + return thread_states_[id].get_thread_state(); } private: @@ -69,7 +73,7 @@ class heap_scheduler_memory : public scheduler_memory { } thread_state &thread_state_for(size_t id) const override { - return thread_state_vector_[id].object(); + return thread_state_vector_[id].object().get_thread_state(); } private: diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h index 343bf86..46abf95 100644 --- a/lib/pls/include/pls/internal/scheduling/task.h +++ b/lib/pls/include/pls/internal/scheduling/task.h @@ -2,9 +2,7 @@ #ifndef PLS_TASK_H #define PLS_TASK_H -#include "pls/internal/helpers/profiler.h" - -#include "pls/internal/data_structures/aligned_stack.h" +#include "pls/internal/scheduling/task_manager.h" #include "pls/internal/scheduling/thread_state.h" namespace pls { @@ -16,42 +14,21 @@ namespace scheduling { * Tasks are guaranteed to be executed exactly once. * * Override the execute_internal() method for your custom code. - * - * IMPORTANT: - * Tasks memory is re-used without calling the destructor. - * You must call it yourself at the end of execute_internal(). - * This is done to not introduce any overhead of virtual function calls - * if no clean up is required. 
*/ class task { friend class scheduler; - // Memory-Management (allow to allocate memory blocks in constructor) - bool finished_construction_; - - // Coordinate finishing of sub_tasks - std::atomic ref_count_; - task *parent_; - - // Stack Management (reset stack pointer after wait_for_all() calls) - data_structures::deque_offset deque_offset_; + // TODO: Add ref to continuation + task_manager::task_manager_state task_manager_state_; protected: - /* - * Must call the parent constructor. - * - * IMPORTANT: - * Tasks memory is re-used without calling the destructor. - * You must call it yourself at the end of execute_internal(). - */ explicit task(); /** * Allow to allocate extra memory during run-time for this task. * Memory will be pushed onto the stack (in aligned memory, thus avoid many small chunks). - * MUST be called in constructor, never afterwards. * - * Memory is fully self managed. Calling e.g. deconstructors when not needing objects + * Memory is fully self managed. Calling e.g. de-constructors when not needing objects * anymore is the users responsibility (memory is simply re-used after the life time of the task ends). * * @param size Number of bytes to be allocated @@ -64,58 +41,10 @@ class task { */ virtual void execute_internal() = 0; - template - void spawn_child(ARGS &&... args); - template - void spawn_child_and_wait(ARGS &&... args); - void wait_for_all(); - private: void execute(); }; -template -void task::spawn_child(ARGS &&... 
args) { - PROFILE_FORK_JOIN_STEALING("spawn_child") - static_assert(std::is_base_of::type>::value, "Only pass task subclasses!"); - - // Keep our refcount up to date - ref_count_++; - - // Push on our deque - auto item = thread_state::get()->deque_.push_task(std::forward(args)...); - - // Assign forced values (for stack and parent management) - item->parent_ = this; - item->finished_construction_ = true; - item->deque_offset_ = thread_state::get()->deque_.save_offset(); - - // Make new task visible to others - thread_state::get()->deque_.publish_last_task(); -} - -template -void task::spawn_child_and_wait(ARGS &&... args) { - static_assert(std::is_base_of::type>::value, "Only pass task subclasses!"); - - spawn_child(std::forward(args)...); - // TODO: Check why 'direct spawn' (even when pushing it onto the tas queue) seems to be slower - // (Also check if it even is slower or if it only appears so on our laptop) -// // Push on our deque -// auto task = thread_state::get()->deque_.push_task(std::forward(args)...); -// -// // Assign forced values (for stack and parent management) -// task->parent_ = nullptr; // ...do not assign this to a parent => it will not notify our reference counter -// task->finished_construction_ = true; -// task->deque_offset_ = thread_state::get()->deque_.save_offset(); -// -// // Execute it -// task->execute(); - - // Wait for the rest of the tasks - wait_for_all(); -} - } } } diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h index 8ec2294..7a0aeb6 100644 --- a/lib/pls/include/pls/internal/scheduling/task_manager.h +++ b/lib/pls/include/pls/internal/scheduling/task_manager.h @@ -10,8 +10,40 @@ namespace pls { namespace internal { namespace scheduling { +// TODO: Remove forward references +class task; + +/** + * Handles management of tasks in the system. Each thread has a local task manager, + * responsible for allocating, freeing and publishing tasks for stealing. 
+ * + * The manager therefore acts as the deque found in work stealing, as well as the memory + * management for the tasks (as both are deeply intertwined in our implementation to + * integrate the memory management into the stealing procedure). + */ class task_manager { - protected: + using task_manager_offset = data_structures::aligned_stack::stack_offset; + + public: + // Data each task needs to store to enable the 'return_task' functionality. + using task_manager_state = task_manager_offset; + + // Construct a task onto the stack. Stores the previous offset in the newly constructed task. + template + T *push_task(ARGS ...args); + // Publishes a task on the stack, i.e. makes it visible for other threads to steal. + void publish_task(task *task); + // Return a no longer needed task to the stack. Must be the current top-most task (will reset the stack pointer). + void return_task(task *task); + + // Try to pop a local task from this task manager's stack. + task *pop_local_task(); + // Try to steal a task from a remote task_manager instance. + // The returned task pointer is valid during the lifetime of the task. + // The returned task pointer must be returned to this task_manager instance. 
+ // (This is because we can either decide to just steal a remote task pointer or to copy the whole task) + task *pop_remote_task(task_manager &other); + explicit task_manager(data_structures::aligned_stack &task_stack) : task_stack_{task_stack} {} private: @@ -19,12 +51,14 @@ class task_manager { }; template -class static_task_manager : public task_manager { +class static_task_manager { public: - static_task_manager() : task_manager{static_task_stack_} {}; + static_task_manager() : static_task_stack_{}, task_manager_{static_task_stack_.get_stack()} {}; + task_manager &get_task_manager() { return task_manager_; } private: data_structures::static_aligned_stack static_task_stack_; + task_manager task_manager_; }; } diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h index 30dcf97..eaba7f3 100644 --- a/lib/pls/include/pls/internal/scheduling/thread_state.h +++ b/lib/pls/include/pls/internal/scheduling/thread_state.h @@ -29,7 +29,7 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state { alignas(base::system_details::CACHE_LINE_SIZE) task *current_task_; alignas(base::system_details::CACHE_LINE_SIZE) std::minstd_rand random_; - protected: + public: thread_state(task_manager &task_manager, cont_manager &cont_manager) : scheduler_{nullptr}, @@ -39,7 +39,6 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state { current_task_{nullptr}, random_{static_cast(std::chrono::steady_clock::now().time_since_epoch().count())} {} - public: /** * Convenience helper to get the thread_state instance associated with this thread. 
* Must only be called on threads that are associated with a thread_state, @@ -60,13 +59,18 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state { }; template -struct static_thread_state : public thread_state { +struct static_thread_state { public: - static_thread_state() : thread_state{static_task_manager_, static_cont_manager_} {} + static_thread_state() + : static_task_manager_{}, + static_cont_manager_{}, + thread_state_{static_task_manager_.get_task_manager(), static_cont_manager_.get_cont_manager()} {} + thread_state &get_thread_state() { return thread_state_; } private: static_task_manager static_task_manager_; static_cont_manager static_cont_manager_; + thread_state thread_state_; }; } diff --git a/lib/pls/src/internal/data_structures/aligned_stack.cpp b/lib/pls/src/internal/data_structures/aligned_stack.cpp index 519e795..12fab84 100644 --- a/lib/pls/src/internal/data_structures/aligned_stack.cpp +++ b/lib/pls/src/internal/data_structures/aligned_stack.cpp @@ -6,6 +6,7 @@ namespace internal { namespace data_structures { aligned_stack::aligned_stack(char *memory_pointer, size_t size) : + unaligned_memory_pointer_{memory_pointer}, memory_pointer_{memory_pointer}, // MUST be aligned max_offset_{size / base::system_details::CACHE_LINE_SIZE}, current_offset_{0} { @@ -16,7 +17,8 @@ aligned_stack::aligned_stack(char *memory_pointer, size_t size) : aligned_stack::aligned_stack(char *unaligned_memory_pointer, size_t size, size_t unaligned_size) : unaligned_memory_pointer_{unaligned_memory_pointer}, memory_pointer_{base::alignment::next_alignment(unaligned_memory_pointer)}, - max_offset_{unaligned_size / base::system_details::CACHE_LINE_SIZE} { + max_offset_{unaligned_size / base::system_details::CACHE_LINE_SIZE}, + current_offset_{0} { PLS_ASSERT(size == base::alignment::previous_alignment(unaligned_size), "Initialized aligned stack with invalid memory configuration!") } diff --git a/lib/pls/src/internal/scheduling/task.cpp 
b/lib/pls/src/internal/scheduling/task.cpp index 1b25604..5e78b37 100644 --- a/lib/pls/src/internal/scheduling/task.cpp +++ b/lib/pls/src/internal/scheduling/task.cpp @@ -8,46 +8,6 @@ namespace pls { namespace internal { namespace scheduling { -task::task() : - finished_construction_{false}, - ref_count_{0}, - parent_{nullptr}, - deque_offset_{0} {} - -void *task::allocate_memory(long size) { - if (finished_construction_) { - PLS_ERROR("Must not allocate dynamic task memory after it's construction.") - } - return thread_state::get()->deque_.push_bytes(size); -} - -void task::execute() { - PROFILE_WORK_BLOCK("execute task") - auto last_executing = thread_state::get()->current_task_; - thread_state::get()->current_task_ = this; - - execute_internal(); - PROFILE_END_BLOCK - - wait_for_all(); - thread_state::get()->current_task_ = last_executing; - - if (parent_ != nullptr) { - parent_->ref_count_--; - } -} - -void task::wait_for_all() { - auto scheduler = thread_state::get()->scheduler_; - - while (ref_count_ > 0) { - if (!scheduler->try_execute_local()) { - scheduler->try_execute_stolen(); - } - } - thread_state::get()->deque_.reset_offset(deque_offset_); -} - } } } -- libgit2 0.26.0