From 3535cbd8e2f90c28feac6119d02327774be92309 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Tue, 9 Apr 2019 22:34:17 +0200 Subject: [PATCH] Cache Align scheduler_memory. --- app/playground/main.cpp | 6 ++---- app/test_for_new/main.cpp | 2 +- lib/pls/CMakeLists.txt | 1 + lib/pls/include/pls/internal/base/alignment.h | 29 +++++++++++++++++++++++++++++ lib/pls/include/pls/internal/base/system_details.h | 4 +++- lib/pls/include/pls/internal/data_structures/aligned_stack.h | 10 ++++------ lib/pls/include/pls/internal/scheduling/scheduler_memory.h | 51 +++++++++++++++++++++++++++++++++------------------ lib/pls/src/internal/base/alignment.cpp | 27 +++++++++++++++++++++++++++ lib/pls/src/internal/data_structures/aligned_stack.cpp | 15 +-------------- lib/pls/src/internal/scheduling/scheduler_memory.cpp | 12 ++++++------ test/data_structures_test.cpp | 22 +++++++++++----------- 11 files changed, 118 insertions(+), 61 deletions(-) create mode 100644 lib/pls/include/pls/internal/base/alignment.h create mode 100644 lib/pls/src/internal/base/alignment.cpp diff --git a/app/playground/main.cpp b/app/playground/main.cpp index 5851538..4a33c29 100644 --- a/app/playground/main.cpp +++ b/app/playground/main.cpp @@ -12,10 +12,8 @@ using namespace pls; int main() { - using aligned_state = std::aligned_storage::type; - aligned_state data; - - std::cout << sizeof(aligned_state) << std::endl; malloc_scheduler_memory sched_memory{8}; std::cout << (std::uintptr_t)sched_memory.thread_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(2) % 64 << ", " << std::endl; + std::cout << (std::uintptr_t)sched_memory.thread_state_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(2) % 64 << ", " << std::endl; + std::cout << (std::uintptr_t)sched_memory.task_stack_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(2) % 64 << ", " << std::endl; } diff --git a/app/test_for_new/main.cpp b/app/test_for_new/main.cpp index fc48d64..2e74529 100644 --- a/app/test_for_new/main.cpp +++ b/app/test_for_new/main.cpp @@ -1,7 +1,7 @@ #include #include -using namespace pls::internal::data_structures; +using namespace pls::internal::base; int global = 0; diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 28218d5..7eb497f 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -9,6 +9,7 @@ add_library(pls STATIC include/pls/internal/base/barrier.h src/internal/base/barrier.cpp include/pls/internal/base/system_details.h include/pls/internal/base/error_handling.h + include/pls/internal/base/alignment.h src/internal/base/alignment.cpp include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp diff --git a/lib/pls/include/pls/internal/base/alignment.h b/lib/pls/include/pls/internal/base/alignment.h new file mode 100644 index 0000000..4dc4752 --- /dev/null +++ b/lib/pls/include/pls/internal/base/alignment.h @@ -0,0 +1,29 @@ + +#ifndef PLS_ALIGNMENT_H +#define PLS_ALIGNMENT_H + +#include +#include + +#include "system_details.h" + +namespace pls { + namespace internal { + namespace base { + namespace alignment { + template + struct aligned_wrapper { + alignas(system_details::CACHE_LINE_SIZE) unsigned char data[sizeof(T)]; + + T* pointer() { return reinterpret_cast(data); } + }; + void* allocate_aligned(size_t size); + + std::uintptr_t next_alignment(std::uintptr_t size); + char* next_alignment(char* pointer); + } + } + } +} + +#endif //PLS_ALIGNMENT_H diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h index 9df0707..4aa8965 100644 --- a/lib/pls/include/pls/internal/base/system_details.h +++ b/lib/pls/include/pls/internal/base/system_details.h @@ -13,7 +13,9 @@ namespace pls { * PORTABILITY: * Currently sane default values for x86. */ - constexpr std::uintptr_t CACHE_LINE_SIZE = 64; + namespace system_details { + constexpr std::uintptr_t CACHE_LINE_SIZE = 64; + } } } } diff --git a/lib/pls/include/pls/internal/data_structures/aligned_stack.h b/lib/pls/include/pls/internal/data_structures/aligned_stack.h index ae5c2b6..743ab56 100644 --- a/lib/pls/include/pls/internal/data_structures/aligned_stack.h +++ b/lib/pls/include/pls/internal/data_structures/aligned_stack.h @@ -6,6 +6,7 @@ #include #include "pls/internal/base/error_handling.h" +#include "pls/internal/base/alignment.h" namespace pls { namespace internal { @@ -29,14 +30,11 @@ namespace pls { // Current head will always be aligned to cache lines char* head_; - - static std::uintptr_t next_alignment(std::uintptr_t size); - static char* next_alignment(char* pointer); public: typedef char* state; aligned_stack(): memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {}; - aligned_stack(char* memory_region, const std::size_t size); + aligned_stack(char* memory_region, std::size_t size); template T* push(const T& object) { @@ -49,7 +47,7 @@ namespace pls { void* result = reinterpret_cast(head_); // Move head to next aligned position after new object - head_ = next_alignment(head_ + sizeof(T)); + head_ = base::alignment::next_alignment(head_ + sizeof(T)); if (head_ >= memory_end_) { PLS_ERROR("Tried to allocate object on alligned_stack without sufficient memory!"); } @@ -59,7 +57,7 @@ namespace pls { template T pop() { - head_ = head_ - next_alignment(sizeof(T)); + head_ = head_ - base::alignment::next_alignment(sizeof(T)); return *reinterpret_cast(head_); } diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h index dd233e9..25d898f 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h @@ -14,7 +14,7 @@ namespace pls { class scheduler_memory { public: - virtual size_t max_threads() = 0; + virtual size_t max_threads() const = 0; virtual thread_state* thread_state_for(size_t id) = 0; virtual scheduler_thread* thread_for(size_t id) = 0; virtual data_structures::aligned_stack* task_stack_for(size_t id) = 0; @@ -22,39 +22,54 @@ namespace pls { template class static_scheduler_memory: public scheduler_memory { - std::array threads_; - std::array thread_states_; - std::array, MAX_THREADS> task_stacks_memory_; - std::array task_stacks_; + // Everyone of these types has to live on its own cache line, + // as each thread uses one of them independently. + // Therefore it would be a major performance hit if we shared cache lines on these. + using aligned_thread = base::alignment::aligned_wrapper; + using aligned_thread_state = base::alignment::aligned_wrapper; + using aligned_thread_stack = base::alignment::aligned_wrapper>; + using aligned_aligned_stack = base::alignment::aligned_wrapper; + + std::array threads_; + std::array thread_states_; + std::array task_stacks_memory_; + std::array task_stacks_; public: static_scheduler_memory() { for (size_t i = 0; i < MAX_THREADS; i++) { - task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i].data(), TASK_STACK_SIZE); + new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i].pointer()->data(), TASK_STACK_SIZE); } } - size_t max_threads() override { return MAX_THREADS; } - thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } - scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } - data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } + size_t max_threads() const override { return MAX_THREADS; } + thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); } + scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); } + data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); } }; class malloc_scheduler_memory: public scheduler_memory { - size_t num_threads_; + // Everyone of these types has to live on its own cache line, + // as each thread uses one of them independently. + // Therefore it would be a major performance hit if we shared cache lines on these. + using aligned_thread = base::alignment::aligned_wrapper; + using aligned_thread_state = base::alignment::aligned_wrapper; + using aligned_aligned_stack = base::alignment::aligned_wrapper; + + const size_t num_threads_; - alignas(64) scheduler_thread* threads_; - thread_state* thread_states_; + aligned_thread* threads_; + aligned_thread_state * thread_states_; char** task_stacks_memory_; - data_structures::aligned_stack* task_stacks_; + aligned_aligned_stack * task_stacks_; public: explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16); ~malloc_scheduler_memory(); - size_t max_threads() override { return num_threads_; } - thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } - scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } - data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } + size_t max_threads() const override { return num_threads_; } + thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); } + scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); } + data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); } }; } } diff --git a/lib/pls/src/internal/base/alignment.cpp b/lib/pls/src/internal/base/alignment.cpp new file mode 100644 index 0000000..af95adb --- /dev/null +++ b/lib/pls/src/internal/base/alignment.cpp @@ -0,0 +1,27 @@ +#include "pls/internal/base/alignment.h" +#include "pls/internal/base/system_details.h" + +namespace pls { + namespace internal { + namespace base { + namespace alignment { + void* allocate_aligned(size_t size) { + return aligned_alloc(system_details::CACHE_LINE_SIZE, size); + } + + std::uintptr_t next_alignment(std::uintptr_t size) { + std::uintptr_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE; + if (miss_alignment == 0) { + return size; + } else { + return size + (base::system_details::CACHE_LINE_SIZE - miss_alignment); + } + } + + char* next_alignment(char* pointer) { + return reinterpret_cast(next_alignment(reinterpret_cast(pointer))); + } + } + } + } +} diff --git a/lib/pls/src/internal/data_structures/aligned_stack.cpp b/lib/pls/src/internal/data_structures/aligned_stack.cpp index ff54dd3..2a4d6d9 100644 --- a/lib/pls/src/internal/data_structures/aligned_stack.cpp +++ b/lib/pls/src/internal/data_structures/aligned_stack.cpp @@ -7,20 +7,7 @@ namespace pls { aligned_stack::aligned_stack(char* memory_region, const std::size_t size): memory_start_{memory_region}, memory_end_{memory_region + size}, - head_{next_alignment(memory_start_)} {} - - std::uintptr_t aligned_stack::next_alignment(std::uintptr_t size) { - std::uintptr_t miss_alignment = size % base::CACHE_LINE_SIZE; - if (miss_alignment == 0) { - return size; - } else { - return size + (base::CACHE_LINE_SIZE - miss_alignment); - } - } - - char* aligned_stack::next_alignment(char* pointer) { - return reinterpret_cast(next_alignment(reinterpret_cast(pointer))); - } + head_{base::alignment::next_alignment(memory_start_)} {} } } } diff --git a/lib/pls/src/internal/scheduling/scheduler_memory.cpp b/lib/pls/src/internal/scheduling/scheduler_memory.cpp index 7201242..8a65002 100644 --- a/lib/pls/src/internal/scheduling/scheduler_memory.cpp +++ b/lib/pls/src/internal/scheduling/scheduler_memory.cpp @@ -5,14 +5,14 @@ namespace pls { namespace scheduling { malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack): num_threads_{num_threads} { - threads_ = reinterpret_cast(malloc(num_threads * sizeof(scheduler_thread))); - thread_states_ = reinterpret_cast(malloc(num_threads * sizeof(thread_state))); + threads_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread))); + thread_states_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread_state))); - task_stacks_ = reinterpret_cast(malloc(num_threads * sizeof(data_structures::aligned_stack))); - task_stacks_memory_ = reinterpret_cast(malloc(num_threads * sizeof(char*))); + task_stacks_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_aligned_stack))); + task_stacks_memory_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(char*))); for (size_t i = 0; i < num_threads_; i++) { - task_stacks_memory_[i] = reinterpret_cast(malloc(memory_per_stack)); - task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack); + task_stacks_memory_[i] = reinterpret_cast(base::alignment::allocate_aligned(memory_per_stack)); + new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack); } } diff --git a/test/data_structures_test.cpp b/test/data_structures_test.cpp index 616b85f..a878d72 100644 --- a/test/data_structures_test.cpp +++ b/test/data_structures_test.cpp @@ -27,20 +27,20 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/ auto pointer_two = stack.push(small_data_two); auto pointer_three = stack.push(small_data_three); - REQUIRE(reinterpret_cast(pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_two) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_three) % CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_two) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_three) % system_details::CACHE_LINE_SIZE == 0); } SECTION( "stack correctly pushes above linesize objects" ) { std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; - std::array big_data_one{}; + std::array big_data_one{}; auto big_pointer_one = stack.push(big_data_one); auto small_pointer_one = stack.push(small_data_one); - REQUIRE(reinterpret_cast(big_pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(small_pointer_one) % CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(big_pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(small_pointer_one) % system_details::CACHE_LINE_SIZE == 0); } SECTION( "stack correctly stores and retrieves objects" ) { @@ -65,11 +65,11 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/ auto pointer_four = stack.push(small_data_two); auto pointer_five = stack.push(small_data_three); - REQUIRE(reinterpret_cast(pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_two) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_three) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_four) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_five) % CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_two) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_three) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_four) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_five) % system_details::CACHE_LINE_SIZE == 0); REQUIRE(pointer_four == pointer_two); REQUIRE(pointer_five == pointer_three); -- libgit2 0.26.0