Commit 3535cbd8 by FritzFlorian

Cache Align scheduler_memory.

parent e2e34b02
Pipeline #1144 passed with stages
in 3 minutes 32 seconds
...@@ -12,10 +12,8 @@ ...@@ -12,10 +12,8 @@
using namespace pls; using namespace pls;
int main() { int main() {
using aligned_state = std::aligned_storage<sizeof(internal::scheduling::thread_state), 64>::type;
aligned_state data;
std::cout << sizeof(aligned_state) << std::endl;
malloc_scheduler_memory sched_memory{8}; malloc_scheduler_memory sched_memory{8};
std::cout << (std::uintptr_t)sched_memory.thread_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(2) % 64 << ", " << std::endl; std::cout << (std::uintptr_t)sched_memory.thread_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(2) % 64 << ", " << std::endl;
std::cout << (std::uintptr_t)sched_memory.thread_state_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(2) % 64 << ", " << std::endl;
std::cout << (std::uintptr_t)sched_memory.task_stack_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(2) % 64 << ", " << std::endl;
} }
#include <pls/internal/base/thread.h> #include <pls/internal/base/thread.h>
#include <pls/internal/helpers/prohibit_new.h> #include <pls/internal/helpers/prohibit_new.h>
using namespace pls::internal::data_structures; using namespace pls::internal::base;
int global = 0; int global = 0;
......
...@@ -9,6 +9,7 @@ add_library(pls STATIC ...@@ -9,6 +9,7 @@ add_library(pls STATIC
include/pls/internal/base/barrier.h src/internal/base/barrier.cpp include/pls/internal/base/barrier.h src/internal/base/barrier.cpp
include/pls/internal/base/system_details.h include/pls/internal/base/system_details.h
include/pls/internal/base/error_handling.h include/pls/internal/base/error_handling.h
include/pls/internal/base/alignment.h src/internal/base/alignment.cpp
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
......
#ifndef PLS_ALIGNMENT_H
#define PLS_ALIGNMENT_H
#include <cstdint>
#include <cstdlib>
#include "system_details.h"
namespace pls {
namespace internal {
namespace base {
namespace alignment {
/**
 * Raw, cache-line-aligned storage large enough to hold one object of type T.
 *
 * The wrapper itself only reserves bytes; it does not construct or destroy a T.
 *
 * @tparam T type whose storage is reserved.
 */
template<typename T>
struct aligned_wrapper {
  // Align to a full cache line so that adjacent wrappers in an array never
  // share one, and additionally to alignof(T) so that types with an even
  // stricter requirement still receive valid storage (when several alignas
  // specifiers are given, the strictest one applies).
  alignas(system_details::CACHE_LINE_SIZE) alignas(T) unsigned char data[sizeof(T)];

  /** @return the reserved storage reinterpreted as a pointer to T. */
  T* pointer() { return reinterpret_cast<T*>(data); }
  /** @return the reserved storage as a pointer to const T, for const wrappers. */
  const T* pointer() const { return reinterpret_cast<const T*>(data); }
};
// Allocates `size` bytes through aligned_alloc, using
// system_details::CACHE_LINE_SIZE as the alignment.
void* allocate_aligned(size_t size);
// Rounds `size` up to the next multiple of system_details::CACHE_LINE_SIZE;
// a value that already is a multiple is returned unchanged.
std::uintptr_t next_alignment(std::uintptr_t size);
// Pointer overload: applies the integer rounding above to the address held in
// `pointer` and returns the result as a char*.
char* next_alignment(char* pointer);
}
}
}
}
#endif //PLS_ALIGNMENT_H
...@@ -13,7 +13,9 @@ namespace pls { ...@@ -13,7 +13,9 @@ namespace pls {
* PORTABILITY: * PORTABILITY:
* Currently sane default values for x86. * Currently sane default values for x86.
*/ */
constexpr std::uintptr_t CACHE_LINE_SIZE = 64; namespace system_details {
constexpr std::uintptr_t CACHE_LINE_SIZE = 64;
}
} }
} }
} }
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <cstdlib> #include <cstdlib>
#include "pls/internal/base/error_handling.h" #include "pls/internal/base/error_handling.h"
#include "pls/internal/base/alignment.h"
namespace pls { namespace pls {
namespace internal { namespace internal {
...@@ -29,14 +30,11 @@ namespace pls { ...@@ -29,14 +30,11 @@ namespace pls {
// Current head will always be aligned to cache lines // Current head will always be aligned to cache lines
char* head_; char* head_;
static std::uintptr_t next_alignment(std::uintptr_t size);
static char* next_alignment(char* pointer);
public: public:
typedef char* state; typedef char* state;
aligned_stack(): memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {}; aligned_stack(): memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {};
aligned_stack(char* memory_region, const std::size_t size); aligned_stack(char* memory_region, std::size_t size);
template<typename T> template<typename T>
T* push(const T& object) { T* push(const T& object) {
...@@ -49,7 +47,7 @@ namespace pls { ...@@ -49,7 +47,7 @@ namespace pls {
void* result = reinterpret_cast<T*>(head_); void* result = reinterpret_cast<T*>(head_);
// Move head to next aligned position after new object // Move head to next aligned position after new object
head_ = next_alignment(head_ + sizeof(T)); head_ = base::alignment::next_alignment(head_ + sizeof(T));
if (head_ >= memory_end_) { if (head_ >= memory_end_) {
PLS_ERROR("Tried to allocate object on alligned_stack without sufficient memory!"); PLS_ERROR("Tried to allocate object on alligned_stack without sufficient memory!");
} }
...@@ -59,7 +57,7 @@ namespace pls { ...@@ -59,7 +57,7 @@ namespace pls {
template<typename T> template<typename T>
T pop() { T pop() {
head_ = head_ - next_alignment(sizeof(T)); head_ = head_ - base::alignment::next_alignment(sizeof(T));
return *reinterpret_cast<T*>(head_); return *reinterpret_cast<T*>(head_);
} }
......
...@@ -14,7 +14,7 @@ namespace pls { ...@@ -14,7 +14,7 @@ namespace pls {
class scheduler_memory { class scheduler_memory {
public: public:
virtual size_t max_threads() = 0; virtual size_t max_threads() const = 0;
virtual thread_state* thread_state_for(size_t id) = 0; virtual thread_state* thread_state_for(size_t id) = 0;
virtual scheduler_thread* thread_for(size_t id) = 0; virtual scheduler_thread* thread_for(size_t id) = 0;
virtual data_structures::aligned_stack* task_stack_for(size_t id) = 0; virtual data_structures::aligned_stack* task_stack_for(size_t id) = 0;
...@@ -22,39 +22,54 @@ namespace pls { ...@@ -22,39 +22,54 @@ namespace pls {
template<size_t MAX_THREADS, size_t TASK_STACK_SIZE> template<size_t MAX_THREADS, size_t TASK_STACK_SIZE>
class static_scheduler_memory: public scheduler_memory { class static_scheduler_memory: public scheduler_memory {
std::array<scheduler_thread, MAX_THREADS> threads_; // Everyone of these types has to live on its own cache line,
std::array<thread_state, MAX_THREADS> thread_states_; // as each thread uses one of them independently.
std::array<std::array<char, TASK_STACK_SIZE>, MAX_THREADS> task_stacks_memory_; // Therefore it would be a major performance hit if we shared cache lines on these.
std::array<data_structures::aligned_stack, MAX_THREADS> task_stacks_; using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_thread_stack = base::alignment::aligned_wrapper<std::array<char, TASK_STACK_SIZE>>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
std::array<aligned_thread, MAX_THREADS> threads_;
std::array<aligned_thread_state, MAX_THREADS> thread_states_;
std::array<aligned_thread_stack, MAX_THREADS> task_stacks_memory_;
std::array<aligned_aligned_stack, MAX_THREADS> task_stacks_;
public: public:
static_scheduler_memory() { static_scheduler_memory() {
for (size_t i = 0; i < MAX_THREADS; i++) { for (size_t i = 0; i < MAX_THREADS; i++) {
task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i].data(), TASK_STACK_SIZE); new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i].pointer()->data(), TASK_STACK_SIZE);
} }
} }
size_t max_threads() override { return MAX_THREADS; } size_t max_threads() const override { return MAX_THREADS; }
thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); }
scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); }
data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); }
}; };
class malloc_scheduler_memory: public scheduler_memory { class malloc_scheduler_memory: public scheduler_memory {
size_t num_threads_; // Everyone of these types has to live on its own cache line,
// as each thread uses one of them independently.
// Therefore it would be a major performance hit if we shared cache lines on these.
using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
const size_t num_threads_;
alignas(64) scheduler_thread* threads_; aligned_thread* threads_;
thread_state* thread_states_; aligned_thread_state * thread_states_;
char** task_stacks_memory_; char** task_stacks_memory_;
data_structures::aligned_stack* task_stacks_; aligned_aligned_stack * task_stacks_;
public: public:
explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16); explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16);
~malloc_scheduler_memory(); ~malloc_scheduler_memory();
size_t max_threads() override { return num_threads_; } size_t max_threads() const override { return num_threads_; }
thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); }
scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); }
data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); }
}; };
} }
} }
......
#include "pls/internal/base/alignment.h"
#include "pls/internal/base/system_details.h"
namespace pls {
namespace internal {
namespace base {
namespace alignment {
void* allocate_aligned(size_t size) {
return aligned_alloc(system_details::CACHE_LINE_SIZE, size);
}
// Rounds `size` up to the nearest multiple of the cache line size.
// A value that is already a multiple is handed back unchanged.
std::uintptr_t next_alignment(std::uintptr_t size) {
  const std::uintptr_t line = base::system_details::CACHE_LINE_SIZE;
  const std::uintptr_t overhang = size % line;
  return overhang == 0 ? size : size + (line - overhang);
}
char* next_alignment(char* pointer) {
return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t >(pointer)));
}
}
}
}
}
...@@ -7,20 +7,7 @@ namespace pls { ...@@ -7,20 +7,7 @@ namespace pls {
aligned_stack::aligned_stack(char* memory_region, const std::size_t size): aligned_stack::aligned_stack(char* memory_region, const std::size_t size):
memory_start_{memory_region}, memory_start_{memory_region},
memory_end_{memory_region + size}, memory_end_{memory_region + size},
head_{next_alignment(memory_start_)} {} head_{base::alignment::next_alignment(memory_start_)} {}
std::uintptr_t aligned_stack::next_alignment(std::uintptr_t size) {
std::uintptr_t miss_alignment = size % base::CACHE_LINE_SIZE;
if (miss_alignment == 0) {
return size;
} else {
return size + (base::CACHE_LINE_SIZE - miss_alignment);
}
}
char* aligned_stack::next_alignment(char* pointer) {
return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t >(pointer)));
}
} }
} }
} }
...@@ -5,14 +5,14 @@ namespace pls { ...@@ -5,14 +5,14 @@ namespace pls {
namespace scheduling { namespace scheduling {
malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack): malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack):
num_threads_{num_threads} { num_threads_{num_threads} {
threads_ = reinterpret_cast<scheduler_thread*>(malloc(num_threads * sizeof(scheduler_thread))); threads_ = reinterpret_cast<aligned_thread *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread)));
thread_states_ = reinterpret_cast<thread_state*>(malloc(num_threads * sizeof(thread_state))); thread_states_ = reinterpret_cast<aligned_thread_state *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread_state)));
task_stacks_ = reinterpret_cast<data_structures::aligned_stack*>(malloc(num_threads * sizeof(data_structures::aligned_stack))); task_stacks_ = reinterpret_cast<aligned_aligned_stack *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_aligned_stack)));
task_stacks_memory_ = reinterpret_cast<char**>(malloc(num_threads * sizeof(char*))); task_stacks_memory_ = reinterpret_cast<char**>(base::alignment::allocate_aligned(num_threads * sizeof(char*)));
for (size_t i = 0; i < num_threads_; i++) { for (size_t i = 0; i < num_threads_; i++) {
task_stacks_memory_[i] = reinterpret_cast<char*>(malloc(memory_per_stack)); task_stacks_memory_[i] = reinterpret_cast<char*>(base::alignment::allocate_aligned(memory_per_stack));
task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack); new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack);
} }
} }
......
...@@ -27,20 +27,20 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/ ...@@ -27,20 +27,20 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/
auto pointer_two = stack.push(small_data_two); auto pointer_two = stack.push(small_data_two);
auto pointer_three = stack.push(small_data_three); auto pointer_three = stack.push(small_data_three);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % system_details::CACHE_LINE_SIZE == 0);
} }
SECTION( "stack correctly pushes above linesize objects" ) { SECTION( "stack correctly pushes above linesize objects" ) {
std::array<char, 5> small_data_one{'a', 'b', 'c', 'd', 'e'}; std::array<char, 5> small_data_one{'a', 'b', 'c', 'd', 'e'};
std::array<char, CACHE_LINE_SIZE + 10> big_data_one{}; std::array<char, system_details::CACHE_LINE_SIZE + 10> big_data_one{};
auto big_pointer_one = stack.push(big_data_one); auto big_pointer_one = stack.push(big_data_one);
auto small_pointer_one = stack.push(small_data_one); auto small_pointer_one = stack.push(small_data_one);
REQUIRE(reinterpret_cast<std::uintptr_t>(big_pointer_one) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(big_pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(small_pointer_one) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(small_pointer_one) % system_details::CACHE_LINE_SIZE == 0);
} }
SECTION( "stack correctly stores and retrieves objects" ) { SECTION( "stack correctly stores and retrieves objects" ) {
...@@ -65,11 +65,11 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/ ...@@ -65,11 +65,11 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/
auto pointer_four = stack.push(small_data_two); auto pointer_four = stack.push(small_data_two);
auto pointer_five = stack.push(small_data_three); auto pointer_five = stack.push(small_data_three);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_four) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_four) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_five) % CACHE_LINE_SIZE == 0); REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_five) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(pointer_four == pointer_two); REQUIRE(pointer_four == pointer_two);
REQUIRE(pointer_five == pointer_three); REQUIRE(pointer_five == pointer_three);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment