Commit 3535cbd8 by FritzFlorian

Cache Align scheduler_memory.

parent e2e34b02
Pipeline #1144 passed with stages
in 3 minutes 32 seconds
......@@ -12,10 +12,8 @@
using namespace pls;
int main() {
using aligned_state = std::aligned_storage<sizeof(internal::scheduling::thread_state), 64>::type;
aligned_state data;
std::cout << sizeof(aligned_state) << std::endl;
malloc_scheduler_memory sched_memory{8};
std::cout << (std::uintptr_t)sched_memory.thread_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(2) % 64 << ", " << std::endl;
std::cout << (std::uintptr_t)sched_memory.thread_state_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(2) % 64 << ", " << std::endl;
std::cout << (std::uintptr_t)sched_memory.task_stack_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(2) % 64 << ", " << std::endl;
}
#include <pls/internal/base/thread.h>
#include <pls/internal/helpers/prohibit_new.h>
using namespace pls::internal::data_structures;
using namespace pls::internal::base;
int global = 0;
......
......@@ -9,6 +9,7 @@ add_library(pls STATIC
include/pls/internal/base/barrier.h src/internal/base/barrier.cpp
include/pls/internal/base/system_details.h
include/pls/internal/base/error_handling.h
include/pls/internal/base/alignment.h src/internal/base/alignment.cpp
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
......
#ifndef PLS_ALIGNMENT_H
#define PLS_ALIGNMENT_H
#include <cstdint>
#include <cstdlib>
#include "system_details.h"
namespace pls {
namespace internal {
namespace base {
namespace alignment {
template<typename T>
struct aligned_wrapper {
alignas(system_details::CACHE_LINE_SIZE) unsigned char data[sizeof(T)];
T* pointer() { return reinterpret_cast<T*>(data); }
};
void* allocate_aligned(size_t size);
std::uintptr_t next_alignment(std::uintptr_t size);
char* next_alignment(char* pointer);
}
}
}
}
#endif //PLS_ALIGNMENT_H
......@@ -13,7 +13,9 @@ namespace pls {
* PORTABILITY:
* Currently sane default values for x86.
*/
constexpr std::uintptr_t CACHE_LINE_SIZE = 64;
namespace system_details {
constexpr std::uintptr_t CACHE_LINE_SIZE = 64;
}
}
}
}
......
......@@ -6,6 +6,7 @@
#include <cstdlib>
#include "pls/internal/base/error_handling.h"
#include "pls/internal/base/alignment.h"
namespace pls {
namespace internal {
......@@ -29,14 +30,11 @@ namespace pls {
// Current head will always be aligned to cache lines
char* head_;
static std::uintptr_t next_alignment(std::uintptr_t size);
static char* next_alignment(char* pointer);
public:
typedef char* state;
aligned_stack(): memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {};
aligned_stack(char* memory_region, const std::size_t size);
aligned_stack(char* memory_region, std::size_t size);
template<typename T>
T* push(const T& object) {
......@@ -49,7 +47,7 @@ namespace pls {
void* result = reinterpret_cast<T*>(head_);
// Move head to next aligned position after new object
head_ = next_alignment(head_ + sizeof(T));
head_ = base::alignment::next_alignment(head_ + sizeof(T));
if (head_ >= memory_end_) {
PLS_ERROR("Tried to allocate object on alligned_stack without sufficient memory!");
}
......@@ -59,7 +57,7 @@ namespace pls {
template<typename T>
T pop() {
head_ = head_ - next_alignment(sizeof(T));
head_ = head_ - base::alignment::next_alignment(sizeof(T));
return *reinterpret_cast<T*>(head_);
}
......
......@@ -14,7 +14,7 @@ namespace pls {
class scheduler_memory {
public:
virtual size_t max_threads() = 0;
virtual size_t max_threads() const = 0;
virtual thread_state* thread_state_for(size_t id) = 0;
virtual scheduler_thread* thread_for(size_t id) = 0;
virtual data_structures::aligned_stack* task_stack_for(size_t id) = 0;
......@@ -22,39 +22,54 @@ namespace pls {
template<size_t MAX_THREADS, size_t TASK_STACK_SIZE>
class static_scheduler_memory: public scheduler_memory {
std::array<scheduler_thread, MAX_THREADS> threads_;
std::array<thread_state, MAX_THREADS> thread_states_;
std::array<std::array<char, TASK_STACK_SIZE>, MAX_THREADS> task_stacks_memory_;
std::array<data_structures::aligned_stack, MAX_THREADS> task_stacks_;
// Everyone of these types has to live on its own cache line,
// as each thread uses one of them independently.
// Therefore it would be a major performance hit if we shared cache lines on these.
using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_thread_stack = base::alignment::aligned_wrapper<std::array<char, TASK_STACK_SIZE>>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
std::array<aligned_thread, MAX_THREADS> threads_;
std::array<aligned_thread_state, MAX_THREADS> thread_states_;
std::array<aligned_thread_stack, MAX_THREADS> task_stacks_memory_;
std::array<aligned_aligned_stack, MAX_THREADS> task_stacks_;
public:
static_scheduler_memory() {
for (size_t i = 0; i < MAX_THREADS; i++) {
task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i].data(), TASK_STACK_SIZE);
new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i].pointer()->data(), TASK_STACK_SIZE);
}
}
size_t max_threads() override { return MAX_THREADS; }
thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; }
scheduler_thread* thread_for(size_t id) override { return &threads_[id]; }
data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; }
size_t max_threads() const override { return MAX_THREADS; }
thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); }
scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); }
data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); }
};
class malloc_scheduler_memory: public scheduler_memory {
size_t num_threads_;
// Everyone of these types has to live on its own cache line,
// as each thread uses one of them independently.
// Therefore it would be a major performance hit if we shared cache lines on these.
using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
const size_t num_threads_;
alignas(64) scheduler_thread* threads_;
thread_state* thread_states_;
aligned_thread* threads_;
aligned_thread_state * thread_states_;
char** task_stacks_memory_;
data_structures::aligned_stack* task_stacks_;
aligned_aligned_stack * task_stacks_;
public:
explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16);
~malloc_scheduler_memory();
size_t max_threads() override { return num_threads_; }
thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; }
scheduler_thread* thread_for(size_t id) override { return &threads_[id]; }
data_structures::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; }
size_t max_threads() const override { return num_threads_; }
thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); }
scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); }
data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); }
};
}
}
......
#include "pls/internal/base/alignment.h"
#include "pls/internal/base/system_details.h"
namespace pls {
namespace internal {
namespace base {
namespace alignment {
void* allocate_aligned(size_t size) {
return aligned_alloc(system_details::CACHE_LINE_SIZE, size);
}
std::uintptr_t next_alignment(std::uintptr_t size) {
std::uintptr_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE;
if (miss_alignment == 0) {
return size;
} else {
return size + (base::system_details::CACHE_LINE_SIZE - miss_alignment);
}
}
char* next_alignment(char* pointer) {
return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t >(pointer)));
}
}
}
}
}
......@@ -7,20 +7,7 @@ namespace pls {
aligned_stack::aligned_stack(char* memory_region, const std::size_t size):
memory_start_{memory_region},
memory_end_{memory_region + size},
head_{next_alignment(memory_start_)} {}
std::uintptr_t aligned_stack::next_alignment(std::uintptr_t size) {
std::uintptr_t miss_alignment = size % base::CACHE_LINE_SIZE;
if (miss_alignment == 0) {
return size;
} else {
return size + (base::CACHE_LINE_SIZE - miss_alignment);
}
}
char* aligned_stack::next_alignment(char* pointer) {
return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t >(pointer)));
}
head_{base::alignment::next_alignment(memory_start_)} {}
}
}
}
......@@ -5,14 +5,14 @@ namespace pls {
namespace scheduling {
malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack):
num_threads_{num_threads} {
threads_ = reinterpret_cast<scheduler_thread*>(malloc(num_threads * sizeof(scheduler_thread)));
thread_states_ = reinterpret_cast<thread_state*>(malloc(num_threads * sizeof(thread_state)));
threads_ = reinterpret_cast<aligned_thread *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread)));
thread_states_ = reinterpret_cast<aligned_thread_state *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread_state)));
task_stacks_ = reinterpret_cast<data_structures::aligned_stack*>(malloc(num_threads * sizeof(data_structures::aligned_stack)));
task_stacks_memory_ = reinterpret_cast<char**>(malloc(num_threads * sizeof(char*)));
task_stacks_ = reinterpret_cast<aligned_aligned_stack *>(base::alignment::allocate_aligned(num_threads * sizeof(aligned_aligned_stack)));
task_stacks_memory_ = reinterpret_cast<char**>(base::alignment::allocate_aligned(num_threads * sizeof(char*)));
for (size_t i = 0; i < num_threads_; i++) {
task_stacks_memory_[i] = reinterpret_cast<char*>(malloc(memory_per_stack));
task_stacks_[i] = data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack);
task_stacks_memory_[i] = reinterpret_cast<char*>(base::alignment::allocate_aligned(memory_per_stack));
new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack);
}
}
......
......@@ -27,20 +27,20 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/
auto pointer_two = stack.push(small_data_two);
auto pointer_three = stack.push(small_data_three);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % system_details::CACHE_LINE_SIZE == 0);
}
SECTION( "stack correctly pushes above linesize objects" ) {
std::array<char, 5> small_data_one{'a', 'b', 'c', 'd', 'e'};
std::array<char, CACHE_LINE_SIZE + 10> big_data_one{};
std::array<char, system_details::CACHE_LINE_SIZE + 10> big_data_one{};
auto big_pointer_one = stack.push(big_data_one);
auto small_pointer_one = stack.push(small_data_one);
REQUIRE(reinterpret_cast<std::uintptr_t>(big_pointer_one) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(small_pointer_one) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(big_pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(small_pointer_one) % system_details::CACHE_LINE_SIZE == 0);
}
SECTION( "stack correctly stores and retrieves objects" ) {
......@@ -65,11 +65,11 @@ TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/
auto pointer_four = stack.push(small_data_two);
auto pointer_five = stack.push(small_data_three);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_four) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_five) % CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_one) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_two) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_three) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_four) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(reinterpret_cast<std::uintptr_t>(pointer_five) % system_details::CACHE_LINE_SIZE == 0);
REQUIRE(pointer_four == pointer_two);
REQUIRE(pointer_five == pointer_three);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment