diff --git a/CMakeLists.txt b/CMakeLists.txt
index be1f6ac..941c725 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ include(cmake/SetupThreadingSupport.cmake)
 include(cmake/SetupThreadSanitizer.cmake)
 include(cmake/SetupAddressSanitizer.cmake)
 include(cmake/SetupEasyProfiler.cmake)
+include(cmake/SetupDebugSymbols.cmake)
 
 # make our internal cmake script collection available in the build process.
 list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/NOTES.md b/NOTES.md
index a495bab..04b1705 100644
--- a/NOTES.md
+++ b/NOTES.md
@@ -4,6 +4,19 @@ A collection of stuff that we noticed during development.
 Useful later on to write a project report and to go back in time
 to find out why certain decisions were made.
 
+## 09.02.2019 - Cache Alignment
+
+Getting cache alignment right requires both parts: correctly aligned data
+types and a correctly aligned base memory region.
+
+Our first tests show that the initial alignment (commit 3535cbd8)
+boosted our library's speedup in the fft_benchmark to match
+Intel TBB's when running on up to 4 threads.
+When crossing the boundary to hyper-threading this falls off.
+We therefore think that contention/cache misses are the reason for
+the bad performance above 4 threads, but we have to investigate
+further to pin down the issue.
+
 ## 08.04.2019 - Random Numbers
 
 We decided to go for a simple linear random number generator
diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 2ec5a0d..2372c66 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -1,6 +1,8 @@
 # Notes on performance measures during development
 
-#### Commit 9c12addf
+#### Commit 52fcb51f - Add basic random stealing
+
+Slight improvement; needs further measurement after removing more important bottlenecks.
 
 | | | | | | | | | | |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -19,3 +21,9 @@ change | 100.39 %| 99.14 %| 98.46 %| 107.74 %| 100.17 %|
 old | 1654.26 us| 969.12 us| 832.13 us| 680.69 us| 718.70 us| 750.80 us| 744.12 us| 775.24 us| 7125.07 us
 new | 1637.04 us| 978.09 us| 799.93 us| 709.33 us| 746.42 us| 684.87 us| 822.30 us| 787.61 us| 7165.59 us
 change | 98.96 %| 100.93 %| 96.13 %| 104.21 %| 103.86 %| 91.22 %| 110.51 %| 101.60 %| 100.57 %
+
+#### Commit 3535cbd8 - Cache Align scheduler_memory
+
+Big improvement of about 6% in our test. This seems like little,
+but 6% spent in the scheduler is a lot, as the 'main work' happens in
+the tasks themselves, not in the scheduler.
diff --git a/README.md b/README.md
index d62f410..8ec38b4 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,70 @@
 [![pipeline status](http://lab.las3.de/gitlab/las3/development/scheduling/predictable_parallel_patterns/badges/master/pipeline.svg)](http://lab.las3.de/gitlab/las3/development/scheduling/predictable_parallel_patterns/commits/master)
 
+## Getting Started
+
+This section gives a brief introduction on how to get a minimal
+project set up that uses the PLS library.
+
+### Installation
+
+Clone the repository and open a terminal session in its folder.
+Create a build folder using `mkdir cmake-build-release`
+and switch into it with `cd cmake-build-release`.
+Set up the cmake project using `cmake ../ -DCMAKE_BUILD_TYPE=RELEASE`,
+then install it as a system-wide dependency using `sudo make install.pls`.
+
+At this point the library is installed on your system.
+To use it, simply add it to your existing cmake project using
+`find_package(pls REQUIRED)` and then link it to your project
+using `target_link_libraries(your_target pls::pls)`.
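+
+As a quick sketch, a consuming project's `CMakeLists.txt` could look roughly
+like the following (the CMake version and all target/file names are placeholders):
+
+```cmake
+cmake_minimum_required(VERSION 3.10)
+project(pls_example)
+
+# Locate the installed PLS package...
+find_package(pls REQUIRED)
+
+# ...and link it against your own target.
+add_executable(your_target main.cpp)
+target_link_libraries(your_target pls::pls)
+```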
+
+### Basic Usage
+
+```c++
+#include <pls/pls.h>
+#include <iostream>
+
+long fib(long n);
+
+int main() {
+    // All memory needed by the scheduler can be allocated in advance, either on the stack or using malloc.
+    const unsigned int num_threads = 8;
+    const unsigned int memory_per_thread = 2 << 14;
+    static pls::static_scheduler_memory<num_threads, memory_per_thread> memory;
+
+    // Create the scheduler instance (starts a thread pool).
+    pls::scheduler scheduler{&memory, num_threads};
+
+    // Wake up the thread pool and perform work.
+    scheduler.perform_work([&] {
+        long result = fib(20);
+        std::cout << "fib(20)=" << result << std::endl;
+    });
+    // At this point the thread pool sleeps.
+    // This can, for example, be used for periodic work.
+}
+
+long fib(long n) {
+    if (n == 0) {
+        return 0;
+    }
+    if (n == 1) {
+        return 1;
+    }
+
+    // Example for the high-level API.
+    // Will run both functions in parallel as separate tasks.
+    long left, right;
+    pls::invoke_parallel(
+        [&] { left = fib(n - 1); },
+        [&] { right = fib(n - 2); }
+    );
+    return left + right;
+}
+```
+
 ## Project Structure
 
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index e5f3453..0b0f505 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -30,7 +30,7 @@ void combine(complex_vector::iterator data, int n) {
         std::complex<double> odd = data[i + n / 2];
 
         // w is the "twiddle-factor".
-        // this could be cached, but we run the same 'base' algorithm parallel/serial,
+        // this could be cached, but we run the same basic algorithm in parallel and serial,
         // so it won't impact the performance comparison.
         std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
 
diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp
index 4ae48ef..a856b51 100644
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -34,7 +34,7 @@ long fib(long n) {
 int main() {
     PROFILE_ENABLE
-    pls::scheduler scheduler{&my_scheduler_memory, 8};
+    pls::scheduler scheduler{&my_scheduler_memory, 2};
 
     long result;
     scheduler.perform_work([&] {
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index 56c0c8e..4a33c29 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -3,69 +3,17 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
+#include
 
 using namespace pls;
 
-// Example for static memory allocation (no malloc or free required)
-static static_scheduler_memory<8, 2 << 12> my_scheduler_memory;
-
-class fib: public fork_join_sub_task {
-    static constexpr int CUTOFF = 20;
-
-    int num_;
-    int* result_;
-
-public:
-    fib(int num, int* result): num_{num}, result_{result} {}
-
-private:
-    static int fib_serial(int num) {
-        if (num == 0) {
-            return 0;
-        }
-        if (num == 1) {
-            return 1;
-        }
-
-        return fib_serial(num - 1) + fib_serial(num - 2);
-    }
-
-protected:
-    void execute_internal() override {
-        if (num_ <= CUTOFF) {
-            *result_ = fib_serial(num_);
-            return;
-        }
-
-        int left_result;
-        int right_result;
-
-        spawn_child(fib{num_ - 1, &left_result});
-        spawn_child(fib{num_ - 2, &right_result});
-
-        wait_for_all();
-        *result_ = left_result + right_result;
-    }
-};
-
-
 int main() {
-    scheduler my_scheduler{&my_scheduler_memory, 4};
-
-    auto start = std::chrono::high_resolution_clock::now();
-    my_scheduler.perform_work([] (){
-        int result;
-
-        fib fib_sub_task{45, &result};
-        fork_join_task tbb_task{&fib_sub_task, task_id{1}};
-        scheduler::execute_task(tbb_task);
-
-        std::cout << "Result: " << result << std::endl;
-    });
-    auto end = std::chrono::high_resolution_clock::now();
-    long time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
- std::cout << "Startup time in us: " << time << std::endl; + malloc_scheduler_memory sched_memory{8}; + std::cout << (std::uintptr_t)sched_memory.thread_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_for(2) % 64 << ", " << std::endl; + std::cout << (std::uintptr_t)sched_memory.thread_state_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.thread_state_for(2) % 64 << ", " << std::endl; + std::cout << (std::uintptr_t)sched_memory.task_stack_for(0) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(1) % 64 << ", " << (std::uintptr_t)sched_memory.task_stack_for(2) % 64 << ", " << std::endl; } diff --git a/cmake/SetupDebugSymbols.cmake b/cmake/SetupDebugSymbols.cmake new file mode 100644 index 0000000..6e17d41 --- /dev/null +++ b/cmake/SetupDebugSymbols.cmake @@ -0,0 +1,5 @@ +option(DEBUG_SYMBOLS "Enable debug symbols" OFF) +if(DEBUG_SYMBOLS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +endif() +message("-- Debug Symbols: ${DEBUG_SYMBOLS}") diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 6cee207..346ad9f 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -1,24 +1,31 @@ # List all required files here (cmake best practice to NOT automate this step!) add_library(pls STATIC - src/pls.cpp include/pls/pls.h - src/internal/base/spin_lock.cpp include/pls/internal/base/spin_lock.h - src/internal/base/thread.cpp include/pls/internal/base/thread.h - include/pls/internal/helpers/prohibit_new.h - src/internal/scheduling/abstract_task.cpp include/pls/internal/scheduling/abstract_task.h - src/internal/scheduling/scheduler.cpp include/pls/internal/scheduling/scheduler.h - src/internal/scheduling/thread_state.cpp include/pls/internal/scheduling/thread_state.h - src/internal/base/barrier.cpp include/pls/internal/base/barrier.h - src/internal/scheduling/root_task.cpp include/pls/internal/scheduling/root_task.h - src/internal/base/aligned_stack.cpp include/pls/internal/base/aligned_stack.h - include/pls/internal/base/system_details.h - src/internal/scheduling/run_on_n_threads_task.cpp include/pls/internal/scheduling/run_on_n_threads_task.h - src/internal/scheduling/fork_join_task.cpp include/pls/internal/scheduling/fork_join_task.h - src/internal/base/deque.cpp include/pls/internal/base/deque.h - src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h - include/pls/internal/base/error_handling.h - include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp - include/pls/internal/helpers/profiler.h - include/pls/internal/helpers/mini_benchmark.h) + include/pls/pls.h src/pls.cpp + + include/pls/algorithms/invoke_parallel.h src/algorithms/invoke_parallel.cpp + + include/pls/internal/base/spin_lock.h src/internal/base/spin_lock.cpp + include/pls/internal/base/thread.h src/internal/base/thread.cpp + include/pls/internal/base/barrier.h src/internal/base/barrier.cpp + include/pls/internal/base/system_details.h + include/pls/internal/base/error_handling.h + include/pls/internal/base/alignment.h src/internal/base/alignment.cpp + + include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp + include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp + + include/pls/internal/helpers/prohibit_new.h + include/pls/internal/helpers/profiler.h + include/pls/internal/helpers/mini_benchmark.h + + 
include/pls/internal/scheduling/root_task.h src/internal/scheduling/root_task.cpp
+        include/pls/internal/scheduling/thread_state.h src/internal/scheduling/thread_state.cpp
+        include/pls/internal/scheduling/abstract_task.h src/internal/scheduling/abstract_task.cpp
+        include/pls/internal/scheduling/scheduler.h src/internal/scheduling/scheduler.cpp
+        include/pls/internal/scheduling/run_on_n_threads_task.h src/internal/scheduling/run_on_n_threads_task.cpp
+        include/pls/internal/scheduling/fork_join_task.h src/internal/scheduling/fork_join_task.cpp
+        include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
+)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
@@ -63,6 +70,13 @@ INSTALl(
         FILES pls-config.cmake
         DESTINATION lib/pls
 )
+# ...add a custom target that will only build the library when installing.
+# This can allow us to speed up the installation on embedded devices.
+ADD_CUSTOM_TARGET(install.pls
+        ${CMAKE_COMMAND}
+        -DBUILD_TYPE=${CMAKE_BUILD_TYPE}
+        -P ${CMAKE_BINARY_DIR}/cmake_install.cmake)
+ADD_DEPENDENCIES(install.pls pls)
 
 # Enable warnings/tidy code checking from our compiler
 target_compile_options(pls PRIVATE
diff --git a/lib/pls/include/pls/internal/base/alignment.h b/lib/pls/include/pls/internal/base/alignment.h
new file mode 100644
index 0000000..4dc4752
--- /dev/null
+++ b/lib/pls/include/pls/internal/base/alignment.h
@@ -0,0 +1,29 @@
+
+#ifndef PLS_ALIGNMENT_H
+#define PLS_ALIGNMENT_H
+
+#include
+#include
+
+#include "system_details.h"
+
+namespace pls {
+    namespace internal {
+        namespace base {
+            namespace alignment {
+                template<typename T>
+                struct aligned_wrapper {
+                    alignas(system_details::CACHE_LINE_SIZE) unsigned char data[sizeof(T)];
+
+                    T* pointer() { return reinterpret_cast<T*>(data); }
+                };
+                void* allocate_aligned(size_t size);
+
+                std::uintptr_t next_alignment(std::uintptr_t size);
+                char* next_alignment(char* pointer);
+            }
+        }
+    }
+}
+
+#endif //PLS_ALIGNMENT_H
diff --git a/lib/pls/include/pls/internal/base/barrier.h b/lib/pls/include/pls/internal/base/barrier.h
index f5ea58b..996f0e0 100644
--- a/lib/pls/include/pls/internal/base/barrier.h
+++ b/lib/pls/include/pls/internal/base/barrier.h
@@ -7,21 +7,22 @@ namespace pls {
     namespace internal {
         namespace base {
+            /**
+             * Provides standard barrier behaviour.
+             * `count` threads have to call `wait()` before any of the `wait()` calls returns,
+             * thus blocking all threads until everyone has reached the barrier.
+             *
+             * PORTABILITY:
+             * Current implementation is based on pthreads.
+             */
             class barrier {
                 pthread_barrier_t barrier_;
 
             public:
-                explicit barrier(const unsigned int count): barrier_{} {
-                    pthread_barrier_init(&barrier_, nullptr, count);
-                }
+                explicit barrier(unsigned int count);
+                ~barrier();
 
-                ~barrier() {
-                    pthread_barrier_destroy(&barrier_);
-                }
-
-                void wait() {
-                    pthread_barrier_wait(&barrier_);
-                }
+                void wait();
             };
         }
     }
diff --git a/lib/pls/include/pls/internal/base/error_handling.h b/lib/pls/include/pls/internal/base/error_handling.h
index d405dde..235964e 100644
--- a/lib/pls/include/pls/internal/base/error_handling.h
+++ b/lib/pls/include/pls/internal/base/error_handling.h
@@ -4,7 +4,12 @@
 
 #include
 
-// TODO: Figure out proper exception handling
+/**
+ * Called when there is a non-recoverable error/broken invariant in the scheduler.
+ * This SHOULD NOT HAPPEN AT ANY POINT in production; any instance of this is a bug!
+ * The implementation can be changed if for example no iostream is available on a system + * (or its inclusion adds too much overhead). + */ #define PLS_ERROR(msg) std::cout << msg << std::endl; exit(1); #endif //PLS_ERROR_HANDLING_H diff --git a/lib/pls/include/pls/internal/base/spin_lock.h b/lib/pls/include/pls/internal/base/spin_lock.h index c2b98c8..5acaf0a 100644 --- a/lib/pls/include/pls/internal/base/spin_lock.h +++ b/lib/pls/include/pls/internal/base/spin_lock.h @@ -10,6 +10,12 @@ namespace pls { namespace internal { namespace base { + /** + * A simple set and test_and_set based spin lock implementation. + * + * PORTABILITY: + * Current implementation is based on C++ 11 atomic_flag. + */ class spin_lock { std::atomic_flag flag_; int yield_at_tries_; diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h index a8dfb72..28f7dff 100644 --- a/lib/pls/include/pls/internal/base/system_details.h +++ b/lib/pls/include/pls/internal/base/system_details.h @@ -7,7 +7,25 @@ namespace pls { namespace internal { namespace base { - constexpr std::uintptr_t CACHE_LINE_SIZE = 64; + /** + * Collection of system details, e.g. hardware cache line size. + * + * PORTABILITY: + * Currently sane default values for x86. + */ + namespace system_details { + /** + * Most processors have 64 byte cache lines + */ + constexpr std::uintptr_t CACHE_LINE_SIZE = 64; + + /** + * Choose one of the following ways to store thread specific data. + * Try to choose the fastest available on this processor/system. + */ +// #define PLS_THREAD_SPECIFIC_PTHREAD + #define PLS_THREAD_SPECIFIC_COMPILER + } } } } diff --git a/lib/pls/include/pls/internal/base/thread.h b/lib/pls/include/pls/internal/base/thread.h index f03be21..fd0fe33 100644 --- a/lib/pls/include/pls/internal/base/thread.h +++ b/lib/pls/include/pls/internal/base/thread.h @@ -10,17 +10,33 @@ #include #include +#include "system_details.h" + namespace pls { namespace internal { namespace base { using thread_entrypoint = void(); + /** + * Static methods than can be performed on the current thread. + * + * usage: + * this_thread::yield(); + * T* state = this_thread::state(); + * + * PORTABILITY: + * Current implementation is based on pthreads. + */ class this_thread { template friend class thread; +#ifdef PLS_THREAD_SPECIFIC_PTHREAD static pthread_key_t local_storage_key_; static bool local_storage_key_initialized_; - +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + static __thread void* local_state_; +#endif public: static void yield() { pthread_yield(); @@ -34,7 +50,12 @@ namespace pls { */ template static T* state() { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD return reinterpret_cast(pthread_getspecific(local_storage_key_)); +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + return reinterpret_cast(local_state_); +#endif } /** @@ -47,10 +68,31 @@ namespace pls { */ template static void set_state(T* state_pointer) { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD pthread_setspecific(this_thread::local_storage_key_, (void*)state_pointer); +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + local_state_ = state_pointer; +#endif } }; + /** + * Abstraction for starting a function in a sparate thread. + * + * @tparam Function Lambda being started on the new thread. + * @tparam State State type held for this thread. + * + * usage: + * T* state; + * auto thread = start_thread([] { + * // Run on new thread + * }, state); + * thread.join(); // Wait for it to finish + * + * PORTABILITY: + * Current implementation is based on pthreads. 
+ */ template class thread { friend class this_thread; @@ -92,10 +134,12 @@ namespace pls { startup_flag_{nullptr}, pthread_thread_{} { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD if (!this_thread::local_storage_key_initialized_) { pthread_key_create(&this_thread::local_storage_key_, nullptr); this_thread::local_storage_key_initialized_ = true; } +#endif // We only need this during startup, will be destroyed when out of scope std::atomic_flag startup_flag{ATOMIC_FLAG_INIT}; diff --git a/lib/pls/include/pls/internal/base/aligned_stack.h b/lib/pls/include/pls/internal/data_structures/aligned_stack.h similarity index 74% rename from lib/pls/include/pls/internal/base/aligned_stack.h rename to lib/pls/include/pls/internal/data_structures/aligned_stack.h index 7c16fec..743ab56 100644 --- a/lib/pls/include/pls/internal/base/aligned_stack.h +++ b/lib/pls/include/pls/internal/data_structures/aligned_stack.h @@ -6,10 +6,23 @@ #include #include "pls/internal/base/error_handling.h" +#include "pls/internal/base/alignment.h" namespace pls { namespace internal { - namespace base { + namespace data_structures { + /** + * Generic stack-like data structure that allows to allocate arbitrary objects in a given memory region. + * The objects will be stored aligned in the stack, making the storage cache friendly and very fast + * (as long as one can live with the stack restrictions). + * + * IMPORTANT: Does not call destructors on stored objects! Do not allocate resources in the objects! + * + * Usage: + * aligned_stack stack{pointer_to_memory, size_of_memory}; + * T* pointer = stack.push(some_object); // Copy-Constrict the object on top of stack + * stack.pop(); // Deconstruct the top object of type T + */ class aligned_stack { // Keep bounds of our memory block char* memory_start_; @@ -17,22 +30,15 @@ namespace pls { // Current head will always be aligned to cache lines char* head_; - - static std::uintptr_t next_alignment(std::uintptr_t size); - static char* next_alignment(char* pointer); public: typedef char* state; aligned_stack(): memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {}; - - aligned_stack(char* memory_region, const std::size_t size): - memory_start_{memory_region}, - memory_end_{memory_region + size}, - head_{next_alignment(memory_start_)} {} + aligned_stack(char* memory_region, std::size_t size); template T* push(const T& object) { - // Placement new into desired memory location + // Copy-Construct return new ((void*)push())T(object); } @@ -41,7 +47,7 @@ namespace pls { void* result = reinterpret_cast(head_); // Move head to next aligned position after new object - head_ = next_alignment(head_ + sizeof(T)); + head_ = base::alignment::next_alignment(head_ + sizeof(T)); if (head_ >= memory_end_) { PLS_ERROR("Tried to allocate object on alligned_stack without sufficient memory!"); } @@ -51,8 +57,7 @@ namespace pls { template T pop() { - head_ = head_ - next_alignment(sizeof(T)); - + head_ = head_ - base::alignment::next_alignment(sizeof(T)); return *reinterpret_cast(head_); } diff --git a/lib/pls/include/pls/internal/base/deque.h b/lib/pls/include/pls/internal/data_structures/deque.h similarity index 82% rename from lib/pls/include/pls/internal/base/deque.h rename to lib/pls/include/pls/internal/data_structures/deque.h index 5fbe0bb..8652cc3 100644 --- a/lib/pls/include/pls/internal/base/deque.h +++ b/lib/pls/include/pls/internal/data_structures/deque.h @@ -2,11 +2,14 @@ #ifndef PLS_DEQUE_H #define PLS_DEQUE_H -#include "spin_lock.h" +#include "pls/internal/base/spin_lock.h" namespace pls { 
namespace internal { - namespace base { + namespace data_structures { + /** + * Turns any object into deque item when inheriting from this. + */ class deque_item { friend class deque_internal; @@ -20,13 +23,19 @@ namespace pls { deque_item* head_; deque_item* tail_; - spin_lock lock_; + base::spin_lock lock_; deque_item* pop_head_internal(); deque_item* pop_tail_internal(); void push_tail_internal(deque_item *new_item); }; + /** + * A double linked list based deque. + * Storage is therefore only needed for the individual items. + * + * @tparam Item The type of items stored in this deque + */ template class deque: deque_internal { public: diff --git a/lib/pls/include/pls/internal/scheduling/fork_join_task.h b/lib/pls/include/pls/internal/scheduling/fork_join_task.h index 830772f..efcd395 100644 --- a/lib/pls/include/pls/internal/scheduling/fork_join_task.h +++ b/lib/pls/include/pls/internal/scheduling/fork_join_task.h @@ -4,8 +4,8 @@ #include "pls/internal/helpers/profiler.h" -#include "pls/internal/base/aligned_stack.h" -#include "pls/internal/base/deque.h" +#include "pls/internal/data_structures/aligned_stack.h" +#include "pls/internal/data_structures/deque.h" #include "abstract_task.h" #include "thread_state.h" @@ -14,7 +14,7 @@ namespace pls { namespace internal { namespace scheduling { class fork_join_task; - class fork_join_sub_task: public base::deque_item { + class fork_join_sub_task: public data_structures::deque_item { friend class fork_join_task; // Coordinate finishing of sub_tasks @@ -25,7 +25,7 @@ namespace pls { fork_join_task* tbb_task_; // Stack Management (reset stack pointer after wait_for_all() calls) - base::aligned_stack::state stack_state_; + data_structures::aligned_stack::state stack_state_; protected: explicit fork_join_sub_task(); fork_join_sub_task(const fork_join_sub_task& other); @@ -62,10 +62,10 @@ namespace pls { fork_join_sub_task* root_task_; fork_join_sub_task* currently_executing_; - base::aligned_stack* my_stack_; + data_structures::aligned_stack* my_stack_; // Double-Ended Queue management - base::deque deque_; + data_structures::deque deque_; // Steal Management fork_join_sub_task* last_stolen_; diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index 55e72b5..a9e2da5 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -7,7 +7,8 @@ #include "pls/internal/helpers/profiler.h" -#include "pls/internal/base/aligned_stack.h" +#include "pls/internal/data_structures/aligned_stack.h" + #include "pls/internal/base/thread.h" #include "pls/internal/base/barrier.h" diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h index c9d233d..25d898f 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h @@ -1,4 +1,4 @@ -#include "pls/internal/base/aligned_stack.h" +#include "pls/internal/data_structures/aligned_stack.h" #include "pls/internal/base/thread.h" #include "thread_state.h" @@ -14,47 +14,62 @@ namespace pls { class scheduler_memory { public: - virtual size_t max_threads() = 0; + virtual size_t max_threads() const = 0; virtual thread_state* thread_state_for(size_t id) = 0; virtual scheduler_thread* thread_for(size_t id) = 0; - virtual base::aligned_stack* task_stack_for(size_t id) = 0; + virtual data_structures::aligned_stack* task_stack_for(size_t id) = 0; }; template 
class static_scheduler_memory: public scheduler_memory { - std::array threads_; - std::array thread_states_; - std::array, MAX_THREADS> task_stacks_memory_; - std::array task_stacks_; + // Everyone of these types has to live on its own cache line, + // as each thread uses one of them independently. + // Therefore it would be a major performance hit if we shared cache lines on these. + using aligned_thread = base::alignment::aligned_wrapper; + using aligned_thread_state = base::alignment::aligned_wrapper; + using aligned_thread_stack = base::alignment::aligned_wrapper>; + using aligned_aligned_stack = base::alignment::aligned_wrapper; + + std::array threads_; + std::array thread_states_; + std::array task_stacks_memory_; + std::array task_stacks_; public: static_scheduler_memory() { for (size_t i = 0; i < MAX_THREADS; i++) { - task_stacks_[i] = base::aligned_stack(task_stacks_memory_[i].data(), TASK_STACK_SIZE); + new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i].pointer()->data(), TASK_STACK_SIZE); } } - size_t max_threads() override { return MAX_THREADS; } - thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } - scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } - base::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } + size_t max_threads() const override { return MAX_THREADS; } + thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); } + scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); } + data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); } }; class malloc_scheduler_memory: public scheduler_memory { - size_t num_threads_; + // Everyone of these types has to live on its own cache line, + // as each thread uses one of them independently. + // Therefore it would be a major performance hit if we shared cache lines on these. 
+ using aligned_thread = base::alignment::aligned_wrapper; + using aligned_thread_state = base::alignment::aligned_wrapper; + using aligned_aligned_stack = base::alignment::aligned_wrapper; + + const size_t num_threads_; - scheduler_thread* threads_; - thread_state* thread_states_; + aligned_thread* threads_; + aligned_thread_state * thread_states_; char** task_stacks_memory_; - base::aligned_stack* task_stacks_; + aligned_aligned_stack * task_stacks_; public: explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16); ~malloc_scheduler_memory(); - size_t max_threads() override { return num_threads_; } - thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; } - scheduler_thread* thread_for(size_t id) override { return &threads_[id]; } - base::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; } + size_t max_threads() const override { return num_threads_; } + thread_state* thread_state_for(size_t id) override { return thread_states_[id].pointer(); } + scheduler_thread* thread_for(size_t id) override { return threads_[id].pointer(); } + data_structures::aligned_stack* task_stack_for(size_t id) override { return task_stacks_[id].pointer(); } }; } } diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h index 2f9cda9..ee864db 100644 --- a/lib/pls/include/pls/internal/scheduling/thread_state.h +++ b/lib/pls/include/pls/internal/scheduling/thread_state.h @@ -4,10 +4,9 @@ #include +#include "pls/internal/data_structures/aligned_stack.h" #include "abstract_task.h" -#include "pls/internal/base/aligned_stack.h" - namespace pls { namespace internal { namespace scheduling { @@ -18,7 +17,7 @@ namespace pls { scheduler* scheduler_; abstract_task* root_task_; abstract_task* current_task_; - base::aligned_stack* task_stack_; + data_structures::aligned_stack* task_stack_; size_t id_; base::spin_lock lock_; std::minstd_rand random_; @@ -31,7 +30,7 @@ namespace pls { id_{0}, random_{id_} {}; - thread_state(scheduler* scheduler, base::aligned_stack* task_stack, unsigned int id): + thread_state(scheduler* scheduler, data_structures::aligned_stack* task_stack, unsigned int id): scheduler_{scheduler}, root_task_{nullptr}, current_task_{nullptr}, diff --git a/lib/pls/src/internal/base/aligned_stack.cpp b/lib/pls/src/internal/base/alignment.cpp similarity index 66% rename from lib/pls/src/internal/base/aligned_stack.cpp rename to lib/pls/src/internal/base/alignment.cpp index 4efe681..af95adb 100644 --- a/lib/pls/src/internal/base/aligned_stack.cpp +++ b/lib/pls/src/internal/base/alignment.cpp @@ -1,20 +1,26 @@ -#include "pls/internal/base/aligned_stack.h" +#include "pls/internal/base/alignment.h" #include "pls/internal/base/system_details.h" namespace pls { namespace internal { namespace base { - std::uintptr_t aligned_stack::next_alignment(std::uintptr_t size) { - std::uintptr_t miss_alignment = size % CACHE_LINE_SIZE; - if (miss_alignment == 0) { - return size; - } else { - return size + (CACHE_LINE_SIZE - miss_alignment); + namespace alignment { + void* allocate_aligned(size_t size) { + return aligned_alloc(system_details::CACHE_LINE_SIZE, size); + } + + std::uintptr_t next_alignment(std::uintptr_t size) { + std::uintptr_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE; + if (miss_alignment == 0) { + return size; + } else { + return size + (base::system_details::CACHE_LINE_SIZE - miss_alignment); + } } - } - char* 
aligned_stack::next_alignment(char* pointer) {
-                return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t>(pointer)));
+                char* next_alignment(char* pointer) {
+                    return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t>(pointer)));
+                }
             }
         }
     }
 }
diff --git a/lib/pls/src/internal/base/barrier.cpp b/lib/pls/src/internal/base/barrier.cpp
index 038e030..a2893be 100644
--- a/lib/pls/src/internal/base/barrier.cpp
+++ b/lib/pls/src/internal/base/barrier.cpp
@@ -3,7 +3,17 @@ namespace pls {
     namespace internal {
         namespace base {
+            barrier::barrier(const unsigned int count): barrier_{} {
+                pthread_barrier_init(&barrier_, nullptr, count);
+            }
+            barrier::~barrier() {
+                pthread_barrier_destroy(&barrier_);
+            }
+
+            void barrier::wait() {
+                pthread_barrier_wait(&barrier_);
+            }
         }
     }
 }
diff --git a/lib/pls/src/internal/base/thread.cpp b/lib/pls/src/internal/base/thread.cpp
index b2cd8d8..57ebd21 100644
--- a/lib/pls/src/internal/base/thread.cpp
+++ b/lib/pls/src/internal/base/thread.cpp
@@ -3,8 +3,13 @@ namespace pls {
     namespace internal {
         namespace base {
-            bool this_thread::local_storage_key_initialized_ = false;
-            pthread_key_t this_thread::local_storage_key_;
+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
+            pthread_key_t this_thread::local_storage_key_;
+            bool this_thread::local_storage_key_initialized_ = false;
+#endif
+#ifdef PLS_THREAD_SPECIFIC_COMPILER
+            __thread void* this_thread::local_state_;
+#endif
 
             // implementation in header (C++ templating)
         }
     }
diff --git a/lib/pls/src/internal/data_structures/aligned_stack.cpp b/lib/pls/src/internal/data_structures/aligned_stack.cpp
new file mode 100644
index 0000000..2a4d6d9
--- /dev/null
+++ b/lib/pls/src/internal/data_structures/aligned_stack.cpp
@@ -0,0 +1,13 @@
+#include "pls/internal/data_structures/aligned_stack.h"
+#include "pls/internal/base/system_details.h"
+
+namespace pls {
+    namespace internal {
+        namespace data_structures {
+            aligned_stack::aligned_stack(char* memory_region, const std::size_t size):
+                    memory_start_{memory_region},
+                    memory_end_{memory_region + size},
+                    head_{base::alignment::next_alignment(memory_start_)} {}
+        }
+    }
+}
diff --git a/lib/pls/src/internal/base/deque.cpp b/lib/pls/src/internal/data_structures/deque.cpp
similarity index 89%
rename from lib/pls/src/internal/base/deque.cpp
rename to lib/pls/src/internal/data_structures/deque.cpp
index 5370a91..786e04b 100644
--- a/lib/pls/src/internal/base/deque.cpp
+++ b/lib/pls/src/internal/data_structures/deque.cpp
@@ -1,12 +1,12 @@
 #include <mutex>
 
-#include "pls/internal/base/deque.h"
+#include "pls/internal/data_structures/deque.h"
 
 namespace pls {
     namespace internal {
-        namespace base {
+        namespace data_structures {
            deque_item* deque_internal::pop_head_internal() {
-                std::lock_guard<spin_lock> lock{lock_};
+                std::lock_guard<base::spin_lock> lock{lock_};
 
                 if (head_ == nullptr) {
                     return nullptr;
@@ -24,7 +24,7 @@ namespace pls {
             }
 
             deque_item* deque_internal::pop_tail_internal() {
-                std::lock_guard<spin_lock> lock{lock_};
+                std::lock_guard<base::spin_lock> lock{lock_};
 
                 if (tail_ == nullptr) {
                     return nullptr;
@@ -42,7 +42,7 @@ namespace pls {
             }
 
             void deque_internal::push_tail_internal(deque_item *new_item) {
-                std::lock_guard<spin_lock> lock{lock_};
+                std::lock_guard<base::spin_lock> lock{lock_};
 
                 if (tail_ != nullptr) {
                     tail_->prev_ = new_item;
diff --git a/lib/pls/src/internal/scheduling/fork_join_task.cpp b/lib/pls/src/internal/scheduling/fork_join_task.cpp
index 1f1360c..164f804 100644
--- a/lib/pls/src/internal/scheduling/fork_join_task.cpp
+++ b/lib/pls/src/internal/scheduling/fork_join_task.cpp
@@ -7,14 +7,14 @@ namespace pls {
     namespace internal {
         namespace scheduling {
fork_join_sub_task::fork_join_sub_task(): - base::deque_item{}, + data_structures::deque_item{}, ref_count_{0}, parent_{nullptr}, tbb_task_{nullptr}, stack_state_{nullptr} {} fork_join_sub_task::fork_join_sub_task(const fork_join_sub_task& other): - base::deque_item(other), + data_structures::deque_item(other), ref_count_{0}, parent_{nullptr}, tbb_task_{nullptr}, diff --git a/lib/pls/src/internal/scheduling/scheduler_memory.cpp b/lib/pls/src/internal/scheduling/scheduler_memory.cpp index 9018be9..8a65002 100644 --- a/lib/pls/src/internal/scheduling/scheduler_memory.cpp +++ b/lib/pls/src/internal/scheduling/scheduler_memory.cpp @@ -5,14 +5,14 @@ namespace pls { namespace scheduling { malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack): num_threads_{num_threads} { - threads_ = reinterpret_cast(malloc(num_threads * sizeof(scheduler_thread))); - thread_states_ = reinterpret_cast(malloc(num_threads * sizeof(thread_state))); + threads_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread))); + thread_states_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_thread_state))); - task_stacks_ = reinterpret_cast(malloc(num_threads * sizeof(base::aligned_stack))); - task_stacks_memory_ = reinterpret_cast(malloc(num_threads * sizeof(char*))); + task_stacks_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(aligned_aligned_stack))); + task_stacks_memory_ = reinterpret_cast(base::alignment::allocate_aligned(num_threads * sizeof(char*))); for (size_t i = 0; i < num_threads_; i++) { - task_stacks_memory_[i] = reinterpret_cast(malloc(memory_per_stack)); - task_stacks_[i] = base::aligned_stack(task_stacks_memory_[i], memory_per_stack); + task_stacks_memory_[i] = reinterpret_cast(base::alignment::allocate_aligned(memory_per_stack)); + new ((void*)task_stacks_[i].pointer()) data_structures::aligned_stack(task_stacks_memory_[i], memory_per_stack); } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index dbe5d58..8e7850d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ add_executable(tests main.cpp - base_tests.cpp scheduling_tests.cpp) + base_tests.cpp scheduling_tests.cpp data_structures_test.cpp) target_link_libraries(tests catch2 pls) diff --git a/test/base_tests.cpp b/test/base_tests.cpp index f1764d6..b22cfd4 100644 --- a/test/base_tests.cpp +++ b/test/base_tests.cpp @@ -1,12 +1,10 @@ #include #include #include -#include #include #include #include -#include using namespace pls::internal::base; using namespace std; @@ -15,7 +13,7 @@ static bool base_tests_visited; static int base_tests_local_value_one; static vector base_tests_local_value_two; -TEST_CASE( "thread creation and joining", "[internal/base/thread.h]") { +TEST_CASE( "thread creation and joining", "[internal/data_structures/thread.h]") { base_tests_visited = false; auto t1 = start_thread([]() { base_tests_visited = true; }); t1.join(); @@ -23,7 +21,7 @@ TEST_CASE( "thread creation and joining", "[internal/base/thread.h]") { REQUIRE(base_tests_visited); } -TEST_CASE( "thread state", "[internal/base/thread.h]") { +TEST_CASE( "thread state", "[internal/data_structures/thread.h]") { int state_one = 1; vector state_two{1, 2}; @@ -38,7 +36,7 @@ TEST_CASE( "thread state", "[internal/base/thread.h]") { int base_tests_shared_counter; -TEST_CASE( "spinlock protects concurrent counter", "[internal/base/spinlock.h]") { +TEST_CASE( "spinlock protects concurrent counter", 
"[internal/data_structures/spinlock.h]") { constexpr int num_iterations = 1000000; base_tests_shared_counter = 0; spin_lock lock{}; @@ -85,122 +83,3 @@ TEST_CASE( "spinlock protects concurrent counter", "[internal/base/spinlock.h]") REQUIRE(base_tests_shared_counter == 0); } } - -TEST_CASE( "aligned stack stores objects correctly", "[internal/base/aligned_stack.h]") { - constexpr long data_size = 1024; - char data[data_size]; - aligned_stack stack{data, data_size}; - - SECTION( "stack correctly pushes sub linesize objects" ) { - std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; - std::array small_data_two{}; - std::array small_data_three{'A'}; - - auto pointer_one = stack.push(small_data_one); - auto pointer_two = stack.push(small_data_two); - auto pointer_three = stack.push(small_data_three); - - REQUIRE(reinterpret_cast(pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_two) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_three) % CACHE_LINE_SIZE == 0); - } - - SECTION( "stack correctly pushes above linesize objects" ) { - std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; - std::array big_data_one{}; - - auto big_pointer_one = stack.push(big_data_one); - auto small_pointer_one = stack.push(small_data_one); - - REQUIRE(reinterpret_cast(big_pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(small_pointer_one) % CACHE_LINE_SIZE == 0); - } - - SECTION( "stack correctly stores and retrieves objects" ) { - std::array data_one{'a', 'b', 'c', 'd', 'e'}; - - stack.push(data_one); - auto retrieved_data = stack.pop>(); - - REQUIRE(retrieved_data == std::array{'a', 'b', 'c', 'd', 'e'}); - } - - SECTION( "stack can push and pop multiple times with correct alignment" ) { - std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; - std::array small_data_two{}; - std::array small_data_three{'A'}; - - auto pointer_one = stack.push(small_data_one); - auto pointer_two = stack.push(small_data_two); - auto pointer_three = stack.push(small_data_three); - stack.pop(); - stack.pop(); - auto pointer_four = stack.push(small_data_two); - auto pointer_five = stack.push(small_data_three); - - REQUIRE(reinterpret_cast(pointer_one) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_two) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_three) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_four) % CACHE_LINE_SIZE == 0); - REQUIRE(reinterpret_cast(pointer_five) % CACHE_LINE_SIZE == 0); - - REQUIRE(pointer_four == pointer_two); - REQUIRE(pointer_five == pointer_three); - } -} - -TEST_CASE( "deque stores objects correctly", "[internal/base/deque.h]") { - class my_item: public deque_item { - - }; - - deque deque; - my_item one, two, three; - - SECTION( "add and remove items form the tail" ) { - deque.push_tail(&one); - deque.push_tail(&two); - deque.push_tail(&three); - - REQUIRE(deque.pop_tail() == &three); - REQUIRE(deque.pop_tail() == &two); - REQUIRE(deque.pop_tail() == &one); - } - - SECTION( "handles getting empty by popping the tail correctly" ) { - deque.push_tail(&one); - REQUIRE(deque.pop_tail() == &one); - - deque.push_tail(&two); - REQUIRE(deque.pop_tail() == &two); - } - - SECTION( "remove items form the head" ) { - deque.push_tail(&one); - deque.push_tail(&two); - deque.push_tail(&three); - - REQUIRE(deque.pop_head() == &one); - REQUIRE(deque.pop_head() == &two); - REQUIRE(deque.pop_head() == &three); - } - - SECTION( "handles getting empty by popping the head correctly" ) { - deque.push_tail(&one); - REQUIRE(deque.pop_head() == 
&one); - - deque.push_tail(&two); - REQUIRE(deque.pop_head() == &two); - } - - SECTION( "handles getting empty by popping the head and tail correctly" ) { - deque.push_tail(&one); - REQUIRE(deque.pop_tail() == &one); - - deque.push_tail(&two); - REQUIRE(deque.pop_head() == &two); - - deque.push_tail(&three); - REQUIRE(deque.pop_tail() == &three); - } -} diff --git a/test/data_structures_test.cpp b/test/data_structures_test.cpp new file mode 100644 index 0000000..a878d72 --- /dev/null +++ b/test/data_structures_test.cpp @@ -0,0 +1,133 @@ +#include + +#include + +#include +#include + +#include +#include + +using namespace pls::internal::data_structures; +using namespace pls::internal::base; +using namespace std; + + +TEST_CASE( "aligned stack stores objects correctly", "[internal/data_structures/aligned_stack.h]") { + constexpr long data_size = 1024; + char data[data_size]; + aligned_stack stack{data, data_size}; + + SECTION( "stack correctly pushes sub linesize objects" ) { + std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; + std::array small_data_two{}; + std::array small_data_three{'A'}; + + auto pointer_one = stack.push(small_data_one); + auto pointer_two = stack.push(small_data_two); + auto pointer_three = stack.push(small_data_three); + + REQUIRE(reinterpret_cast(pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_two) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_three) % system_details::CACHE_LINE_SIZE == 0); + } + + SECTION( "stack correctly pushes above linesize objects" ) { + std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; + std::array big_data_one{}; + + auto big_pointer_one = stack.push(big_data_one); + auto small_pointer_one = stack.push(small_data_one); + + REQUIRE(reinterpret_cast(big_pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(small_pointer_one) % system_details::CACHE_LINE_SIZE == 0); + } + + SECTION( "stack correctly stores and retrieves objects" ) { + std::array data_one{'a', 'b', 'c', 'd', 'e'}; + + stack.push(data_one); + auto retrieved_data = stack.pop>(); + + REQUIRE(retrieved_data == std::array{'a', 'b', 'c', 'd', 'e'}); + } + + SECTION( "stack can push and pop multiple times with correct alignment" ) { + std::array small_data_one{'a', 'b', 'c', 'd', 'e'}; + std::array small_data_two{}; + std::array small_data_three{'A'}; + + auto pointer_one = stack.push(small_data_one); + auto pointer_two = stack.push(small_data_two); + auto pointer_three = stack.push(small_data_three); + stack.pop(); + stack.pop(); + auto pointer_four = stack.push(small_data_two); + auto pointer_five = stack.push(small_data_three); + + REQUIRE(reinterpret_cast(pointer_one) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_two) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_three) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_four) % system_details::CACHE_LINE_SIZE == 0); + REQUIRE(reinterpret_cast(pointer_five) % system_details::CACHE_LINE_SIZE == 0); + + REQUIRE(pointer_four == pointer_two); + REQUIRE(pointer_five == pointer_three); + } +} + +TEST_CASE( "deque stores objects correctly", "[internal/data_structures/deque.h]") { + class my_item: public deque_item { + + }; + + deque deque; + my_item one, two, three; + + SECTION( "add and remove items form the tail" ) { + deque.push_tail(&one); + deque.push_tail(&two); + deque.push_tail(&three); + + REQUIRE(deque.pop_tail() == &three); + 
REQUIRE(deque.pop_tail() == &two);
+        REQUIRE(deque.pop_tail() == &one);
+    }
+
+    SECTION( "handles getting empty by popping the tail correctly" ) {
+        deque.push_tail(&one);
+        REQUIRE(deque.pop_tail() == &one);
+
+        deque.push_tail(&two);
+        REQUIRE(deque.pop_tail() == &two);
+    }
+
+    SECTION( "remove items from the head" ) {
+        deque.push_tail(&one);
+        deque.push_tail(&two);
+        deque.push_tail(&three);
+
+        REQUIRE(deque.pop_head() == &one);
+        REQUIRE(deque.pop_head() == &two);
+        REQUIRE(deque.pop_head() == &three);
+    }
+
+    SECTION( "handles getting empty by popping the head correctly" ) {
+        deque.push_tail(&one);
+        REQUIRE(deque.pop_head() == &one);
+
+        deque.push_tail(&two);
+        REQUIRE(deque.pop_head() == &two);
+    }
+
+    SECTION( "handles getting empty by popping the head and tail correctly" ) {
+        deque.push_tail(&one);
+        REQUIRE(deque.pop_tail() == &one);
+
+        deque.push_tail(&two);
+        REQUIRE(deque.pop_head() == &two);
+
+        deque.push_tail(&three);
+        REQUIRE(deque.pop_tail() == &three);
+    }
+}