From 5c65459769ad3740ebfb0d7fe374cf3e68249b71 Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Wed, 10 Apr 2019 18:33:40 +0200 Subject: [PATCH] Allow to use __thread locals and pthread_locals. The __thread implementation of thread local, static variables seems more efficient, so we activate this by default. If it is (for some reason) not available/slower than the pthread version, one can toggle it for the specific system/processor later on using macros. --- CMakeLists.txt | 1 + NOTES.md | 13 +++++++++++++ PERFORMANCE.md | 10 +++++++++- app/invoke_parallel/main.cpp | 2 +- cmake/SetupDebugSymbols.cmake | 5 +++++ lib/pls/include/pls/internal/base/system_details.h | 10 ++++++++++ lib/pls/include/pls/internal/base/thread.h | 20 +++++++++++++++++++- lib/pls/src/internal/base/thread.cpp | 9 +++++++-- 8 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 cmake/SetupDebugSymbols.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index be1f6ac..941c725 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ include(cmake/SetupThreadingSupport.cmake) include(cmake/SetupThreadSanitizer.cmake) include(cmake/SetupAddressSanitizer.cmake) include(cmake/SetupEasyProfiler.cmake) +include(cmake/SetupDebugSymbols.cmake) # make our internal cmake script collection avaliable in the build process. list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/NOTES.md b/NOTES.md index a495bab..04b1705 100644 --- a/NOTES.md +++ b/NOTES.md @@ -4,6 +4,19 @@ A collection of stuff that we noticed during development. Useful later on two write a project report and to go back in time to find out why certain decisions where made. +## 09.04.2019 - Cache Alignment + +Aligning the cache needs all parts (both data types with correct alignment +and base memory with correct alignment). 
+ +Our first tests show that the initial alignment (Commit 3535cbd8), +boosted the performance in the fft_benchmark from our library to +Intel TBB's speedup when running on up to 4 threads. +When crossing the boundary to hyper-threading this falls off. +We therefore think that contention/cache misses are the reason for +bad performance above 4 threads, but have to investigate further to +pin down the issue. + ## 08.04.2019 - Random Numbers We decided to go for a simple linear random number generator diff --git a/PERFORMANCE.md b/PERFORMANCE.md index 2ec5a0d..2372c66 100644 --- a/PERFORMANCE.md +++ b/PERFORMANCE.md @@ -1,6 +1,8 @@ # Notes on performance measures during development -#### Commit 9c12addf +#### Commit 52fcb51f - Add basic random stealing + +Slight improvement, needs further measurement after removing more important bottlenecks. | | | | | | | | | | | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | @@ -19,3 +21,9 @@ change | 100.39 %| 99.14 %| 98.46 %| 107.74 %| 100.17 %| old | 1654.26 us| 969.12 us| 832.13 us| 680.69 us| 718.70 us| 750.80 us| 744.12 us| 775.24 us| 7125.07 us new | 1637.04 us| 978.09 us| 799.93 us| 709.33 us| 746.42 us| 684.87 us| 822.30 us| 787.61 us| 7165.59 us change | 98.96 %| 100.93 %| 96.13 %| 104.21 %| 103.86 %| 91.22 %| 110.51 %| 101.60 %| 100.57 % + +#### Commit 3535cbd8 - Cache Align scheduler_memory + +Big improvements of about 6% in our test. This seems like a little, +but 6% from the scheduler is a lot, as the 'main work' is the tasks +themselves, not the scheduler. 
diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp index 4ae48ef..a856b51 100644 --- a/app/invoke_parallel/main.cpp +++ b/app/invoke_parallel/main.cpp @@ -34,7 +34,7 @@ long fib(long n) { int main() { PROFILE_ENABLE - pls::scheduler scheduler{&my_scheduler_memory, 8}; + pls::scheduler scheduler{&my_scheduler_memory, 2}; long result; scheduler.perform_work([&] { diff --git a/cmake/SetupDebugSymbols.cmake b/cmake/SetupDebugSymbols.cmake new file mode 100644 index 0000000..6e17d41 --- /dev/null +++ b/cmake/SetupDebugSymbols.cmake @@ -0,0 +1,5 @@ +option(DEBUG_SYMBOLS "Enable debug symbols" OFF) +if(DEBUG_SYMBOLS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") +endif() +message("-- Debug Symbols: ${DEBUG_SYMBOLS}") diff --git a/lib/pls/include/pls/internal/base/system_details.h b/lib/pls/include/pls/internal/base/system_details.h index 4aa8965..28f7dff 100644 --- a/lib/pls/include/pls/internal/base/system_details.h +++ b/lib/pls/include/pls/internal/base/system_details.h @@ -14,7 +14,17 @@ namespace pls { * Currently sane default values for x86. */ namespace system_details { + /** + * Most processors have 64 byte cache lines + */ constexpr std::uintptr_t CACHE_LINE_SIZE = 64; + + /** + * Choose one of the following ways to store thread specific data. + * Try to choose the fastest available on this processor/system. 
+ */ +// #define PLS_THREAD_SPECIFIC_PTHREAD + #define PLS_THREAD_SPECIFIC_COMPILER } } } diff --git a/lib/pls/include/pls/internal/base/thread.h b/lib/pls/include/pls/internal/base/thread.h index 6b94043..fd0fe33 100644 --- a/lib/pls/include/pls/internal/base/thread.h +++ b/lib/pls/include/pls/internal/base/thread.h @@ -10,6 +10,8 @@ #include #include +#include "system_details.h" + namespace pls { namespace internal { namespace base { @@ -28,9 +30,13 @@ namespace pls { class this_thread { template friend class thread; +#ifdef PLS_THREAD_SPECIFIC_PTHREAD static pthread_key_t local_storage_key_; static bool local_storage_key_initialized_; - +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + static __thread void* local_state_; +#endif public: static void yield() { pthread_yield(); @@ -44,7 +50,12 @@ namespace pls { */ template static T* state() { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD return reinterpret_cast(pthread_getspecific(local_storage_key_)); +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + return reinterpret_cast(local_state_); +#endif } /** @@ -57,7 +68,12 @@ namespace pls { */ template static void set_state(T* state_pointer) { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD pthread_setspecific(this_thread::local_storage_key_, (void*)state_pointer); +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + local_state_ = state_pointer; +#endif } }; @@ -118,10 +134,12 @@ namespace pls { startup_flag_{nullptr}, pthread_thread_{} { +#ifdef PLS_THREAD_SPECIFIC_PTHREAD if (!this_thread::local_storage_key_initialized_) { pthread_key_create(&this_thread::local_storage_key_, nullptr); this_thread::local_storage_key_initialized_ = true; } +#endif // We only need this during startup, will be destroyed when out of scope std::atomic_flag startup_flag{ATOMIC_FLAG_INIT}; diff --git a/lib/pls/src/internal/base/thread.cpp b/lib/pls/src/internal/base/thread.cpp index b2cd8d8..57ebd21 100644 --- a/lib/pls/src/internal/base/thread.cpp +++ b/lib/pls/src/internal/base/thread.cpp @@ -3,8 +3,13 @@ namespace pls 
{ namespace internal { namespace base { - bool this_thread::local_storage_key_initialized_ = false; - pthread_key_t this_thread::local_storage_key_; +#ifdef PLS_THREAD_SPECIFIC_PTHREAD + bool this_thread::local_storage_key_initialized_ = false; /* gates the one-time pthread_key_create() call */ + pthread_key_t this_thread::local_storage_key_; /* opaque key: no explicit initializer, filled in by pthread_key_create() */ +#endif +#ifdef PLS_THREAD_SPECIFIC_COMPILER + __thread void* this_thread::local_state_; +#endif // implementation in header (C++ templating) } } -- libgit2 0.26.0