Allow using either __thread locals or pthread thread-specific data.

The __thread implementation of thread-local static variables seems more efficient, so we activate it by default. If it is (for some reason) not available or slower than the pthread version, one can switch the implementation for the specific system/processor later on using macros.

Allow using either __thread locals or pthread thread-specific data.
The __thread implementation of thread-local static variables seems more efficient, so we activate it by default. If it is (for some reason) not available or slower than the pthread version, one can switch the implementation for the specific system/processor later on using macros.
5c654597 · FritzFlorian · 3535cbd8 · 5c654597 · 5c654597 · 5c654597
Commit 5c654597 authored Apr 10, 2019 by FritzFlorian
8 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ include(cmake/SetupThreadingSupport.cmake)
 include(cmake/SetupThreadSanitizer.cmake)
 include(cmake/SetupAddressSanitizer.cmake)
 include(cmake/SetupEasyProfiler.cmake)
+include(cmake/SetupDebugSymbols.cmake)

 # make our internal cmake script collection available in the build process.
 list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")

--- a/NOTES.md
+++ b/NOTES.md
@@ -4,6 +4,19 @@ A collection of stuff that we noticed during development.
 Useful later on to write a project report and to go back
 in time to find out why certain decisions were made.

+## 09.02.2019 - Cache Alignment
+
+Cache alignment needs all parts to be aligned (both the data types with
+correct alignment and the base memory with correct alignment).
+
+Our first tests show that the initial alignment (Commit 3535cbd8)
+boosted the performance of our library in the fft_benchmark to
+Intel TBB's speedup when running on up to 4 threads.
+When crossing the boundary to hyper-threading this falls off.
+We therefore think that contention/cache misses are the reason for
+the bad performance above 4 threads, but have to investigate further to
+pin down the issue.
+
 ## 08.04.2019 - Random Numbers

 We decided to go for a simple linear random number generator

--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
 # Notes on performance measures during development

-#### Commit 9c12addf
+#### Commit 52fcb51f - Add basic random stealing
+
+Slight improvement, needs further measurement after removing more important bottlenecks.

 | | | | | | | | | | |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
@@ -19,3 +21,9 @@ change  |    100.39  %|     99.14  %|     98.46  %|    107.74  %|    100.17  %| 
 old     |    1654.26 us|    969.12 us|    832.13 us|    680.69 us|    718.70 us|    750.80 us|    744.12 us|    775.24 us|   7125.07 us
 new     |    1637.04 us|    978.09 us|    799.93 us|    709.33 us|    746.42 us|    684.87 us|    822.30 us|    787.61 us|   7165.59 us
 change  |     98.96  %|    100.93  %|     96.13  %|    104.21  %|    103.86  %|     91.22  %|    110.51  %|    101.60  %|    100.57  %
+
+#### Commit 3535cbd8  - Cache Align scheduler_memory
+
+Big improvement of about 6% in our test. This seems like little,
+but 6% from the scheduler is a lot, as the 'main work' is the tasks
+themselves, not the scheduler.
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -34,7 +34,7 @@ long fib(long n) {

 int main() {
    PROFILE_ENABLE
-    pls::scheduler scheduler{&my_scheduler_memory, 8};
+    pls::scheduler scheduler{&my_scheduler_memory, 2};

    long result;
    scheduler.perform_work([&] {

--- a/cmake/SetupDebugSymbols.cmake
+++ b/cmake/SetupDebugSymbols.cmake
+# Optional switch (default OFF): when enabled, appends -g to CMAKE_CXX_FLAGS.
+option(DEBUG_SYMBOLS "Enable debug symbols" OFF)
+if(DEBUG_SYMBOLS)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
+endif()
+# Print the chosen setting during configuration.
+message("-- Debug Symbols: ${DEBUG_SYMBOLS}")
--- a/lib/pls/include/pls/internal/base/system_details.h
+++ b/lib/pls/include/pls/internal/base/system_details.h
@@ -14,7 +14,17 @@ namespace pls {
             * Currently sane default values for x86.
             */
            namespace system_details {
+                /**
+                 * Most processors have 64 byte cache lines
+                 */
                constexpr std::uintptr_t CACHE_LINE_SIZE = 64;
+
+                /**
+                 * Choose one of the following ways to store thread specific data.
+                 * Try to choose the fastest available on this processor/system.
+                 */
+//                #define PLS_THREAD_SPECIFIC_PTHREAD
+                #define PLS_THREAD_SPECIFIC_COMPILER
            }
        }
    }

--- a/lib/pls/include/pls/internal/base/thread.h
+++ b/lib/pls/include/pls/internal/base/thread.h
@@ -10,6 +10,8 @@
 #include <pthread.h>
 #include <atomic>

+#include "system_details.h"
+
 namespace pls {
    namespace internal {
        namespace base {
@@ -28,9 +30,13 @@ namespace pls {
            class this_thread {
                template<typename Function, typename State>
                friend class thread;
+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
                static pthread_key_t local_storage_key_;
                static bool local_storage_key_initialized_;
-
+#endif
+#ifdef PLS_THREAD_SPECIFIC_COMPILER
+                static __thread void* local_state_;
+#endif
            public:
                static void yield() {
                    pthread_yield();
@@ -44,7 +50,12 @@ namespace pls {
                 */
                template<typename T>
                static T* state() {
+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
                    return reinterpret_cast<T*>(pthread_getspecific(local_storage_key_));
+#endif
+#ifdef PLS_THREAD_SPECIFIC_COMPILER
+                    return reinterpret_cast<T*>(local_state_);
+#endif
                }

                /**
@@ -57,7 +68,12 @@ namespace pls {
                 */
                template<typename T>
                static void set_state(T* state_pointer) {
+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
                    pthread_setspecific(this_thread::local_storage_key_, (void*)state_pointer);
+#endif
+#ifdef PLS_THREAD_SPECIFIC_COMPILER
+                    local_state_ = state_pointer;
+#endif
                }
            };

@@ -118,10 +134,12 @@ namespace pls {
                    startup_flag_{nullptr},
                    pthread_thread_{} {

+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
                    if (!this_thread::local_storage_key_initialized_) {
                        pthread_key_create(&this_thread::local_storage_key_, nullptr);
                        this_thread::local_storage_key_initialized_ = true;
                    }
+#endif

                    // We only need this during startup, will be destroyed when out of scope
                    std::atomic_flag startup_flag{ATOMIC_FLAG_INIT};

--- a/lib/pls/src/internal/base/thread.cpp
+++ b/lib/pls/src/internal/base/thread.cpp
@@ -3,8 +3,13 @@
 namespace pls {
    namespace internal {
        namespace base {
-            bool this_thread::local_storage_key_initialized_ = false;
-            pthread_key_t this_thread::local_storage_key_;
+#ifdef PLS_THREAD_SPECIFIC_PTHREAD
+            // Definitions for the pthread key variant. The flag starts out false;
+            // the thread constructor creates the key when it finds the flag unset.
+            // pthread_key_t is an opaque type and must not get a bool initializer.
+            bool this_thread::local_storage_key_initialized_ = false;
+            pthread_key_t this_thread::local_storage_key_;
+#endif
+#ifdef PLS_THREAD_SPECIFIC_COMPILER
+            // Definition for the compiler (__thread) variant of the thread state.
+            __thread void* this_thread::local_state_;
+#endif
            // implementation in header (C++ templating)
        }
    }