Commit 4a9ca21d by FritzFlorian

Rework threads to not be template based.

This makes the threads easier to handle and brings their interface closer to std::thread.
parent 65409e0a
Pipeline #1316 failed with stages in 39 seconds
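For orientation, here is a minimal usage sketch of the reworked interface, assuming the project's include path and the pls::internal::base namespace shown in the diffs below; the construction syntax mirrors the updated tests at the end of this commit.

#include "pls/internal/base/thread.h"  // assumed header path for this sketch

using pls::internal::base::thread;
using pls::internal::base::this_thread;

int main() {
  int state = 42;
  // Construct directly, as with std::thread; the start_thread() factory is gone.
  thread worker{[]() {
    // The state pointer handed to the constructor is reachable via this_thread.
    int *my_state = this_thread::state<int>();
    (void) my_state;
  }, &state};
  worker.join();  // Wait for the worker to finish, as with std::thread.
}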
# Optionally compile with thread sanitizer enabled to find concurrency bugs
# Optionally compile with thread sanitizer enabled to find memory bugs
# https://github.com/google/sanitizers/wiki/ThreadSanitizerCppManual
# Add optional sanitizer, off by default
......
......@@ -9,7 +9,7 @@
#include <functional>
#include <pthread.h>
#include <atomic>
#include <time.h>
#include <ctime>
#include "system_details.h"
......@@ -30,7 +30,6 @@ using thread_entrypoint = void();
* Current implementation is based on pthreads.
*/
class this_thread {
template<typename Function, typename State>
friend
class thread;
#ifdef PLS_THREAD_SPECIFIC_PTHREAD
......@@ -73,40 +72,28 @@ class this_thread {
/**
* Abstraction for starting a function in a separate thread.
*
* @tparam Function Lambda being started on the new thread.
* @tparam State State type held for this thread.
*
* usage:
* T* state;
* auto thread = start_thread([] {
* // Run on new thread
* }, state);
* thread.join(); // Wait for it to finish
* Offers only the threading functionality needed in this project;
* the underlying implementation can be changed.
* Uses NO heap memory allocation.
*
* PORTABILITY:
* Current implementation is based on pthreads.
*/
template<typename Function, typename State>
class thread {
friend class this_thread;
// Keep a copy of the function (lambda) in this object to make sure it is valid when called!
Function function_;
State *state_pointer_;
// We need to wait for the started function to read
// the function_ and state_pointer_ properties before returning
// from the constructor, as the object might be moved after this.
std::atomic_flag *startup_flag_;
// Keep handle to native implementation
pthread_t pthread_thread_;
template<typename Function, typename State>
static void *start_pthread_internal(void *thread_pointer);
public:
template<typename Function, typename State>
explicit thread(const Function &function, State *state_pointer);
template<typename Function>
explicit thread(const Function &function);
public:
void join();
......@@ -118,11 +105,6 @@ class thread {
thread &operator=(const thread &) = delete;
};
template<typename Function, typename State>
thread<Function, State> start_thread(const Function &function, State *state_pointer);
template<typename Function>
thread<Function, void> start_thread(const Function &function);
}
}
}
......
......@@ -27,28 +27,32 @@ void this_thread::set_state(T *state_pointer) {
}
template<typename Function, typename State>
void *thread<Function, State>::start_pthread_internal(void *thread_pointer) {
auto my_thread = reinterpret_cast<thread *>(thread_pointer);
Function my_function_copy = my_thread->function_;
State *my_state_pointer_copy = my_thread->state_pointer_;
struct thread_arguments {
Function function_;
State *state_;
std::atomic_flag *startup_flag_;
};
template<typename Function, typename State>
void *thread::start_pthread_internal(void *thread_pointer) {
// Actively copy all arguments into stack memory.
thread_arguments<Function, State>
arguments_copy = *reinterpret_cast<thread_arguments<Function, State> *>(thread_pointer);
// Now we have copies of everything we need on the stack.
// The original thread object can be moved freely (no more
// references to its memory location).
my_thread->startup_flag_->clear();
arguments_copy.startup_flag_->clear();
this_thread::set_state(my_state_pointer_copy);
my_function_copy();
this_thread::set_state(arguments_copy.state_);
arguments_copy.function_();
// Finished executing the user function
pthread_exit(nullptr);
}
template<typename Function, typename State>
thread<Function, State>::thread(const Function &function, State *state_pointer):
function_{function},
state_pointer_{state_pointer},
startup_flag_{nullptr},
thread::thread(const Function &function, State *state_pointer):
pthread_thread_{} {
#ifdef PLS_THREAD_SPECIFIC_PTHREAD
......@@ -58,29 +62,20 @@ thread<Function, State>::thread(const Function &function, State *state_pointer):
}
#endif
// We only need this during startup; it will be destroyed when it goes out of scope.
// We need to wait for the started thread to read
// the function and state pointer before returning
// from the constructor, as the arguments might go out of scope after this.
std::atomic_flag startup_flag{ATOMIC_FLAG_INIT};
startup_flag_ = &startup_flag;
thread_arguments<Function, State> arguments{function, state_pointer, &startup_flag};
startup_flag.test_and_set(); // Set the flag, pthread will clear it when it is safe to return
pthread_create(&pthread_thread_, nullptr, start_pthread_internal, (void *) (this));
pthread_create(&pthread_thread_, nullptr, start_pthread_internal<Function, State>, (void *) (&arguments));
while (startup_flag.test_and_set()); // Busy-wait for the startup flag to clear
}
template<typename Function, typename State>
void thread<Function, State>::join() {
pthread_join(pthread_thread_, nullptr);
}
template<typename Function, typename State>
thread<Function, State> start_thread(const Function &function, State *state_pointer) {
return thread<Function, State>(function, state_pointer);
}
template<typename Function>
thread<Function, void> start_thread(const Function &function) {
return thread<Function, void>(function, nullptr);
}
thread::thread(const Function &function): thread{function, (void *) nullptr} {}
}
}
......
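The constructor above copies the lambda and the state pointer into a stack-local thread_arguments struct, starts the pthread, and busy-waits on an atomic_flag until the new thread has copied those arguments onto its own stack; this is what keeps the thread object free of heap allocation and safe to move afterwards. Below is a standalone sketch of the same handshake with plain pthreads; the names launch, launch_args and trampoline are made up for this illustration.

#include <pthread.h>
#include <atomic>
#include <cstdio>

struct launch_args {
  void (*function)();              // what to run on the new thread
  std::atomic_flag *startup_flag;  // cleared once the arguments have been copied
};

void *trampoline(void *raw_args) {
  // Copy everything we need onto the new thread's stack first ...
  launch_args args_copy = *static_cast<launch_args *>(raw_args);
  // ... then tell the creator that its stack-local arguments may go out of scope.
  args_copy.startup_flag->clear();
  args_copy.function();
  return nullptr;
}

pthread_t launch(void (*function)()) {
  std::atomic_flag startup_flag = ATOMIC_FLAG_INIT;
  launch_args args{function, &startup_flag};

  startup_flag.test_and_set();  // Set; the new thread clears it once it is done copying.
  pthread_t handle;
  pthread_create(&handle, nullptr, trampoline, &args);
  while (startup_flag.test_and_set()) {
    // Busy-wait until the new thread has copied `args` off this stack frame.
  }
  return handle;
}

int main() {
  pthread_t t = launch([] { std::puts("hello from the new thread"); });
  pthread_join(t, nullptr);
}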
......@@ -20,8 +20,6 @@ namespace pls {
namespace internal {
namespace scheduling {
using scheduler_thread = base::thread<decltype(&worker_routine), thread_state>;
/**
* The scheduler is the central part of the dispatching-framework.
* It manages a pool of worker threads (creates, sleeps/wakes up, destroys)
......
......@@ -11,19 +11,18 @@ namespace internal {
namespace scheduling {
void worker_routine();
using scheduler_thread = base::thread<decltype(&worker_routine), thread_state>;
class scheduler_memory {
private:
size_t max_threads_;
thread_state **thread_states_;
scheduler_thread **threads_;
base::thread **threads_;
data_structures::aligned_stack **task_stacks_;
protected:
void init(size_t max_threads,
thread_state **thread_states,
scheduler_thread **threads,
base::thread **threads,
data_structures::aligned_stack **task_stacks) {
max_threads_ = max_threads;
thread_states_ = thread_states;
......@@ -38,7 +37,7 @@ class scheduler_memory {
thread_state *thread_state_for(size_t id) const {
return thread_states_[id];
}
scheduler_thread *thread_for(size_t id) const {
base::thread *thread_for(size_t id) const {
return threads_[id];
}
data_structures::aligned_stack *task_stack_for(size_t id) const {
......@@ -51,7 +50,7 @@ class static_scheduler_memory : public scheduler_memory {
// Every one of these types has to live on its own cache line,
// as each thread uses one of them independently.
// Sharing cache lines between them would therefore cause false sharing and a major performance hit.
using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread = base::alignment::aligned_wrapper<base::thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_thread_stack = base::alignment::aligned_wrapper<std::array<char, TASK_STACK_SIZE>>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
......@@ -63,7 +62,7 @@ class static_scheduler_memory : public scheduler_memory {
std::array<aligned_aligned_stack, MAX_THREADS> task_stacks_;
// References for parent
std::array<scheduler_thread *, MAX_THREADS> thread_refs_;
std::array<base::thread *, MAX_THREADS> thread_refs_;
std::array<thread_state *, MAX_THREADS> thread_state_refs_;
std::array<data_structures::aligned_stack *, MAX_THREADS> task_stack_refs_;
......@@ -86,7 +85,7 @@ class malloc_scheduler_memory : public scheduler_memory {
// Every one of these types has to live on its own cache line,
// as each thread uses one of them independently.
// Sharing cache lines between them would therefore cause false sharing and a major performance hit.
using aligned_thread = base::alignment::aligned_wrapper<scheduler_thread>;
using aligned_thread = base::alignment::aligned_wrapper<base::thread>;
using aligned_thread_state = base::alignment::aligned_wrapper<thread_state>;
using aligned_aligned_stack = base::alignment::aligned_wrapper<data_structures::aligned_stack>;
......@@ -99,7 +98,7 @@ class malloc_scheduler_memory : public scheduler_memory {
aligned_aligned_stack *task_stacks_;
// References for parent
scheduler_thread **thread_refs_;
base::thread **thread_refs_;
thread_state **thread_state_refs_;
data_structures::aligned_stack **task_stack_refs_;
......
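The repeated comment about cache lines refers to false sharing: if two threads' state objects sat on the same cache line, every write by one thread would invalidate the line for the other. Below is a minimal sketch of a cache-line-aligned wrapper in the spirit of base::alignment::aligned_wrapper; the 64-byte constant and the name cache_aligned are assumptions for this example (the project uses base::system_details::CACHE_LINE_SIZE).

#include <cstddef>

constexpr std::size_t kCacheLineSize = 64;  // assumed value for this sketch

template<typename T>
struct alignas(kCacheLineSize) cache_aligned {
  T value;
  T *pointer() { return &value; }  // hand out a reference to the wrapped per-thread object
};

// Each wrapped object is padded up to whole cache lines, so two adjacent
// array elements can never share a line (no false sharing between threads).
static_assert(sizeof(cache_aligned<int>) % kCacheLineSize == 0,
              "wrapped objects occupy whole cache lines");

cache_aligned<int> per_thread_counters[2];  // e.g. one counter per worker thread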
......@@ -25,14 +25,6 @@ struct thread_state {
alignas(base::system_details::CACHE_LINE_SIZE) size_t id_;
alignas(base::system_details::CACHE_LINE_SIZE) std::minstd_rand random_;
thread_state() :
scheduler_{nullptr},
current_task_{nullptr},
task_stack_{nullptr},
deque_{task_stack_},
id_{0},
random_{id_} {};
thread_state(scheduler *scheduler, data_structures::aligned_stack *task_stack, unsigned int id) :
scheduler_{scheduler},
current_task_{nullptr},
......
......@@ -11,7 +11,10 @@ bool this_thread::local_storage_key_initialized_;
#ifdef PLS_THREAD_SPECIFIC_COMPILER
__thread void *this_thread::local_state_;
#endif
// implementation in header (C++ templating)
void thread::join() {
pthread_join(pthread_thread_, nullptr);
}
}
}
......
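The two backends visible here store the per-thread state pointer either behind a pthread key (PLS_THREAD_SPECIFIC_PTHREAD) or in a compiler-provided thread-local variable (PLS_THREAD_SPECIFIC_COMPILER, the __thread member above). A minimal sketch of both strategies outside the project's this_thread class; the function names are illustrative only.

#include <pthread.h>

// Backend 1: pthread-specific storage. The key is created once per process;
// each thread then reads and writes its own slot behind that key.
static pthread_key_t state_key;
static pthread_once_t state_key_once = PTHREAD_ONCE_INIT;

static void create_state_key() { pthread_key_create(&state_key, nullptr); }

void set_state_pthread(void *state) {
  pthread_once(&state_key_once, create_state_key);
  pthread_setspecific(state_key, state);
}

void *get_state_pthread() { return pthread_getspecific(state_key); }

// Backend 2: compiler thread-local storage. Every thread sees its own copy of
// the variable; no key management is required, but __thread is a GCC/Clang extension.
static __thread void *local_state = nullptr;

void set_state_compiler(void *state) { local_state = state; }
void *get_state_compiler() { return local_state; }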
......@@ -26,8 +26,7 @@ scheduler::scheduler(scheduler_memory *memory, const unsigned int num_threads, b
if (reuse_thread && i == 0) {
continue; // Skip over the first/main thread when re-using the user's thread, as this one will replace the first one.
}
new((void *) memory_->thread_for(i))base::thread<void (*)(), thread_state>(&scheduler::worker_routine,
memory_->thread_state_for(i));
new((void *) memory_->thread_for(i))base::thread(&scheduler::worker_routine, memory_->thread_state_for(i));
}
}
......
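The scheduler constructs its base::thread objects directly inside the memory handed out by scheduler_memory, so no heap allocation happens at thread start-up. Placement new has no matching delete, so such objects need an explicit destructor call when they are torn down. A short sketch of the pattern with a stand-in type; the names worker and storage are illustrative only.

#include <new>
#include <cstdio>

struct worker {
  explicit worker(int id) : id_{id} { std::printf("worker %d constructed\n", id_); }
  ~worker() { std::printf("worker %d destroyed\n", id_); }
  int id_;
};

int main() {
  // Pre-allocated, suitably aligned storage (the role scheduler_memory plays).
  alignas(worker) unsigned char storage[sizeof(worker)];

  // Construct the object in place; nothing is allocated on the heap here.
  worker *w = new (static_cast<void *>(storage)) worker{1};

  // Placement new has no matching delete: call the destructor explicitly.
  w->~worker();
}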
......@@ -15,7 +15,7 @@ malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const
num_threads * sizeof(aligned_aligned_stack)));
task_stacks_memory_ = reinterpret_cast<char **>(base::alignment::allocate_aligned(num_threads * sizeof(char *)));
thread_refs_ = static_cast<scheduler_thread **>(malloc(num_threads * sizeof(scheduler_thread *)));
thread_refs_ = static_cast<base::thread **>(malloc(num_threads * sizeof(base::thread *)));
thread_state_refs_ = static_cast<thread_state **>(malloc(num_threads * sizeof(thread_state *)));
task_stack_refs_ =
static_cast<data_structures::aligned_stack **>(malloc(num_threads * sizeof(data_structures::aligned_stack *)));
......
add_executable(tests
main.cpp
data_structures_test.cpp
base_tests.cpp
scheduling_tests.cpp
algorithm_test.cpp
dataflow_test.cpp)
......
......@@ -15,7 +15,7 @@ static vector<int> base_tests_local_value_two;
TEST_CASE("thread creation and joining", "[internal/data_structures/thread.h]") {
base_tests_visited = false;
auto t1 = start_thread([]() { base_tests_visited = true; });
thread t1{[]() { base_tests_visited = true; }};
t1.join();
REQUIRE(base_tests_visited);
......@@ -25,8 +25,8 @@ TEST_CASE("thread state", "[internal/data_structures/thread.h]") {
int state_one = 1;
vector<int> state_two{1, 2};
auto t1 = start_thread([]() { base_tests_local_value_one = *this_thread::state<int>(); }, &state_one);
auto t2 = start_thread([]() { base_tests_local_value_two = *this_thread::state<vector<int>>(); }, &state_two);
thread t1{[]() { base_tests_local_value_one = *this_thread::state<int>(); }, &state_one};
thread t2{[]() { base_tests_local_value_two = *this_thread::state<vector<int>>(); }, &state_two};
t1.join();
t2.join();
......@@ -42,20 +42,20 @@ TEST_CASE("spinlock protects concurrent counter", "[internal/data_structures/spi
spin_lock lock{};
SECTION("lock can be used by itself") {
auto t1 = start_thread([&]() {
thread t1{[&]() {
for (int i = 0; i < num_iterations; i++) {
lock.lock();
base_tests_shared_counter++;
lock.unlock();
}
});
auto t2 = start_thread([&]() {
}};
thread t2{[&]() {
for (int i = 0; i < num_iterations; i++) {
lock.lock();
base_tests_shared_counter--;
lock.unlock();
}
});
}};
t1.join();
t2.join();
......@@ -64,18 +64,18 @@ TEST_CASE("spinlock protects concurrent counter", "[internal/data_structures/spi
}
SECTION("lock can be used with std::lock_guard") {
auto t1 = start_thread([&]() {
thread t1{[&]() {
for (int i = 0; i < num_iterations; i++) {
std::lock_guard<spin_lock> my_lock{lock};
base_tests_shared_counter++;
}
});
auto t2 = start_thread([&]() {
}};
thread t2{[&]() {
for (int i = 0; i < num_iterations; i++) {
std::lock_guard<spin_lock> my_lock{lock};
base_tests_shared_counter--;
}
});
}};
t1.join();
t2.join();
......