First working version of pure fork-join based scheduler.

10ca31dc · FritzFlorian · 374153ce · 10ca31dc · 10ca31dc · 10ca31dc
Commit 10ca31dc authored Jun 05, 2019 by FritzFlorian
9 changed files
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -19,15 +19,14 @@ int count_child_nodes(uts::node &node) {
    return child_count;
  }

-  auto current_task = pls::task::current();
  std::vector<int> results(children.size());
  for (size_t i = 0; i < children.size(); i++) {
    size_t index = i;
    auto lambda = [&, index] { results[index] = count_child_nodes(children[index]); };
    pls::lambda_task_by_value<typeof(lambda)> sub_task(lambda);
-    current_task->spawn_child(sub_task);
+    pls::scheduler::spawn_child(sub_task);
  }
-  current_task->wait_for_all();
+  pls::scheduler::wait_for_all();
  for (auto result : results) {
    child_count += result;
  }
@@ -36,43 +35,41 @@ int count_child_nodes(uts::node &node) {
 }

 int unbalanced_tree_search(int seed, int root_children, double q, int normal_children) {
-  static auto id = pls::unique_id::create(42);
  int result;

  auto lambda = [&] {
    uts::node root(seed, root_children, q, normal_children);
    result = count_child_nodes(root);
  };
-  pls::lambda_task_by_reference<typeof(lambda)> task(lambda);
  pls::lambda_task_by_reference<typeof(lambda)> sub_task(lambda);
-  pls::task root_task{&sub_task, id};
-  pls::scheduler::execute_task(root_task);
+  pls::scheduler::spawn_child(sub_task);
+  pls::scheduler::wait_for_all();

  return result;
 }
-//
-//int main() {
-//  PROFILE_ENABLE
-//  pls::internal::helpers::run_mini_benchmark([&] {
-//    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-//  }, 8, 4000);
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}

 int main() {
  PROFILE_ENABLE
-  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
-  pls::scheduler scheduler{&my_scheduler_memory, 8};
-
-  scheduler.perform_work([&] {
-    PROFILE_MAIN_THREAD
-    for (int i = 0; i < 50; i++) {
-      PROFILE_WORK_BLOCK("Top Level")
-      int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-      std::cout << result << std::endl;
-    }
-  });
+  pls::internal::helpers::run_mini_benchmark([&] {
+    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+  }, 8, 2000);

  PROFILE_SAVE("test_profile.prof")
 }
+
+//int main() {
+//  PROFILE_ENABLE
+//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
+//  pls::scheduler scheduler{&my_scheduler_memory, 8};
+//
+//  scheduler.perform_work([&] {
+//    PROFILE_MAIN_THREAD
+//    for (int i = 0; i < 50; i++) {
+//      PROFILE_WORK_BLOCK("Top Level")
+//      int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+//      std::cout << result << std::endl;
+//    }
+//  });
+//
+//  PROFILE_SAVE("test_profile.prof")
+//}
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -91,7 +91,7 @@ int main() {
    PROFILE_MAIN_THREAD
    // Call looks just the same, only requirement is
    // the enclosure in the perform_work lambda.
-    for (int i = 0; i < 1000; i++) {
+    for (int i = 0; i < 10; i++) {
      PROFILE_WORK_BLOCK("Top Level FFT")
      complex_vector input = initial_input;
      fft(input.begin(), input.size());

--- a/lib/pls/include/pls/internal/scheduling/lambda_task.h
+++ b/lib/pls/include/pls/internal/scheduling/lambda_task.h
+
+#ifndef PLS_LAMBDA_TASK_H_
+#define PLS_LAMBDA_TASK_H_
+
+#include "pls/internal/scheduling/task.h"
+
+namespace pls {
+namespace internal {
+namespace scheduling {
+
+template<typename Function>
+class lambda_task_by_reference : public task {
+  const Function &function_;
+
+ public:
+  explicit lambda_task_by_reference(const Function &function) : task{}, function_{function} {};
+
+ protected:
+  void execute_internal() override {
+    function_();
+  }
+};
+
+template<typename Function>
+class lambda_task_by_value : public task {
+  const Function function_;
+
+ public:
+  explicit lambda_task_by_value(const Function &function) : task{}, function_{function} {};
+
+ protected:
+  void execute_internal() override {
+    function_();
+  }
+};
+
+}
+}
+}
+
+#endif //PLS_LAMBDA_TASK_H_
--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -22,12 +22,23 @@ namespace scheduling {

 using scheduler_thread = base::thread<decltype(&worker_routine), thread_state>;

+/**
+ * The scheduler is the central part of the dispatching-framework.
+ * It manages a pool of worker threads (creates, sleeps/wakes up, destroys)
+ * and allows to execute parallel sections.
+ *
+ * It works in close rellation with the 'task' class for scheduling.
+ */
 class scheduler {
  friend class task;
  const unsigned int num_threads_;
  scheduler_memory *memory_;

  base::barrier sync_barrier_;
+
+  task *main_thread_root_task_;
+  bool work_section_done_;
+
  bool terminated_;
 public:
  /**
@@ -85,6 +96,9 @@ class scheduler {

  task *get_local_task();
  task *steal_task();
+
+  bool try_execute_local();
+  bool try_execute_stolen();
 };

 }

--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -2,35 +2,30 @@
 #ifndef PLS_SCHEDULER_IMPL_H
 #define PLS_SCHEDULER_IMPL_H

+#include "pls/internal/scheduling/lambda_task.h"
+
 namespace pls {
 namespace internal {
 namespace scheduling {

+// TODO: generally look into the performance implications of using many thread_state::get() calls
+
 template<typename Function>
 void scheduler::perform_work(Function work_section) {
  PROFILE_WORK_BLOCK("scheduler::perform_work")
-//  root_task<Function> master{work_section};
-//
-//  // Push root task on stacks
-//  auto new_master = memory_->task_stack_for(0)->push(master);
-//  memory_->thread_state_for(0)->root_task_ = new_master;
-//  memory_->thread_state_for(0)->current_task_ = new_master;
-//  for (unsigned int i = 1; i < num_threads_; i++) {
-//    root_worker_task<Function> worker{new_master};
-//    auto new_worker = memory_->task_stack_for(0)->push(worker);
-//    memory_->thread_state_for(i)->root_task_ = new_worker;
-//    memory_->thread_state_for(i)->current_task_ = new_worker;
-//  }
-//
-//  // Perform and wait for work
-//  sync_barrier_.wait(); // Trigger threads to wake up
-//  sync_barrier_.wait(); // Wait for threads to finish
+
+//  if (execute_main_thread) {
+//    work_section();
 //
-//  // Clean up stack
-//  memory_->task_stack_for(0)->pop<typeof(master)>();
-//  for (unsigned int i = 1; i < num_threads_; i++) {
-//    root_worker_task<Function> worker{new_master};
-//    memory_->task_stack_for(0)->pop<typeof(worker)>();
+//    sync_barrier_.wait(); // Trigger threads to wake up
+//    sync_barrier_.wait(); // Wait for threads to finish
+//  } else {
+  lambda_task_by_reference<Function> root_task{work_section};
+  main_thread_root_task_ = &root_task;
+  work_section_done_ = false;
+
+  sync_barrier_.wait(); // Trigger threads to wake up
+  sync_barrier_.wait(); // Wait for threads to finish
 //  }
 }

@@ -39,12 +34,6 @@ void scheduler::spawn_child(T &sub_task) {
  thread_state::get()->current_task_->spawn_child(sub_task);
 }

-void scheduler::wait_for_all() {
-  thread_state::get()->current_task_->wait_for_all();
-}
-
-thread_state *scheduler::thread_state_for(size_t id) { return memory_->thread_state_for(id); }
-
 }
 }
 }

--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -39,14 +39,12 @@ class task {

 private:
  void execute();
-  bool try_execute_local();
-  bool try_execute_stolen();
 };

 template<typename T>
 void task::spawn_child(T &sub_task) {
  PROFILE_FORK_JOIN_STEALING("spawn_child")
-  static_assert(std::is_base_of<T, task>::value, "Only pass task subclasses!");
+  static_assert(std::is_base_of<task, T>::value, "Only pass task subclasses!");

  // Keep our refcount up to date
  ref_count_++;

--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -19,7 +19,6 @@ class task;

 struct thread_state {
  alignas(base::system_details::CACHE_LINE_SIZE) scheduler *scheduler_;
-  alignas(base::system_details::CACHE_LINE_SIZE) task *root_task_;
  alignas(base::system_details::CACHE_LINE_SIZE) task *current_task_;
  alignas(base::system_details::CACHE_LINE_SIZE) data_structures::aligned_stack *task_stack_;
  alignas(base::system_details::CACHE_LINE_SIZE) data_structures::work_stealing_deque<task> deque_;
@@ -28,7 +27,6 @@ struct thread_state {

  thread_state() :
      scheduler_{nullptr},
-      root_task_{nullptr},
      current_task_{nullptr},
      task_stack_{nullptr},
      deque_{task_stack_},
@@ -37,7 +35,6 @@ struct thread_state {

  thread_state(scheduler *scheduler, data_structures::aligned_stack *task_stack, unsigned int id) :
      scheduler_{scheduler},
-      root_task_{nullptr},
      current_task_{nullptr},
      task_stack_{task_stack},
      deque_{task_stack_},

--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -30,19 +30,37 @@ scheduler::~scheduler() {
 }

 void scheduler::worker_routine() {
-  auto my_state = base::this_thread::state<thread_state>();
+  auto my_state = thread_state::get();
+  auto scheduler = my_state->scheduler_;

  while (true) {
-    my_state->scheduler_->sync_barrier_.wait();
-    if (my_state->scheduler_->terminated_) {
+    // Wait to be triggered
+    scheduler->sync_barrier_.wait();
+
+    // Check for shutdown
+    if (scheduler->terminated_) {
      return;
    }

-    // The root task must only return when all work is done,
-    // because of this a simple call is enough to ensure the
-    // fork-join-section is done (logically joined back into our main thread).
-    my_state->root_task_->execute();
+    // Execute work
+    if (my_state->id_ == 0) {
+      // Main Thread
+      auto root_task = scheduler->main_thread_root_task_;
+      root_task->parent_ = nullptr;
+      root_task->deque_state_ = my_state->deque_.save_state();
+
+      root_task->execute();
+      scheduler->work_section_done_ = true;
+    } else {
+      // Worker Threads
+      while (!scheduler->work_section_done_) {
+        if (!scheduler->try_execute_local()) {
+          scheduler->try_execute_stolen();
+        }
+      }
+    }

+    // Sync back with main thread
    my_state->scheduler_->sync_barrier_.wait();
  }
 }
@@ -100,6 +118,33 @@ task *scheduler::steal_task() {
  return nullptr;
 }

+bool scheduler::try_execute_local() {
+  task *local_task = get_local_task();
+  if (local_task != nullptr) {
+    local_task->execute();
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool scheduler::try_execute_stolen() {
+  task *stolen_task = steal_task();
+  if (stolen_task != nullptr) {
+    stolen_task->deque_state_ = thread_state::get()->deque_.save_state();
+    stolen_task->execute();
+    return true;
+  }
+
+  return false;
+}
+
+void scheduler::wait_for_all() {
+  thread_state::get()->current_task_->wait_for_all();
+}
+
+thread_state *scheduler::thread_state_for(size_t id) { return memory_->thread_state_for(id); }
+
 }
 }
 }
--- a/lib/pls/src/internal/scheduling/task.cpp
+++ b/lib/pls/src/internal/scheduling/task.cpp
@@ -36,31 +36,12 @@ void task::execute() {
  }
 }

-bool task::try_execute_local() {
-  task *local_task = thread_state::get()->scheduler_->get_local_task();
-  if (local_task != nullptr) {
-    local_task->execute();
-    return true;
-  } else {
-    return false;
-  }
-}
-
-bool task::try_execute_stolen() {
-  task *stolen_task = thread_state::get()->scheduler_->steal_task();
-  if (stolen_task != nullptr) {
-    stolen_task->deque_state_ = thread_state::get()->deque_.save_state();
-    stolen_task->execute();
-    return true;
-  }
-
-  return false;
-}
-
 void task::wait_for_all() {
+  auto scheduler = thread_state::get()->scheduler_;
+
  while (ref_count_ > 0) {
-    if (!try_execute_local()) {
-      try_execute_stolen();
+    if (!scheduler->try_execute_local()) {
+      scheduler->try_execute_stolen();
    }
  }
  thread_state::get()->deque_.release_memory_until(deque_state_);