Commit 7d090b3c by FritzFlorian

Add support for a spawn-and-sync statement to avoid wasting steals.

parent 1d1b5185
Pipeline #1518 passed with stages in 4 minutes 36 seconds
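In short, the last spawn before a sync point can now be fused into a single call. A minimal usage sketch (hypothetical caller code, not part of this commit; work_left/work_right are placeholder functions and the namespace path is assumed from the include layout):

#include "pls/internal/scheduling/scheduler.h"

using pls::internal::scheduling::scheduler;

void work_left();   // assumed placeholder
void work_right();  // assumed placeholder

void run_both() {
  // the first child stays stealable, exactly as before
  scheduler::spawn([] { work_left(); });
  // fused spawn(...); sync(); for the final child
  scheduler::spawn_and_sync([] { work_right(); });
  // both children are guaranteed to have finished here
}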
@@ -32,13 +32,12 @@ static void for_each(const RandomIt first,
function,
min_elements);
});
scheduler::spawn([first, middle_index, last, &function, min_elements] {
scheduler::spawn_and_sync([first, middle_index, last, &function, min_elements] {
internal::for_each(first + middle_index,
last,
function,
min_elements);
});
scheduler::sync();
}
}
......
@@ -37,14 +37,13 @@ static Element reduce(const RandomIt first,
reducer,
min_elements);
});
scheduler::spawn([first, middle_index, last, neutral, &reducer, min_elements, &right] {
scheduler::spawn_and_sync([first, middle_index, last, neutral, &reducer, min_elements, &right] {
right = internal::reduce<RandomIt, Function, Element>(first + middle_index,
last,
neutral,
reducer,
min_elements);
});
scheduler::sync();
return reducer(left, right);
}
}
......
@@ -86,7 +86,11 @@ class scheduler {
#ifdef PLS_SERIAL_ELUSION
lambda();
#else
if (thread_state::is_scheduler_active()) {
spawn_internal(std::forward<Function>(lambda));
} else {
lambda();
}
#endif
}
@@ -97,7 +101,29 @@ class scheduler {
#ifdef PLS_SERIAL_ELUSION
return;
#else
if (thread_state::is_scheduler_active()) {
sync_internal();
} else {
return;
}
#endif
}
/**
 * Equivalent to calling spawn(lambda); sync();
 *
 * Faster than the two separate calls, as stealing the continuation right before the sync is never useful and therefore pure overhead.
 */
template<typename Function>
static void spawn_and_sync(Function &&lambda) {
#ifdef PLS_SERIAL_ELUSION
lambda();
#else
if (thread_state::is_scheduler_active()) {
spawn_and_sync_internal(std::forward<Function>(lambda));
} else {
lambda();
}
#endif
}
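// Usage sketch (illustrative only, not part of this diff): fuse only the
// *last* spawn before a sync point,
//
//   scheduler::spawn(task_a);          // still published for stealing
//   scheduler::spawn_and_sync(task_b); // replaces spawn(task_b); sync();
//
// mirroring the for_each and reduce changes above.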
@@ -110,7 +136,11 @@ class scheduler {
#ifdef PLS_SERIAL_ELUSION
lambda();
#else
if (thread_state::is_scheduler_active()) {
serial_internal(std::forward<Function>(lambda));
} else {
lambda();
}
#endif
}
@@ -138,6 +168,8 @@ class scheduler {
private:
template<typename Function>
static void spawn_internal(Function &&lambda);
template<typename Function>
static void spawn_and_sync_internal(Function &&lambda);
static void sync_internal();
template<typename Function>
static void serial_internal(Function &&lambda);
......
@@ -33,7 +33,7 @@ scheduler::scheduler(unsigned int num_threads,
stack_allocator_{std::make_shared<ALLOC>(std::forward<ALLOC>(stack_allocator))},
serial_stack_size_{serial_stack_size}
#if PLS_PROFILING_ENABLED
, profiler_{num_threads}
#endif
{
@@ -290,8 +290,105 @@ void scheduler::spawn_internal(Function &&lambda) {
}
template<typename Function>
void scheduler::spawn_and_sync_internal(Function &&lambda) {
thread_state &spawning_state = thread_state::get();
base_task *last_task = spawning_state.get_active_task();
base_task *spawned_task = last_task->next_;
// We are at the end of our allocated tasks, fall back to serial execution.
if (spawned_task == nullptr) {
scheduler::serial_internal(std::forward<Function>(lambda));
return;
}
// We are in a serial section.
// For now we simply stay serial. Later versions could add nested parallel sections.
if (last_task->is_serial_section_) {
lambda();
return;
}
// Carry over the resources allocated by parents
auto *attached_resources = last_task->attached_resources_.load(std::memory_order_relaxed);
spawned_task->attached_resources_.store(attached_resources, std::memory_order_relaxed);
#if PLS_PROFILING_ENABLED
spawning_state.get_scheduler().profiler_.task_prepare_stack_measure(spawning_state.get_thread_id(),
spawned_task->stack_memory_,
spawned_task->stack_size_);
auto *child_dag_node = spawning_state.get_scheduler().profiler_.task_spawn_child(spawning_state.get_thread_id(),
last_task->profiling_node_);
spawned_task->profiling_node_ = child_dag_node;
#endif
auto continuation = spawned_task->run_as_task([last_task, spawned_task, lambda, &spawning_state](auto cont) {
// allow stealing threads to continue the last task.
last_task->continuation_ = std::move(cont);
// we are now executing the new task, allow others to steal the last task continuation.
spawned_task->is_synchronized_ = true;
spawning_state.set_active_task(spawned_task);
// execute the lambda itself, which could lead to a different thread returning.
#if PLS_PROFILING_ENABLED
spawning_state.get_scheduler().profiler_.task_finish_stack_measure(spawning_state.get_thread_id(),
last_task->stack_memory_,
last_task->stack_size_,
last_task->profiling_node_);
spawning_state.get_scheduler().profiler_.task_stop_running(spawning_state.get_thread_id(),
last_task->profiling_node_);
auto *next_dag_node =
spawning_state.get_scheduler().profiler_.task_sync(spawning_state.get_thread_id(), last_task->profiling_node_);
last_task->profiling_node_ = next_dag_node;
spawning_state.get_scheduler().profiler_.task_start_running(spawning_state.get_thread_id(),
spawned_task->profiling_node_);
#endif
lambda();
thread_state &syncing_state = thread_state::get();
PLS_ASSERT(syncing_state.get_active_task() == spawned_task,
"The task manager must always point its active task at what is currently executing.");
if (&syncing_state == &spawning_state && last_task->is_synchronized_) {
// Fast path, simply continue execution where we left off before the spawn.
PLS_ASSERT(last_task->continuation_.valid(),
"Fast path, no one can have continued working on the last task.");
syncing_state.set_active_task(last_task);
#if PLS_PROFILING_ENABLED
syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
spawned_task->stack_memory_,
spawned_task->stack_size_,
spawned_task->profiling_node_);
syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
spawned_task->profiling_node_);
#endif
return std::move(last_task->continuation_);
} else {
// Slow path, the last task was stolen. This path is shared with sync() events.
#if PLS_PROFILING_ENABLED
syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
spawned_task->stack_memory_,
spawned_task->stack_size_,
spawned_task->profiling_node_);
syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
spawned_task->profiling_node_);
#endif
auto continuation = slow_return(syncing_state, false);
return continuation;
}
});
#if PLS_PROFILING_ENABLED
thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
last_task->profiling_node_);
#endif
PLS_ASSERT(!continuation.valid(), "We must never jump back to an unpublished (already synced) spawn.");
}
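// Design note (inferred from the commit message and the code above): with a
// separate spawn(); sync(); pair, a thief may steal the continuation in the
// window between the two calls, only to run straight into the sync and hand
// it back through the slow path; that steal is pure overhead. Fusing the two
// operations removes that window: the fast path resumes last_task directly
// when it was not stolen, and slow_return() covers the stolen case, just as
// in sync_internal().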
template<typename Function>
void scheduler::serial_internal(Function &&lambda) {
if (thread_state::is_scheduler_active()) {
thread_state &spawning_state = thread_state::get();
base_task *active_task = spawning_state.get_active_task();
@@ -307,9 +404,6 @@ void scheduler::serial_internal(Function &&lambda) {
});
active_task->is_serial_section_ = false;
}
} else {
lambda();
}
}
}
......
@@ -6,6 +6,7 @@
#include "pls/algorithms/invoke.h"
#include "pls/algorithms/for_each.h"
#include "pls/algorithms/reduce.h"
#include "pls/algorithms/loop_partition_strategy.h"
#include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/strain_local_resource.h"
@@ -21,6 +22,10 @@ template<typename Function>
static void spawn(Function &&function) {
scheduler::spawn(std::forward<Function>(function));
}
template<typename Function>
static void spawn_and_sync(Function &&function) {
scheduler::spawn_and_sync(std::forward<Function>(function));
}
static void sync() {
scheduler::sync();
}
@@ -42,6 +47,9 @@ using algorithm::invoke;
using algorithm::for_each;
using algorithm::for_each_range;
using algorithm::reduce;
using algorithm::dynamic_strategy;
using algorithm::fixed_strategy;
}
#endif
@@ -155,7 +155,6 @@ void scheduler::work_thread_work_section() {
}
void scheduler::sync_internal() {
if (thread_state::is_scheduler_active()) {
thread_state &syncing_state = thread_state::get();
base_task *active_task = syncing_state.get_active_task();
@@ -196,10 +195,6 @@ void scheduler::sync_internal() {
#endif
return; // We cleanly synced to the last one finishing work on last_task
}
} else {
// Scheduler not active
return;
}
}
context_switcher::continuation scheduler::slow_return(thread_state &calling_state, bool in_sync) {
......