Add cscontext ARMv7 assembly and fast path optimization.

89b6e3cb · FritzFlorian · 3c60e8d7 · 89b6e3cb · 89b6e3cb · 89b6e3cb
Commit 89b6e3cb authored Feb 09, 2020 by FritzFlorian
12 changed files
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {

 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 64;
+constexpr int MAX_STACK_SIZE = 1024 * 32;

 static_scheduler_memory<MAX_NUM_THREADS,
                        MAX_NUM_TASKS,
@@ -56,19 +56,11 @@ int main(int argc, char **argv) {

  scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};

+  runner.run_iterations(fft::NUM_ITERATIONS, [&]() {
    scheduler.perform_work([&]() {
-    for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
-      conquer(data.begin(), fft::SIZE);
-    }
-  });
-
-  scheduler.perform_work([&]() {
-    for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
-      runner.start_iteration();
-      conquer(data.begin(), fft::SIZE);
-      runner.end_iteration();
-    }
+      conquer(data.begin(), fft::SIZE);;
    });
+  }, fft::NUM_WARMUP_ITERATIONS);
  runner.commit_results(true);

  return 0;

--- a/app/benchmark_fib/main.cpp
+++ b/app/benchmark_fib/main.cpp
@@ -51,19 +51,12 @@ int main(int argc, char **argv) {
  scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};

  volatile int res;
-  scheduler.perform_work([&]() {
-    for (int i = 0; i < fib::NUM_WARMUP_ITERATIONS; i++) {
-      res = pls_fib(fib::INPUT_N);
-    }
-  });

+  runner.run_iterations(fib::NUM_ITERATIONS, [&]() {
    scheduler.perform_work([&]() {
-    for (int i = 0; i < fib::NUM_ITERATIONS; i++) {
-      runner.start_iteration();
      res = pls_fib(fib::INPUT_N);
-      runner.end_iteration();
-    }
    });
+  }, fib::NUM_WARMUP_ITERATIONS);
  runner.commit_results(true);

  return 0;

--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -14,8 +14,8 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
 public:
  pls_matrix() : matrix::matrix<T, SIZE>() {}

-  void pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
-    pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
+  void multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) override {
+    pls::algorithm::for_each_range(0, SIZE, [&](int i) {
      this->multiply_column(i, a, b);
    });
  }
@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> {

 constexpr int MAX_NUM_THREADS = 8;
 constexpr int MAX_NUM_TASKS = 32;
-constexpr int MAX_STACK_SIZE = 1024 * 4;
+constexpr int MAX_STACK_SIZE = 1024 * 1;

 static_scheduler_memory<MAX_NUM_THREADS,
                        MAX_NUM_TASKS,
@@ -44,18 +44,10 @@ int main(int argc, char **argv) {

  scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};

+  runner.run_iterations(matrix::NUM_ITERATIONS, [&]() {
    scheduler.perform_work([&]() {
-    for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {
-      result.pls_multiply(a, b);
-    }
-  });
-
-  scheduler.perform_work([&]() {
-    for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
-      runner.start_iteration();
-      result.pls_multiply(a, b);
-      runner.end_iteration();
-    }
+      result.multiply(a, b);
    });
+  }, matrix::WARMUP_ITERATIONS);
  runner.commit_results(true);
 }
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
    # but inlining functions and SIMD/Vectorization is
    # only enabled by -O3, thus it's way faster in some
    # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
    set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")

--- a/lib/context_switcher/asm/cscontext/SelectAssemblyFiles.cmake
+++ b/lib/context_switcher/asm/cscontext/SelectAssemblyFiles.cmake
@@ -9,6 +9,11 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME STREQUAL "Lin
    SET(CS_CSCONTEXT_ASSEMBLY
            asm/cscontext/enter_context_x86_64_sysv_elf.s
            asm/cscontext/switch_context_x86_64_sysv_elf.s)
+elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    # Typical Linux running on ARMv7
+    SET(CS_CSCONTEXT_ASSEMBLY
+            asm/cscontext/enter_context_arm32_sysv_elf.s
+            asm/cscontext/switch_context_arm32_sysv_elf.s)
 else ()
    SET(CS_CSCONTEXT_FOUND FALSE)
 endif ()

--- a/lib/context_switcher/asm/cscontext/enter_context_arm32_sysv_elf.s
+++ b/lib/context_switcher/asm/cscontext/enter_context_arm32_sysv_elf.s
+	.arm
+	.text
+	.global	__cs_enter_context
+	.type	__cs_enter_context, %function
+
+__cs_enter_context:
+        /* Enter a new context: save the caller state, switch to a fresh stack and invoke a callback on it.
+        * Parameter List (in order)
+        * r0 = new stack pointer (rounded down to a 16 byte boundary before use)
+        * r1 = user parameter, left untouched and thereby passed on as second argument of the callback
+        * r2 = callback function pointer
+        * r3 = new stack limit (not used in this routine)
+        * Return
+        * r0 = continuation that returned control back to the caller (null if fallthrough)
+        *
+        * Variables
+        * r4 = temporary for the old stack pointer */
+
+        /* ========== Save State ========== */
+        /* store link register (return address) for the final return via pop {pc} */
+        push {lr}
+        /* store callee saved registers (lr is stored a second time as part of this block) */
+        push {r4-r12,lr}
+        /* store floating point registers d8-d15 (8 * 8 = 64 bytes). NOTE(review): the #if guard only acts if this .s file is run through the C preprocessor - confirm in the build. */
+        #if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+        sub sp, sp, #64
+        vstmia sp, {d8-d15}
+        #endif
+        /* ========== Save State ========== */
+
+        /* Perform change to new stack */
+        /* Keep the old stack pointer in r4, it becomes the first parameter (r0) of our callback function. */
+        mov r4, sp
+        /* Make sure that stack start is properly aligned (clears the lowest four bits). */
+        and r0, r0, #-16
+        /* Switch to new stack pointer. */
+        mov sp, r0
+
+        /* Perform actual function call, this will now be on the new stack */
+        /* r0 = first parameter to callback (continuation, i.e. the old stack pointer) */
+        /* r1 = second parameter to callback (arbitrary pointer, unchanged since entry) */
+        mov r0, r4
+        blx r2
+
+        /* Restore state of returned continuation. */
+        /* To do so we first reset the stack pointer (which we get returned in r0). */
+        /* After that we execute our standard restore procedure to pop the state from the stack. */
+        mov sp, r0
+
+        /* ========== Restore State ========== */
+        /* restore floating point extension registers (mirror of the save sequence above) */
+        #if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+        vldmia sp, {d8-d15}
+        add sp, sp, #64
+        #endif
+        /* restore callee saved registers */
+        pop {r4-r12,lr}
+        /* ========== Restore State ========== */
+
+        /* Just return back from the call. */
+        /* This is the end of a fiber, so we have no continuation: clear r0 to return null. */
+        eor r0, r0, r0
+        pop {pc}
--- a/lib/context_switcher/asm/cscontext/switch_context_arm32_sysv_elf.s
+++ b/lib/context_switcher/asm/cscontext/switch_context_arm32_sysv_elf.s
+	.arm
+	.text
+	.global	__cs_switch_context
+	.type	__cs_switch_context, %function
+
+__cs_switch_context:
+        /* Switch contexts: save the current state on the current stack, then restore the state stored on the target stack.
+        * r0 = continuation to switch to (the stack pointer of the target context; state is restored from there)
+        *
+        * Return
+        * r0 = continuation that returned control back to the caller (null if fallthrough)
+        *
+        * Variables
+        * r1 = temporary for the old stack pointer */
+
+        /* ========== Save State ========== */
+        /* store link register (return address) for the final return via pop {pc} */
+        push {lr}
+        /* store callee saved registers (lr is stored a second time as part of this block) */
+        push {r4-r12,lr}
+        /* store floating point registers d8-d15 (8 * 8 = 64 bytes). NOTE(review): the #if guard only acts if this .s file is run through the C preprocessor - confirm in the build. */
+        #if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+        sub sp, sp, #64
+        vstmia sp, {d8-d15}
+        #endif
+        /* ========== Save State ========== */
+
+        /* Perform change to new stack */
+        /* Keep the old stack pointer in r1, it is handed over in r0 once the target state is restored. */
+        mov r1, sp
+        /* Switch to new stack pointer. */
+        mov sp, r0
+
+
+        /* ========== Restore State ========== */
+        /* restore floating point extension registers (mirror of the save sequence above) */
+        #if (defined(__VFP_FP__) && !defined(__SOFTFP__))
+        vldmia sp, {d8-d15}
+        add sp, sp, #64
+        #endif
+        /* restore callee saved registers */
+        pop {r4-r12,lr}
+        /* ========== Restore State ========== */
+
+        /* Return into the target context using the return address stored on its stack. */
+        /* r0 = stack pointer of the context we just left, i.e. the continuation handed to the resumed code. */
+        mov r0, r1
+        pop {pc}
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -77,10 +77,7 @@ void for_each(RandomIt
              execution_strategy) {
  long num_elements = std::distance(first, last);
  return
-      internal::for_each(first, last, function, execution_strategy
-          .
-              calculate_min_elements(num_elements)
-      );
+      internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements));
 }

 template<typename RandomIt, typename Function>

--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -34,6 +34,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {

    depth_ = depth;
    thread_id_ = thread_id;
+
+    is_synchronized_ = false;
  }

  template<typename F>
@@ -44,9 +46,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
  // TODO: Proper access control and split it up into responsibilities
  // Stack/Continuation Management
  char *stack_memory_;
-  size_t stack_size_;
+  size_t stack_size_; // TODO: maybe remove it, not needed in here
  context_switcher::continuation continuation_;
+  bool is_synchronized_;

+  // TODO: Clean up responsibilities
  // Work-Stealing
  std::atomic<traded_cas_field> external_trading_deque_cas_{};
  std::atomic<task *> resource_stack_next_{};

--- a/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager_impl.h
@@ -28,6 +28,7 @@ void task_manager::spawn_child(F &&lambda) {
        last_task->continuation_ = std::move(cont);

        // we are now executing the new task, allow others to steal the last task continuation.
+        spawned_task->is_synchronized_ = true;
        spawning_task_manager->active_task_ = spawned_task;
        spawning_task_manager->deque_.push_bot(last_task);


--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -108,6 +108,7 @@ void scheduler::work_thread_work_section() {
      // Execute the stolen task by jumping to it's continuation.
      PLS_ASSERT(stolen_task->continuation_.valid(),
                 "A task that we can steal must have a valid continuation for us to start working.");
+      stolen_task->is_synchronized_ = false;
      context_switcher::switch_context(std::move(stolen_task->continuation_));
      // We will continue execution in this line when we finished the stolen work.
    }

--- a/lib/pls/src/internal/scheduling/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/task_manager.cpp
@@ -128,6 +128,9 @@ void task_manager::sync() {
  auto *last_task = spawning_task_manager->active_task_;
  auto *spawned_task = spawning_task_manager->active_task_->next_;

+  if (last_task->is_synchronized_) {
+    return; // We are already the sole owner of last_task
+  } else {
    auto continuation = spawned_task->run_as_task([=](context_switcher::continuation cont) {
      last_task->continuation_ = std::move(cont);
      spawning_task_manager->active_task_ = spawned_task;
@@ -145,6 +148,9 @@ void task_manager::sync() {
    PLS_ASSERT(!continuation.valid(),
               "We only return to a sync point, never jump to it directly."
               "This must therefore never return an unfinished fiber/continuation.");
+
+    return; // We cleanly synced to the last one finishing work on last_task
+  }
 }

 bool task_manager::try_clean_return(context_switcher::continuation &result_cont) {
@@ -195,6 +201,7 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
    // We are the last one working on this task. Thus the sync must be finished, continue working.
    active_task_ = last_task;

+    last_task->is_synchronized_ = true;
    result_cont = std::move(last_task->continuation_);
    PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
    return false;