Add minimal example for x86_64 user level threads.

We implement a minimal concept of user-level threads. This shows the minimum requirements for our 'staggered' stack implementation: we need to be able to switch to a new stack and allow someone else to continue the calling function from the point right before the switch.

Add minimal example for x86_64 user level threads.
We implement a minimal concept of user-level threads. This shows the minimum requirements for our 'staggered' stack implementation: we need to be able to switch to a new stack and allow someone else to continue the calling function from the point right before the switch.
5490e966 · FritzFlorian · d054e1ab · 5490e966 · 5490e966 · 5490e966
Commit 5490e966 authored Jan 10, 2020 by FritzFlorian
Hide whitespace changes
Inline Side-by-side

Showing with 74 additions and 73 deletions

CMakeLists.txt
+2 -1

app/playground/CMakeLists.txt
+2 -2

app/playground/custom_stack_callback.s
+16 -0

app/playground/main.cpp
+53 -69

cmake/SetupOptimizationLevel.cmake
+1 -1

No files found.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.10)
 project(predictable_parallel_patterns
        VERSION 0.0.1
-        DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing")
+        DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing"
+        LANGUAGES CXX ASM)
 set(CMAKE_CXX_STANDARD 11)

--- a/app/playground/CMakeLists.txt
+++ b/app/playground/CMakeLists.txt
-add_executable(playground main.cpp)
+add_executable(playground main.cpp custom_stack_callback.s)
 # Example for adding the library to your app (as a cmake project dependency)
-target_link_libraries(playground pls)
+target_link_libraries(playground)
--- a/app/playground/custom_stack_callback.s
+++ b/app/playground/custom_stack_callback.s
+	.file	"custom_stack_callback.s"
+	.text
+	.global	custom_stack_callback
+	.type	custom_stack_callback, @function
+.align 16
+custom_stack_callback:
+        # rdi = new stack address (passed as parameter)
+        # r12 temporary for restoring old stack (callee saved, so we get the correct value in case of a return)
+        push %r12           # store the callee saved register as required
+        movq %rsp, %r12     # store current stack pointer
+        movq %rdi, %rsp     # update stack pointer to new user level stack
+        call callback       # enter next tasks (will not return if continuation is stolen)
+        movq %r12, %rsp     # restore to the old stack pointer
+        pop %r12            # restore the callee saved register as required
+    	ret
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
-#include <iostream>
+#include <cstdio>
+#include <csetjmp>
+#include <cstring>
 #include <chrono>
-#include "pls/internal/scheduling/scheduler.h"
+using namespace std;
-#include "pls/internal/scheduling/parallel_result.h"
-#include "pls/internal/scheduling/scheduler_memory.h"
-#include "pls/internal/data_structures/bounded_trading_deque.h"
-using namespace pls::internal;
+// Settings for stack and benchmark
+const unsigned int NUM_RUNS = 100000;
+const unsigned int STACK_SIZE = 512 * 1;
+const unsigned char MAGIC_NUMBER = (unsigned char) 0xAB;
-constexpr size_t MAX_NUM_THREADS = 1;
+// Memory for custom stack and continuation semantics
+unsigned char custom_stack[STACK_SIZE] = {0};
+jmp_buf buffer;
-constexpr size_t MAX_NUM_TASKS = 128;
+// Example callback function and declaration of our assembly stack switching routine
-static constexpr int NUM_ITERATIONS = 10;
+extern "C" {
+void custom_stack_callback(void *);
-constexpr size_t MAX_NUM_CONTS = 128;
+void __attribute__ ((noinline)) callback() {
-constexpr size_t MAX_CONT_SIZE = 256;
+  static volatile int tmp;
+  tmp = 0; // Force at least a single memory write
+}
+}
-int fib_normal(int n) {
+long __attribute__ ((noinline)) measure_function_call() {
-  if (n == 0) {
+  auto start_time = chrono::steady_clock::now();
-    return 0;
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
-  }
+    callback();
-  if (n == 1) {
-    return 1;
  }
+  auto end_time = chrono::steady_clock::now();
-  int result = fib_normal(n - 1) + fib_normal(n - 2);
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
-  return result;
 }
-scheduling::parallel_result<int> fib(int n) {
+long __attribute__ ((noinline)) measure_stack_switch() {
-  pls::variable<int> i;
+  auto start_time = chrono::steady_clock::now();
-  pls::array<int> a{10};
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
-  if (n == 0) {
+    custom_stack_callback(&custom_stack[STACK_SIZE - 16]);
-    return 0;
-  }
-  if (n == 1) {
-    return 1;
  }
+  auto end_time = chrono::steady_clock::now();
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
+}
-  return scheduling::scheduler::par([=]() {
+long __attribute__ ((noinline)) measure_continuation() {
-    return fib(n - 1);
+  auto start_time = chrono::steady_clock::now();
-  }, [=]() {
+  for (unsigned int i = 0; i < NUM_RUNS; i++) {
-    return fib(n - 2);
+    if (setjmp(buffer) == 0) {
-  }).then([=](int a, int b) {
+      custom_stack_callback(&custom_stack[STACK_SIZE - 16]);
-    return scheduling::parallel_result<int>{a + b};
+    }
-  });
+  }
+  auto end_time = chrono::steady_clock::now();
+  return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
 }
-static volatile int result;
 int main() {
-  PROFILE_ENABLE;
+  memset(custom_stack, MAGIC_NUMBER, STACK_SIZE);
-  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
-                                      MAX_NUM_TASKS,
-                                      MAX_NUM_CONTS,
-                                      MAX_CONT_SIZE> static_scheduler_memory;
-  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
-  auto start = std::chrono::steady_clock::now();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
-    result = fib_normal(35);
-  }
-  auto end = std::chrono::steady_clock::now();
-  std::cout << "Normal:     " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
-            << std::endl;
-  start = std::chrono::steady_clock::now();
+  auto time_cont = measure_continuation();
+  auto time_stack = measure_stack_switch();
+  auto time_func = measure_function_call();
-  for (int i = 0; i < NUM_ITERATIONS; i++) {
+  for (unsigned int i = 0; i < STACK_SIZE; i++) {
-    scheduler.perform_work([]() {
+    if (custom_stack[i] != MAGIC_NUMBER) {
-      PROFILE_MAIN_THREAD;
+      printf("Used stack size about %u bytes.\n\n\n", (STACK_SIZE - i));
-      return scheduling::scheduler::par([]() {
+      break;
-        return scheduling::parallel_result<int>(0);
+    }
-      }, []() {
-        return fib(35);
-      }).then([](int, int b) {
-        result = b;
-        PROFILE_LOCK("DONE");
-        return scheduling::parallel_result<int>{0};
-      });
-    });
-    PROFILE_LOCK("DONE");
  }
-  PROFILE_SAVE("test_profile.prof");
-  end = std::chrono::steady_clock::now();
+  printf("Function Call    : %10ld, %5.5f\n", time_func, ((float) time_func / NUM_RUNS));
-  std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << std::endl;
+  printf("Stack Switching  : %10ld, %5.5f\n", time_stack, ((float) time_stack / NUM_RUNS));
+  printf("Full Continuation: %10ld, %5.5f\n", time_cont, ((float) time_cont / NUM_RUNS));
  return 0;
 }
--- a/cmake/SetupOptimizationLevel.cmake
+++ b/cmake/SetupOptimizationLevel.cmake
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
    # but inlining functions and SIMD/Vectorization is
    # only enabled by -O3, thus it's way faster in some
    # array calculations.
-    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 else ()
    set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")