diff --git a/CMakeLists.txt b/CMakeLists.txt index bc1107e..d5bcb81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,8 @@ cmake_minimum_required(VERSION 3.10) project(predictable_parallel_patterns VERSION 0.0.1 - DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing") + DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing" + LANGUAGES CXX ASM) set(CMAKE_CXX_STANDARD 11) diff --git a/app/playground/CMakeLists.txt b/app/playground/CMakeLists.txt index 3ea2ce1..9482fb6 100644 --- a/app/playground/CMakeLists.txt +++ b/app/playground/CMakeLists.txt @@ -1,4 +1,4 @@ -add_executable(playground main.cpp) +add_executable(playground main.cpp custom_stack_callback.s) # Example for adding the library to your app (as a cmake project dependency) -target_link_libraries(playground pls) +target_link_libraries(playground) diff --git a/app/playground/custom_stack_callback.s b/app/playground/custom_stack_callback.s new file mode 100644 index 0000000..085350c --- /dev/null +++ b/app/playground/custom_stack_callback.s @@ -0,0 +1,16 @@ + .file "custom_stack_callback.s" + .text + .global custom_stack_callback + .type custom_stack_callback, @function + +.align 16 +custom_stack_callback: + # rdi = new stack address (passed as parameter) + # r12 temporary for restoring old stack (callee saved, so we get the correct value in case of a return) + push %r12 # store the callee saved register as required + movq %rsp, %r12 # store current stack pointer + movq %rdi, %rsp # update stack pointer to new user level stack + call callback # enter next tasks (will not return if continuation is stolen) + movq %r12, %rsp # restore to the old stack pointer + pop %r12 # restore the callee saved register as required + ret diff --git a/app/playground/main.cpp b/app/playground/main.cpp index d7d0c07..8022bfd 100644 --- a/app/playground/main.cpp +++ b/app/playground/main.cpp @@ -1,91 +1,75 @@ -#include +#include +#include +#include 
#include -#include "pls/internal/scheduling/scheduler.h" -#include "pls/internal/scheduling/parallel_result.h" -#include "pls/internal/scheduling/scheduler_memory.h" -#include "pls/internal/data_structures/bounded_trading_deque.h" +using namespace std; -using namespace pls::internal; +// Settings for stack and benchmark +const unsigned int NUM_RUNS = 100000; +const unsigned int STACK_SIZE = 512 * 1; +const unsigned char MAGIC_NUMBER = (unsigned char) 0xAB; -constexpr size_t MAX_NUM_THREADS = 1; +// Memory for custom stack and continuation semantics +unsigned char custom_stack[STACK_SIZE] = {0}; +jmp_buf buffer; -constexpr size_t MAX_NUM_TASKS = 128; -static constexpr int NUM_ITERATIONS = 10; +// Example callback function and declaration of our assembly stack switching routine +extern "C" { +void custom_stack_callback(void *); -constexpr size_t MAX_NUM_CONTS = 128; -constexpr size_t MAX_CONT_SIZE = 256; +void __attribute__ ((noinline)) callback() { + static volatile int tmp; + tmp = 0; // Force at least a single memory write +} +} -int fib_normal(int n) { - if (n == 0) { - return 0; - } - if (n == 1) { - return 1; +long __attribute__ ((noinline)) measure_function_call() { + auto start_time = chrono::steady_clock::now(); + for (unsigned int i = 0; i < NUM_RUNS; i++) { + callback(); } - - int result = fib_normal(n - 1) + fib_normal(n - 2); - return result; + auto end_time = chrono::steady_clock::now(); + return chrono::duration_cast(end_time - start_time).count(); } -scheduling::parallel_result fib(int n) { - pls::variable i; - pls::array a{10}; - if (n == 0) { - return 0; - } - if (n == 1) { - return 1; +long __attribute__ ((noinline)) measure_stack_switch() { + auto start_time = chrono::steady_clock::now(); + for (unsigned int i = 0; i < NUM_RUNS; i++) { + custom_stack_callback(&custom_stack[STACK_SIZE - 16]); } + auto end_time = chrono::steady_clock::now(); + return chrono::duration_cast(end_time - start_time).count(); +} - return scheduling::scheduler::par([=]() { 
- return fib(n - 1); - }, [=]() { - return fib(n - 2); - }).then([=](int a, int b) { - return scheduling::parallel_result{a + b}; - }); +long __attribute__ ((noinline)) measure_continuation() { + auto start_time = chrono::steady_clock::now(); + for (unsigned int i = 0; i < NUM_RUNS; i++) { + if (setjmp(buffer) == 0) { + custom_stack_callback(&custom_stack[STACK_SIZE - 16]); + } + } + auto end_time = chrono::steady_clock::now(); + return chrono::duration_cast(end_time - start_time).count(); } -static volatile int result; int main() { - PROFILE_ENABLE; - scheduling::static_scheduler_memory static_scheduler_memory; - - scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS}; - - auto start = std::chrono::steady_clock::now(); - for (int i = 0; i < NUM_ITERATIONS; i++) { - result = fib_normal(35); - } - auto end = std::chrono::steady_clock::now(); - std::cout << "Normal: " << std::chrono::duration_cast(end - start).count() - << std::endl; + memset(custom_stack, MAGIC_NUMBER, STACK_SIZE); - start = std::chrono::steady_clock::now(); + auto time_cont = measure_continuation(); + auto time_stack = measure_stack_switch(); + auto time_func = measure_function_call(); - for (int i = 0; i < NUM_ITERATIONS; i++) { - scheduler.perform_work([]() { - PROFILE_MAIN_THREAD; - return scheduling::scheduler::par([]() { - return scheduling::parallel_result(0); - }, []() { - return fib(35); - }).then([](int, int b) { - result = b; - PROFILE_LOCK("DONE"); - return scheduling::parallel_result{0}; - }); - }); - PROFILE_LOCK("DONE"); + for (unsigned int i = 0; i < STACK_SIZE; i++) { + if (custom_stack[i] != MAGIC_NUMBER) { + printf("Used stack size about %u bytes.\n\n\n", (STACK_SIZE - i)); + break; + } } - PROFILE_SAVE("test_profile.prof"); - end = std::chrono::steady_clock::now(); - std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() << std::endl; + printf("Function Call : %10ld, %5.5f\n", time_func, ((float) time_func / NUM_RUNS)); + printf("Stack 
Switching : %10ld, %5.5f\n", time_stack, ((float) time_stack / NUM_RUNS)); + printf("Full Continuation: %10ld, %5.5f\n", time_cont, ((float) time_cont / NUM_RUNS)); return 0; } diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake index f2f66ec..5d22958 100644 --- a/cmake/SetupOptimizationLevel.cmake +++ b/cmake/SetupOptimizationLevel.cmake @@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") # but inlining functions and SIMD/Vectorization is # only enabled by -O3, thus it's way faster in some # array calculations. - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else () set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")