Commit 5490e966 by FritzFlorian

Add minimal example for x86_64 user level threads.

We implement a minimal concept of user-level threads. This shows the minimum requirements for our 'staggered' stack implementation: we need to be able to switch to a new stack and allow someone else to continue the calling function right before the switch.
parent d054e1ab
Pipeline #1375 failed with stages
in 37 seconds
cmake_minimum_required(VERSION 3.10) cmake_minimum_required(VERSION 3.10)
project(predictable_parallel_patterns project(predictable_parallel_patterns
VERSION 0.0.1 VERSION 0.0.1
DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing") DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing"
LANGUAGES CXX ASM)
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
......
add_executable(playground main.cpp) add_executable(playground main.cpp custom_stack_callback.s)
# Example for adding the library to your app (as a cmake project dependency) # Example for adding the library to your app (as a cmake project dependency)
target_link_libraries(playground pls) target_link_libraries(playground)
.file "custom_stack_callback.s"
.text
.global custom_stack_callback
.type custom_stack_callback, @function
.align 16
custom_stack_callback:
    # Runs `callback` on a caller-supplied stack, then returns on the original stack.
    #
    # rdi = address to use as the top of the new stack (passed as parameter)
    # r12 = temporary holding the old stack pointer. It is callee saved, so after
    #       `callback` returns it still contains the value we stored in it.
    push %r12              # store the callee saved register as required
    movq %rsp, %r12        # remember the current stack pointer
    movq %rdi, %rsp        # switch to the new user level stack
    andq $-16, %rsp        # round down to a 16-byte boundary: the System V AMD64 ABI
                           # requires an aligned rsp at every call, and the address
                           # handed in is not guaranteed to be aligned
    call callback          # enter next task (will not return if continuation is stolen)
    movq %r12, %rsp        # switch back to the old stack
    pop %r12               # restore the callee saved register as required
    ret
    .size custom_stack_callback, .-custom_stack_callback

    # Mark this object as not needing an executable stack. Without this note
    # GNU ld may make the stack of the whole program executable.
    .section .note.GNU-stack,"",@progbits
    .text
#include <iostream> #include <cstdio>
#include <csetjmp>
#include <cstring>
#include <chrono> #include <chrono>
#include "pls/internal/scheduling/scheduler.h" using namespace std;
#include "pls/internal/scheduling/parallel_result.h"
#include "pls/internal/scheduling/scheduler_memory.h"
#include "pls/internal/data_structures/bounded_trading_deque.h"
using namespace pls::internal; // Settings for stack and benchmark
const unsigned int NUM_RUNS = 100000;
const unsigned int STACK_SIZE = 512 * 1;
const unsigned char MAGIC_NUMBER = (unsigned char) 0xAB;
constexpr size_t MAX_NUM_THREADS = 1; // Memory for custom stack and continuation semantics
unsigned char custom_stack[STACK_SIZE] = {0};
jmp_buf buffer;
constexpr size_t MAX_NUM_TASKS = 128; // Example callback function and declaration of our assembly stack switching routine
static constexpr int NUM_ITERATIONS = 10; extern "C" {
void custom_stack_callback(void *);
constexpr size_t MAX_NUM_CONTS = 128; void __attribute__ ((noinline)) callback() {
constexpr size_t MAX_CONT_SIZE = 256; static volatile int tmp;
tmp = 0; // Force at least a single memory write
}
}
int fib_normal(int n) { long __attribute__ ((noinline)) measure_function_call() {
if (n == 0) { auto start_time = chrono::steady_clock::now();
return 0; for (unsigned int i = 0; i < NUM_RUNS; i++) {
} callback();
if (n == 1) {
return 1;
} }
auto end_time = chrono::steady_clock::now();
int result = fib_normal(n - 1) + fib_normal(n - 2); return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
return result;
} }
scheduling::parallel_result<int> fib(int n) { long __attribute__ ((noinline)) measure_stack_switch() {
pls::variable<int> i; auto start_time = chrono::steady_clock::now();
pls::array<int> a{10}; for (unsigned int i = 0; i < NUM_RUNS; i++) {
if (n == 0) { custom_stack_callback(&custom_stack[STACK_SIZE - 16]);
return 0;
}
if (n == 1) {
return 1;
} }
auto end_time = chrono::steady_clock::now();
return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
}
return scheduling::scheduler::par([=]() { long __attribute__ ((noinline)) measure_continuation() {
return fib(n - 1); auto start_time = chrono::steady_clock::now();
}, [=]() { for (unsigned int i = 0; i < NUM_RUNS; i++) {
return fib(n - 2); if (setjmp(buffer) == 0) {
}).then([=](int a, int b) { custom_stack_callback(&custom_stack[STACK_SIZE - 16]);
return scheduling::parallel_result<int>{a + b}; }
}); }
auto end_time = chrono::steady_clock::now();
return chrono::duration_cast<chrono::nanoseconds>(end_time - start_time).count();
} }
static volatile int result;
int main() { int main() {
PROFILE_ENABLE; memset(custom_stack, MAGIC_NUMBER, STACK_SIZE);
scheduling::static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
MAX_NUM_CONTS,
MAX_CONT_SIZE> static_scheduler_memory;
scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < NUM_ITERATIONS; i++) {
result = fib_normal(35);
}
auto end = std::chrono::steady_clock::now();
std::cout << "Normal: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
start = std::chrono::steady_clock::now(); auto time_cont = measure_continuation();
auto time_stack = measure_stack_switch();
auto time_func = measure_function_call();
for (int i = 0; i < NUM_ITERATIONS; i++) { for (unsigned int i = 0; i < STACK_SIZE; i++) {
scheduler.perform_work([]() { if (custom_stack[i] != MAGIC_NUMBER) {
PROFILE_MAIN_THREAD; printf("Used stack size about %u bytes.\n\n\n", (STACK_SIZE - i));
return scheduling::scheduler::par([]() { break;
return scheduling::parallel_result<int>(0); }
}, []() {
return fib(35);
}).then([](int, int b) {
result = b;
PROFILE_LOCK("DONE");
return scheduling::parallel_result<int>{0};
});
});
PROFILE_LOCK("DONE");
} }
PROFILE_SAVE("test_profile.prof");
end = std::chrono::steady_clock::now(); printf("Function Call : %10ld, %5.5f\n", time_func, ((float) time_func / NUM_RUNS));
std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << std::endl; printf("Stack Switching : %10ld, %5.5f\n", time_stack, ((float) time_stack / NUM_RUNS));
printf("Full Continuation: %10ld, %5.5f\n", time_cont, ((float) time_cont / NUM_RUNS));
return 0; return 0;
} }
...@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") ...@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is # but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some # only enabled by -O3, thus it's way faster in some
# array calculations. # array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else () else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment