Commit 5490e966 by FritzFlorian

Add minimal example for x86_64 user level threads.

We implement a minimal concept of user level threads. This shows the minimum requirements for our 'staggered' stack implementation: we need to be able to switch to a new stack and allow someone else to continue the calling function right before the switch.
parent d054e1ab
Pipeline #1375 failed with stages
in 37 seconds
cmake_minimum_required(VERSION 3.10)
project(predictable_parallel_patterns
VERSION 0.0.1
DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing")
DESCRIPTION "predictable parallel patterns for scalable smart systems using work stealing"
LANGUAGES CXX ASM)
set(CMAKE_CXX_STANDARD 11)
......
add_executable(playground main.cpp)
add_executable(playground main.cpp custom_stack_callback.s)
# Example for adding the library to your app (as a cmake project dependency)
target_link_libraries(playground pls)
target_link_libraries(playground)
.file "custom_stack_callback.s"
.text
.global custom_stack_callback
.type custom_stack_callback, @function
.align 16
# void custom_stack_callback(void *new_stack)
#
# Runs `callback` on the stack whose top is passed in rdi, then switches back
# to the caller's stack and returns normally.
#
# NOTE(review): `call` pushes an 8 byte return address onto the new stack, so
# new_stack is presumably expected to be 16 byte aligned to satisfy the
# x86_64 System V stack alignment rule inside `callback` - confirm at callers.
custom_stack_callback:
# rdi = new stack address (passed as parameter)
# r12 = temporary holding the old stack pointer. It is callee saved, so
#       `callback` has to hand it back unchanged if it returns.
push %r12 # store the callee saved register as required
movq %rsp, %r12 # remember the current stack pointer (after the push)
movq %rdi, %rsp # update stack pointer to new user level stack
call callback # enter next tasks (will not return if continuation is stolen)
movq %r12, %rsp # restore to the old stack pointer
pop %r12 # restore the callee saved register as required
ret
# Record the symbol size so ELF tools (debuggers, profilers) can attribute
# addresses inside the routine to it.
.size custom_stack_callback, .-custom_stack_callback
# Hand written assembly carries no stack note by default; without this empty
# section GNU ld may mark the whole program's stack as executable.
.section .note.GNU-stack,"",@progbits
#include <iostream>
#include <cstdio>
#include <csetjmp>
#include <cstring>
#include <chrono>
#include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/parallel_result.h"
#include "pls/internal/scheduling/scheduler_memory.h"
#include "pls/internal/data_structures/bounded_trading_deque.h"
using namespace std;
using namespace pls::internal;
// Settings for stack and benchmark
const unsigned int NUM_RUNS = 100000;
const unsigned int STACK_SIZE = 512 * 1;
const unsigned char MAGIC_NUMBER = (unsigned char) 0xAB;
constexpr size_t MAX_NUM_THREADS = 1;
// Memory for custom stack and continuation semantics
unsigned char custom_stack[STACK_SIZE] = {0};
jmp_buf buffer;
constexpr size_t MAX_NUM_TASKS = 128;
static constexpr int NUM_ITERATIONS = 10;
// Example callback function and declaration of our assembly stack switching routine
extern "C" {
void custom_stack_callback(void *);
constexpr size_t MAX_NUM_CONTS = 128;
constexpr size_t MAX_CONT_SIZE = 256;
// Minimal work item used as the call target of the benchmarks.
// The noinline attribute keeps every invocation a real call, and the store to
// a volatile object cannot be elided, so each call performs one memory write.
void __attribute__ ((noinline)) callback() {
  static volatile int write_sink;
  write_sink = 0;
}
}
int fib_normal(int n) {
if (n == 0) {
return 0;
}
if (n == 1) {
return 1;
// Baseline measurement: times NUM_RUNS direct calls to callback() on the
// current stack.
//
// Returns the elapsed steady_clock time for the whole loop in nanoseconds.
//
// The two stray statements that used to sit after the loop
// (`int result = fib_normal(n - 1) + fib_normal(n - 2); return result;`)
// referenced an undeclared `n` and returned before the end timestamp was
// taken, so they are removed here.
long __attribute__ ((noinline)) measure_function_call() {
  const auto start_time = std::chrono::steady_clock::now();
  for (unsigned int i = 0; i < NUM_RUNS; i++) {
    callback();
  }
  const auto end_time = std::chrono::steady_clock::now();

  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time);
  return elapsed.count();
}
scheduling::parallel_result<int> fib(int n) {
pls::variable<int> i;
pls::array<int> a{10};
if (n == 0) {
return 0;
}
if (n == 1) {
return 1;
// Times NUM_RUNS invocations of custom_stack_callback(), each one handed the
// address 16 bytes below the end of custom_stack as its stack argument.
//
// Returns the elapsed steady_clock time for the whole loop in nanoseconds.
long __attribute__ ((noinline)) measure_stack_switch() {
  // Same address on every iteration, so compute it once up front.
  void *const stack_top = &custom_stack[STACK_SIZE - 16];

  const auto begin = std::chrono::steady_clock::now();
  for (unsigned int run = 0; run != NUM_RUNS; ++run) {
    custom_stack_callback(stack_top);
  }
  const auto finish = std::chrono::steady_clock::now();

  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(finish - begin);
  return elapsed.count();
}
return scheduling::scheduler::par([=]() {
return fib(n - 1);
}, [=]() {
return fib(n - 2);
}).then([=](int a, int b) {
return scheduling::parallel_result<int>{a + b};
});
// Times NUM_RUNS iterations of "save a continuation, then switch stacks":
// each iteration stores the execution context in the global `buffer` via
// setjmp() and, on the direct return (value 0), calls custom_stack_callback()
// with the address 16 bytes below the end of custom_stack.
//
// Returns the elapsed steady_clock time for the whole loop in nanoseconds.
long __attribute__ ((noinline)) measure_continuation() {
  void *const stack_top = &custom_stack[STACK_SIZE - 16];

  const auto begin = std::chrono::steady_clock::now();
  for (unsigned int run = 0; run != NUM_RUNS; ++run) {
    // A non-zero result would mean we got here through longjmp(); no longjmp
    // on `buffer` appears in the visible code, so this only skips the call.
    if (setjmp(buffer) != 0) {
      continue;
    }
    custom_stack_callback(stack_top);
  }
  const auto finish = std::chrono::steady_clock::now();

  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(finish - begin);
  return elapsed.count();
}
static volatile int result;
int main() {
PROFILE_ENABLE;
scheduling::static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
MAX_NUM_CONTS,
MAX_CONT_SIZE> static_scheduler_memory;
scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < NUM_ITERATIONS; i++) {
result = fib_normal(35);
}
auto end = std::chrono::steady_clock::now();
std::cout << "Normal: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
memset(custom_stack, MAGIC_NUMBER, STACK_SIZE);
start = std::chrono::steady_clock::now();
auto time_cont = measure_continuation();
auto time_stack = measure_stack_switch();
auto time_func = measure_function_call();
for (int i = 0; i < NUM_ITERATIONS; i++) {
scheduler.perform_work([]() {
PROFILE_MAIN_THREAD;
return scheduling::scheduler::par([]() {
return scheduling::parallel_result<int>(0);
}, []() {
return fib(35);
}).then([](int, int b) {
result = b;
PROFILE_LOCK("DONE");
return scheduling::parallel_result<int>{0};
});
});
PROFILE_LOCK("DONE");
for (unsigned int i = 0; i < STACK_SIZE; i++) {
if (custom_stack[i] != MAGIC_NUMBER) {
printf("Used stack size about %u bytes.\n\n\n", (STACK_SIZE - i));
break;
}
}
PROFILE_SAVE("test_profile.prof");
end = std::chrono::steady_clock::now();
std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << std::endl;
printf("Function Call : %10ld, %5.5f\n", time_func, ((float) time_func / NUM_RUNS));
printf("Stack Switching : %10ld, %5.5f\n", time_stack, ((float) time_stack / NUM_RUNS));
printf("Full Continuation: %10ld, %5.5f\n", time_cont, ((float) time_cont / NUM_RUNS));
return 0;
}
......@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some
# array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment