Commit 89b6e3cb by FritzFlorian

Add cscontext ARMv7 assembly and fast path optimization.

parent 3c60e8d7
Pipeline #1404 failed with stages
in 37 seconds
......@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 64;
constexpr int MAX_STACK_SIZE = 1024 * 32;
static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
......@@ -56,19 +56,11 @@ int main(int argc, char **argv) {
scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};
runner.run_iterations(fft::NUM_ITERATIONS, [&]() {
scheduler.perform_work([&]() {
for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
conquer(data.begin(), fft::SIZE);
}
});
scheduler.perform_work([&]() {
for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
runner.start_iteration();
conquer(data.begin(), fft::SIZE);
runner.end_iteration();
}
conquer(data.begin(), fft::SIZE);;
});
}, fft::NUM_WARMUP_ITERATIONS);
runner.commit_results(true);
return 0;
......
......@@ -51,19 +51,12 @@ int main(int argc, char **argv) {
scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};
volatile int res;
scheduler.perform_work([&]() {
for (int i = 0; i < fib::NUM_WARMUP_ITERATIONS; i++) {
res = pls_fib(fib::INPUT_N);
}
});
runner.run_iterations(fib::NUM_ITERATIONS, [&]() {
scheduler.perform_work([&]() {
for (int i = 0; i < fib::NUM_ITERATIONS; i++) {
runner.start_iteration();
res = pls_fib(fib::INPUT_N);
runner.end_iteration();
}
});
}, fib::NUM_WARMUP_ITERATIONS);
runner.commit_results(true);
return 0;
......
......@@ -14,8 +14,8 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
public:
pls_matrix() : matrix::matrix<T, SIZE>() {}
void pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
void multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) override {
pls::algorithm::for_each_range(0, SIZE, [&](int i) {
this->multiply_column(i, a, b);
});
}
......@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 4;
constexpr int MAX_STACK_SIZE = 1024 * 1;
static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
......@@ -44,18 +44,10 @@ int main(int argc, char **argv) {
scheduler scheduler{global_scheduler_memory, (unsigned) num_threads};
runner.run_iterations(matrix::NUM_ITERATIONS, [&]() {
scheduler.perform_work([&]() {
for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {
result.pls_multiply(a, b);
}
});
scheduler.perform_work([&]() {
for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
runner.start_iteration();
result.pls_multiply(a, b);
runner.end_iteration();
}
result.multiply(a, b);
});
}, matrix::WARMUP_ITERATIONS);
runner.commit_results(true);
}
......@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some
# array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
......@@ -9,6 +9,11 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME STREQUAL "Lin
SET(CS_CSCONTEXT_ASSEMBLY
asm/cscontext/enter_context_x86_64_sysv_elf.s
asm/cscontext/switch_context_x86_64_sysv_elf.s)
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
# Typical Linux running on ARMv7
SET(CS_CSCONTEXT_ASSEMBLY
asm/cscontext/enter_context_arm32_sysv_elf.s
asm/cscontext/switch_context_arm32_sysv_elf.s)
else ()
SET(CS_CSCONTEXT_FOUND FALSE)
endif ()
......
/* __cs_enter_context (ARM32, A32 instruction set)
 *
 * Saves the caller's register state on the current stack, switches to a
 * freshly supplied stack and invokes a callback on it. The value the callback
 * returns is treated as the stack pointer of a saved state, which is then
 * restored before returning with r0 = 0. */
.arm
.text
.global __cs_enter_context
.type __cs_enter_context, %function
__cs_enter_context:
/* Parameter List (in order)
* r0 = new stack pointer (rounded down to a 16 byte boundary before use)
* r1 = opaque pointer, left untouched and therefore forwarded to the callback as its second argument
* r2 = callback function pointer
* r3 = new stack limit (never read by this routine)
*
* Callback arguments (as set up below)
* r0 = stack pointer of the state saved by this routine (the caller's continuation)
* r1 = the opaque pointer passed in r1
*
* Return
* r0 = always 0 on this path (no continuation is handed back on fall-through)
*
* Variables
* r4 = temporary for the old stack pointer */
/* ========== Save State ========== */
/* store the return address (lr); the final 'pop {pc}' of a restore sequence jumps back to it */
push {lr}
/* store callee saved registers */
push {r4-r12,lr}
/* store floating point extension registers (d8-d15, 8 registers * 8 bytes = 64 bytes) */
/* NOTE(review): the build lists this file with a lowercase '.s' suffix; confirm that the
 * C preprocessor is actually run on it so that the #if/#endif guards below take effect. */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
sub sp, sp, #64
vstmia sp, {d8-d15}
#endif
/* ========== Save State ========== */
/* Perform change to new stack */
/* Keep the old stack pointer; it becomes the first argument of the callback. */
mov r4, sp
/* Make sure that stack start is properly aligned (clear the low four bits). */
and r0, r0, #-16
/* Switch to new stack pointer. */
mov sp, r0
/* Perform actual function call, this will now be on the new stack */
/* r0 = first parameter to callback (continuation, i.e. the old stack pointer) */
/* r1 = second parameter to callback (arbitrary pointer, unchanged since entry) */
mov r0, r4
blx r2
/* Restore state of returned continuation. */
/* To do so we first reset the stack pointer (which the callback returned in r0). */
/* After that we execute our standard restore procedure to pop the state from the stack. */
mov sp, r0
/* ========== Restore State ========== */
/* restore floating point extension registers (mirror of the save sequence above) */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
vldmia sp, {d8-d15}
add sp, sp, #64
#endif
/* restore callee saved registers */
pop {r4-r12,lr}
/* ========== Restore State ========== */
/* Just return back from the call. */
/* This is the end of a fiber, so we have no continuation: return 0 in r0. */
eor r0, r0, r0
/* Pop the return address stored by the initial 'push {lr}' of the restored state into pc. */
pop {pc}
/* __cs_switch_context (ARM32, A32 instruction set)
 *
 * Saves the caller's register state on the current stack, then switches to
 * the stack given in r0 and restores the register state found there. Control
 * continues at the return address stored in that state, with r0 holding the
 * stack pointer of the state that was just saved. */
.arm
.text
.global __cs_switch_context
.type __cs_switch_context, %function
__cs_switch_context:
/* Parameter List (in order)
* r0 = continuation to switch to: a stack pointer that points at a saved state
*      laid out exactly like the one produced by the save sequence below
*
* Return (as seen by the code that is resumed)
* r0 = stack pointer of the state saved by this call, i.e. the continuation of the
*      context that gave up control
*
* Variables
* r1 = temporary for the old stack pointer */
/* ========== Save State ========== */
/* store the return address (lr); the final 'pop {pc}' of a restore sequence jumps back to it */
push {lr}
/* store callee saved registers */
push {r4-r12,lr}
/* store floating point extension registers (d8-d15, 8 registers * 8 bytes = 64 bytes) */
/* NOTE(review): the build lists this file with a lowercase '.s' suffix; confirm that the
 * C preprocessor is actually run on it so that the #if/#endif guards below take effect. */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
sub sp, sp, #64
vstmia sp, {d8-d15}
#endif
/* ========== Save State ========== */
/* Perform change to new stack */
/* Keep old stack as result from this function. */
mov r1, sp
/* Switch to new stack pointer. */
mov sp, r0
/* ========== Restore State ========== */
/* restore floating point extension registers (mirror of the save sequence above) */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
vldmia sp, {d8-d15}
add sp, sp, #64
#endif
/* restore callee saved registers */
pop {r4-r12,lr}
/* ========== Restore State ========== */
/* Return into the restored context. */
/* Hand over the previous stack pointer (kept in r1) as the continuation in r0. */
mov r0, r1
/* Pop the return address stored in the restored state into pc. */
pop {pc}
......@@ -77,10 +77,7 @@ void for_each(RandomIt
execution_strategy) {
long num_elements = std::distance(first, last);
return
internal::for_each(first, last, function, execution_strategy
.
calculate_min_elements(num_elements)
);
internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements));
}
template<typename RandomIt, typename Function>
......
......@@ -34,6 +34,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
depth_ = depth;
thread_id_ = thread_id;
is_synchronized_ = false;
}
template<typename F>
......@@ -44,9 +46,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
// TODO: Proper access control and split it up into responsibilities
// Stack/Continuation Management
char *stack_memory_;
size_t stack_size_;
size_t stack_size_; // TODO: maybe remove it, not needed in here
context_switcher::continuation continuation_;
bool is_synchronized_;
// TODO: Clean up responsibilities
// Work-Stealing
std::atomic<traded_cas_field> external_trading_deque_cas_{};
std::atomic<task *> resource_stack_next_{};
......
......@@ -28,6 +28,7 @@ void task_manager::spawn_child(F &&lambda) {
last_task->continuation_ = std::move(cont);
// we are now executing the new task, allow others to steal the last task continuation.
spawned_task->is_synchronized_ = true;
spawning_task_manager->active_task_ = spawned_task;
spawning_task_manager->deque_.push_bot(last_task);
......
......@@ -108,6 +108,7 @@ void scheduler::work_thread_work_section() {
// Execute the stolen task by jumping to its continuation.
PLS_ASSERT(stolen_task->continuation_.valid(),
"A task that we can steal must have a valid continuation for us to start working.");
stolen_task->is_synchronized_ = false;
context_switcher::switch_context(std::move(stolen_task->continuation_));
// We will continue execution in this line when we finished the stolen work.
}
......
......@@ -128,6 +128,9 @@ void task_manager::sync() {
auto *last_task = spawning_task_manager->active_task_;
auto *spawned_task = spawning_task_manager->active_task_->next_;
if (last_task->is_synchronized_) {
return; // We are already the sole owner of last_task
} else {
auto continuation = spawned_task->run_as_task([=](context_switcher::continuation cont) {
last_task->continuation_ = std::move(cont);
spawning_task_manager->active_task_ = spawned_task;
......@@ -145,6 +148,9 @@ void task_manager::sync() {
PLS_ASSERT(!continuation.valid(),
"We only return to a sync point, never jump to it directly."
"This must therefore never return an unfinished fiber/continuation.");
return; // We cleanly synced to the last one finishing work on last_task
}
}
bool task_manager::try_clean_return(context_switcher::continuation &result_cont) {
......@@ -195,6 +201,7 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
// We are the last one working on this task. Thus the sync must be finished, continue working.
active_task_ = last_task;
last_task->is_synchronized_ = true;
result_cont = std::move(last_task->continuation_);
PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
return false;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment