Commit 3c60e8d7 by FritzFlorian

First working version on both ARM and x86.

parent 731b47c5
Pipeline #1403 failed with stages
in 39 seconds
add_executable(benchmark_fft_pls_v2 main.cpp)
target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
add_executable(benchmark_fft_pls_v3 main.cpp)
target_link_libraries(benchmark_fft_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER)
target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
target_link_libraries(benchmark_fft_pls_v3 easy_profiler)
endif ()
......@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 4;
constexpr int MAX_STACK_SIZE = 1024 * 64;
static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
......@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
benchmark_runner::read_args(argc, argv, num_threads, directory);
string test_name = to_string(num_threads) + ".csv";
string full_directory = directory + "/PLS_v2/";
string full_directory = directory + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name};
fft::complex_vector data = fft::generate_input();
......
add_executable(benchmark_fib_pls_v2 main.cpp)
target_link_libraries(benchmark_fib_pls_v2 pls benchmark_runner benchmark_base)
add_executable(benchmark_fib_pls_v3 main.cpp)
target_link_libraries(benchmark_fib_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER)
target_link_libraries(benchmark_fib_pls_v2 easy_profiler)
target_link_libraries(benchmark_fib_pls_v3 easy_profiler)
endif ()
......@@ -33,7 +33,7 @@ int pls_fib(int n) {
constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 1;
constexpr int MAX_STACK_SIZE = 1024 * 4;
static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
......
add_executable(benchmark_matrix_pls_v2 main.cpp)
target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base)
add_executable(benchmark_matrix_pls_v3 main.cpp)
target_link_libraries(benchmark_matrix_pls_v3 pls benchmark_runner benchmark_base)
if (EASY_PROFILER)
target_link_libraries(benchmark_matrix_pls_v2 easy_profiler)
target_link_libraries(benchmark_matrix_pls_v3 easy_profiler)
endif ()
......@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 32;
constexpr int MAX_STACK_SIZE = 1024 * 1;
constexpr int MAX_STACK_SIZE = 1024 * 4;
static_scheduler_memory<MAX_NUM_THREADS,
MAX_NUM_TASKS,
......@@ -35,7 +35,7 @@ int main(int argc, char **argv) {
benchmark_runner::read_args(argc, argv, num_threads, directory);
string test_name = to_string(num_threads) + ".csv";
string full_directory = directory + "/PLS_v2/";
string full_directory = directory + "/PLS_v3/";
benchmark_runner runner{full_directory, test_name};
pls_matrix<double, matrix::MATRIX_SIZE> a;
......
......@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some
# array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
......@@ -111,6 +111,9 @@ void lambda_capture_callback(fcontext::transfer_t transfer) {
lambda_capture->~T();
continuation_t cont_pointer = cont.consume();
if (cont_pointer == nullptr) {
printf("Error!!!\n");
}
fcontext::jump_fcontext(cont_pointer, (void *) 0);
}
......
......@@ -50,10 +50,6 @@ struct continuation {
}
continuation_t consume() {
if (cont_pointer_ == nullptr) {
printf("Error!\n");
}
auto tmp = cont_pointer_;
cont_pointer_ = nullptr;
return tmp;
......
......@@ -40,7 +40,7 @@ add_library(pls STATIC
include/pls/internal/scheduling/task_manager_impl.h
include/pls/internal/scheduling/static_scheduler_memory.h
include/pls/internal/scheduling/heap_scheduler_memory.h
src/internal/scheduling/task_manager.cpp)
src/internal/scheduling/task_manager.cpp src/internal/scheduling/thread_state.cpp)
# Dependencies for pls
target_link_libraries(pls Threads::Threads)
......
......@@ -16,6 +16,6 @@
void pls_error(const char *msg);
// TODO: Distinguish between debug/internal asserts and production asserts.
#define PLS_ASSERT(cond, msg) // if (!(cond)) { pls_error(msg); }
#define PLS_ASSERT(cond, msg) if (!(cond)) { pls_error(msg); }
#endif //PLS_ERROR_HANDLING_H
......@@ -67,6 +67,24 @@ inline void relax_cpu() {
#endif
}
/**
* PLS_NOINLINE: marks a function so that the compiler does not inline it.
*
* The spelling of this attribute is compiler specific. Compilers that are not
* handled below are rejected with an #error instead of silently falling back
* to a function that may be inlined.
*
* Why this matters: some functions in the codebase MUST be re-evaluated after
* fiber switches. By preventing their inlining we prevent the compiler from
* caching their results across such a switch.
*/
#if defined(_MSC_VER)
// MSVC spelling.
#define PLS_NOINLINE __declspec(noinline)
#elif defined(__GNUC__) && __GNUC__ > 3
// Compilers that define __GNUC__ with a major version above 3.
#if defined(__CUDACC__)
// NOTE(review): presumably the plain 'noinline' spelling is used here because
// CUDA headers define '__noinline__' as a macro of their own - confirm.
#define PLS_NOINLINE __attribute__ ((noinline))
#else
#define PLS_NOINLINE __attribute__ ((__noinline__))
#endif
#else
// No known way to prevent inlining: fail the build rather than risk stale results.
#error "PLS requires inline prevention for certain functions."
#endif
}
}
}
......
......@@ -78,20 +78,20 @@ class external_trading_deque {
auto expected_stamp = bot_internal_.stamp;
auto &current_entry = entries_[bot_internal_.value];
// Publish the prepared task in the deque.
current_entry.forwarding_stamp_.store(expected_stamp, std::memory_order_relaxed);
current_entry.traded_task_.store(published_task, std::memory_order_relaxed);
// Field that all threads synchronize on.
// This happens not in the deque itself, but in the published task.
traded_cas_field sync_cas_field;
sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
published_task->external_trading_deque_cas_.store(sync_cas_field);
// Publish the prepared task in the deque.
current_entry.forwarding_stamp_.store(expected_stamp);
current_entry.traded_task_.store(published_task);
published_task->external_trading_deque_cas_.store(sync_cas_field, std::memory_order_release);
// Advance the bot pointer. Linearization point for making the task public.
bot_internal_.stamp++;
bot_internal_.value++;
bot_.store(bot_internal_.value);
bot_.store(bot_internal_.value, std::memory_order_release);
}
void reset_bot_and_top() {
......@@ -104,7 +104,7 @@ class external_trading_deque {
void decrease_bot() {
bot_internal_.value--;
bot_.store(bot_internal_.value);
bot_.store(bot_internal_.value, std::memory_order_relaxed);
}
/**
......@@ -120,15 +120,17 @@ class external_trading_deque {
decrease_bot();
auto &current_entry = entries_[bot_internal_.value];
auto *popped_task = current_entry.traded_task_.load();
auto expected_stamp = current_entry.forwarding_stamp_.load();
auto *popped_task = current_entry.traded_task_.load(std::memory_order_relaxed);
auto expected_stamp = current_entry.forwarding_stamp_.load(std::memory_order_relaxed);
// We know what value must be in the cas field if no other thread stole it.
traded_cas_field expected_sync_cas_field;
expected_sync_cas_field.fill_with_stamp(expected_stamp, thread_id_);
traded_cas_field empty_cas_field;
if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field, empty_cas_field)) {
if (popped_task->external_trading_deque_cas_.compare_exchange_strong(expected_sync_cas_field,
empty_cas_field,
std::memory_order_acq_rel)) {
return optional<task *>{popped_task};
} else {
reset_bot_and_top();
......
......@@ -27,10 +27,11 @@ class scheduler::init_function_impl : public init_function {
void run() override {
auto &root_task = thread_state::get().get_task_manager().get_active_task();
root_task.run_as_task([&](context_switcher::continuation cont) {
thread_state::get().set_main_continuation(std::move(cont));
thread_state::get().main_continuation() = std::move(cont);
function_();
thread_state::get().get_scheduler().work_section_done_.store(true);
return std::move(thread_state::get().get_main_continuation());
PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");
return std::move(thread_state::get().main_continuation());
});
}
......
......@@ -52,7 +52,7 @@ class task_manager {
void spawn_child(F &&lambda);
void sync();
task* steal_task(task_manager &stealing_task_manager);
task *steal_task(task_manager &stealing_task_manager);
bool try_clean_return(context_switcher::continuation &result_cont);
......
......@@ -71,7 +71,7 @@ void task_manager::spawn_child(F &&lambda) {
if (continuation.valid()) {
// We jumped in here from the main loop, keep track!
thread_state::get().set_main_continuation(std::move(continuation));
thread_state::get().main_continuation() = std::move(continuation);
}
}
......
......@@ -6,6 +6,8 @@
#include <chrono>
#include <utility>
#include "pls/internal/base/system_details.h"
#include "context_switcher/continuation.h"
namespace pls {
......@@ -37,9 +39,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
* Must only be called on threads that are associated with a thread_state,
* this will most likely be threads created by the scheduler.
*
* Each call is guaranteed to be a new lookup, i.e. it is not cached across fiber context switches.
*
* @return The thread_state of this thread.
*/
static thread_state &get() { return *base::this_thread::state<thread_state>(); }
static thread_state &PLS_NOINLINE get();
unsigned get_id() { return id_; }
void set_id(unsigned id) {
......@@ -54,11 +58,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
return random_();
}
void set_main_continuation(context_switcher::continuation &&continuation) {
main_loop_continuation_ = std::move(continuation);
}
context_switcher::continuation get_main_continuation() {
return std::move(main_loop_continuation_);
context_switcher::continuation &main_continuation() {
return main_loop_continuation_;
}
// Do not allow move/copy operations.
......
#include <tuple>
#include "pls/internal/scheduling/task_manager.h"
#include "pls/internal/scheduling/task.h"
......@@ -76,7 +74,6 @@ task *task_manager::steal_task(task_manager &stealing_task_manager) {
}
void task_manager::push_resource_on_task(task *target_task, task *spare_task_chain) {
PLS_ASSERT(check_task_chain_backward(spare_task_chain), "Must only push proper task chains.");
PLS_ASSERT(target_task->thread_id_ != spare_task_chain->thread_id_,
"Makes no sense to push task onto itself, as it is not clean by definition.");
PLS_ASSERT(target_task->depth_ == spare_task_chain->depth_, "Must only push tasks with correct depth.");
......@@ -90,11 +87,11 @@ void task_manager::push_resource_on_task(task *target_task, task *spare_task_cha
if (current_root.value == 0) {
// Empty, simply push in with no successor
spare_task_chain->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
spare_task_chain->resource_stack_next_.store(nullptr);
} else {
// Already an entry. Find its corresponding task and set it as our successor.
auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
spare_task_chain->resource_stack_next_.store(current_root_task, std::memory_order_relaxed);
spare_task_chain->resource_stack_next_.store(current_root_task);
}
} while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
......@@ -112,7 +109,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
} else {
// Found something, try to pop it
auto *current_root_task = find_task(current_root.value - 1, target_task->depth_);
auto *next_stack_task = current_root_task->resource_stack_next_.load(std::memory_order_relaxed);
auto *next_stack_task = current_root_task->resource_stack_next_.load();
target_root.stamp = current_root.stamp + 1;
target_root.value = next_stack_task != nullptr ? next_stack_task->thread_id_ + 1 : 0;
......@@ -122,7 +119,7 @@ task *task_manager::pop_resource_from_task(task *target_task) {
} while (!target_task->resource_stack_root_.compare_exchange_strong(current_root, target_root));
PLS_ASSERT(check_task_chain_backward(output_task), "Must only pop proper task chains.");
output_task->resource_stack_next_.store(nullptr, std::memory_order_relaxed);
output_task->resource_stack_next_.store(nullptr);
return output_task;
}
......@@ -187,7 +184,8 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
}
// jump back to the continuation in main scheduling loop, time to steal some work
result_cont = thread_state::get().get_main_continuation();
result_cont = std::move(thread_state::get().main_continuation());
PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
return true;
} else {
// Make sure that we are the owner of this full continuation/task chain.
......@@ -198,13 +196,16 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
active_task_ = last_task;
result_cont = std::move(last_task->continuation_);
PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
return false;
}
}
bool task_manager::check_task_chain_forward(task *start_task) {
while (start_task->next_ != nullptr) {
PLS_ASSERT(start_task->next_->prev_ == start_task, "Chain must have correct prev/next fields for linked list!");
if (start_task->next_->prev_ != start_task) {
return false;
}
start_task = start_task->next_;
}
return true;
......@@ -212,17 +213,16 @@ bool task_manager::check_task_chain_forward(task *start_task) {
/**
 * Consistency check for the doubly linked task chain, walking from
 * start_task towards the head of the chain via the prev_ pointers.
 *
 * For every task that has a predecessor, the predecessor's next_ pointer
 * must point back at that task.
 *
 * @param start_task Task to start the walk from.
 * @return false as soon as an inconsistent prev_/next_ pair is found,
 *         true if the walk reaches a task without predecessor.
 */
bool task_manager::check_task_chain_backward(task *start_task) {
while (start_task->prev_ != nullptr) {
PLS_ASSERT(start_task->prev_->next_ == start_task, "Chain must have correct prev/next fields for linked list!");
if (start_task->prev_->next_ != start_task) {
return false;
}
// Link is consistent, step one task towards the head of the chain.
start_task = start_task->prev_;
}
return true;
}
/**
 * Consistency check for the task chain around active_task_.
 * Runs the backward check (via prev_) and the forward check (via next_),
 * both starting at active_task_.
 */
bool task_manager::check_task_chain() {
check_task_chain_backward(active_task_);
check_task_chain_forward(active_task_);
return true;
return check_task_chain_backward(active_task_) && check_task_chain_forward(active_task_);
}
}
......
#include "pls/internal/scheduling/thread_state.h"
#include "pls/internal/base/thread.h"
namespace pls {
namespace internal {
namespace scheduling {

/**
 * Returns the thread_state registered for the calling thread.
 *
 * The lookup through base::this_thread::state is performed on every call;
 * nothing is stored between calls.
 */
thread_state &thread_state::get() {
  thread_state &current_state = *base::this_thread::state<thread_state>();
  return current_state;
}

}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment