From 8668cad280bc6908ff7b0d998ef444bbd41ab88c Mon Sep 17 00:00:00 2001 From: FritzFlorian Date: Mon, 25 Nov 2019 10:03:54 +0100 Subject: [PATCH] WIP: Add first performance tests of single threaded execution. We changed up some of the memory constraints in the lock free deque and will need to see if this is ok. If so, the single threaded performance looks very good. --- app/benchmark_fft/main.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------ cmake/SetupOptimizationLevel.cmake | 2 +- lib/pls/CMakeLists.txt | 4 ++-- lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h | 8 ++++---- 4 files changed, 85 insertions(+), 25 deletions(-) diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp index 80de92c..e890d9e 100644 --- a/app/benchmark_fft/main.cpp +++ b/app/benchmark_fft/main.cpp @@ -1,13 +1,16 @@ -#include -#include -#include +#include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/parallel_result.h" +#include "pls/internal/scheduling/scheduler_memory.h" +using namespace pls::internal::scheduling; #include #include #include +#include + +std::atomic count; static constexpr int CUTOFF = 16; -static constexpr int NUM_ITERATIONS = 1000; static constexpr int INPUT_SIZE = 8192; typedef std::vector> complex_vector; @@ -39,22 +42,38 @@ void combine(complex_vector::iterator data, int n) { } } -void fft(complex_vector::iterator data, int n) { +void fft_normal(complex_vector::iterator data, int n) { if (n < 2) { +// count++; return; } divide(data, n); + fft_normal(data, n / 2); + fft_normal(data + n / 2, n / 2); + combine(data, n); +} + +parallel_result fft(complex_vector::iterator data, int n) { + if (n < 2) { + return 0; + } + + divide(data, n); if (n <= CUTOFF) { - fft(data, n / 2); - fft(data + n / 2, n / 2); + fft_normal(data, n / 2); + fft_normal(data + n / 2, n / 2); + return 0; } else { - pls::invoke( - [&] { fft(data, n / 2); }, - [&] { fft(data + n / 2, n / 2); } - ); + return scheduler::par([=]() { + return fft(data, n / 2); + }, [=]() { + return fft(data + n / 2, n / 2); + }).then([=](int, int) { + combine(data, n); + return 0; + }); } - combine(data, n); } complex_vector prepare_input(int input_size) { @@ -73,14 +92,55 @@ complex_vector prepare_input(int input_size) { return data; } +static constexpr int NUM_ITERATIONS = 1000; +constexpr size_t NUM_THREADS = 1; + +constexpr size_t NUM_TASKS = 64; +constexpr size_t MAX_TASK_STACK_SIZE = 0; + +constexpr size_t NUM_CONTS = 64; +constexpr size_t MAX_CONT_SIZE = 192; + int main() { - PROFILE_ENABLE complex_vector initial_input = prepare_input(INPUT_SIZE); - pls::internal::helpers::run_mini_benchmark([&] { - complex_vector input = initial_input; - fft(input.begin(), input.size()); - }, 7, 1000); + static_scheduler_memory static_scheduler_memory; + + scheduler scheduler{static_scheduler_memory, NUM_THREADS}; + + count.store(0); + auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < NUM_ITERATIONS; i++) { + complex_vector input_2(initial_input); + scheduler.perform_work([&]() { + return scheduler::par([&]() { + return fft(input_2.begin(), INPUT_SIZE); + }, []() { + return parallel_result{0}; + }).then([](int, int) { + return 0; + }); + }); + } + auto end = std::chrono::steady_clock::now(); + std::cout << "Count: " << count.load() << std::endl; + std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() + << std::endl; + + count.store(0); + start = std::chrono::steady_clock::now(); + for (int i = 0; i < NUM_ITERATIONS; i++) { + complex_vector input_1(initial_input); + fft_normal(input_1.begin(), INPUT_SIZE); + } + end = std::chrono::steady_clock::now(); + std::cout << "Count: " << count.load() << std::endl; + std::cout << "Normal: " << std::chrono::duration_cast(end - start).count() + << std::endl; - PROFILE_SAVE("test_profile.prof") + return 0; } diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake index 05c18cc..f2f66ec 100644 --- a/cmake/SetupOptimizationLevel.cmake +++ b/cmake/SetupOptimizationLevel.cmake @@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") # but inlining functions and SIMD/Vectorization is # only enabled by -O3, thus it's way faster in some # array calculations. - set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else () set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 12e7e97..9e9dd85 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -34,7 +34,7 @@ add_library(pls STATIC include/pls/internal/base/barrier.h src/internal/base/barrier.cpp include/pls/internal/base/system_details.h include/pls/internal/base/error_handling.h - include/pls/internal/base/alignment.h + include/pls/internal/base/alignment.h src/internal/base/alignment.cpp include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp include/pls/internal/data_structures/aligned_stack_impl.h @@ -58,7 +58,7 @@ add_library(pls STATIC include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp include/pls/internal/scheduling/cont_manager.h include/pls/internal/scheduling/continuation.h - include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h src/internal/base/alignment.cpp src/internal/base/alignment.h) + include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h) # Add everything in `./include` to be in the include path of this project target_include_directories(pls diff --git a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h index ccc6bfa..7a37f7b 100644 --- a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h +++ b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h @@ -36,11 +36,11 @@ class bounded_ws_deque { void push_bottom(T item) { item_array_[bottom_] = item; local_bottom_++; - bottom_.store(local_bottom_); + bottom_.store(local_bottom_, std::memory_order_release); } bool is_empty() { - return top_.load().value < bottom_; + return top_.load().value < bottom_.load(); } optional pop_top() { @@ -66,11 +66,11 @@ class bounded_ws_deque { } local_bottom_--; - bottom_.store(local_bottom_); + bottom_.store(local_bottom_, std::memory_order_seq_cst); optional result(item_array_[local_bottom_]); - stamped_integer old_top = top_.load(); + stamped_integer old_top = top_.load(std::memory_order_acquire); if (local_bottom_ > old_top.value) { // Enough distance to just return the value return result; -- libgit2 0.26.0