diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp index 80de92c..e890d9e 100644 --- a/app/benchmark_fft/main.cpp +++ b/app/benchmark_fft/main.cpp @@ -1,13 +1,16 @@ -#include -#include -#include +#include "pls/internal/scheduling/scheduler.h" +#include "pls/internal/scheduling/parallel_result.h" +#include "pls/internal/scheduling/scheduler_memory.h" +using namespace pls::internal::scheduling; #include #include #include +#include + +std::atomic count; static constexpr int CUTOFF = 16; -static constexpr int NUM_ITERATIONS = 1000; static constexpr int INPUT_SIZE = 8192; typedef std::vector> complex_vector; @@ -39,22 +42,38 @@ void combine(complex_vector::iterator data, int n) { } } -void fft(complex_vector::iterator data, int n) { +void fft_normal(complex_vector::iterator data, int n) { if (n < 2) { +// count++; return; } divide(data, n); + fft_normal(data, n / 2); + fft_normal(data + n / 2, n / 2); + combine(data, n); +} + +parallel_result fft(complex_vector::iterator data, int n) { + if (n < 2) { + return 0; + } + + divide(data, n); if (n <= CUTOFF) { - fft(data, n / 2); - fft(data + n / 2, n / 2); + fft_normal(data, n / 2); + fft_normal(data + n / 2, n / 2); + return 0; } else { - pls::invoke( - [&] { fft(data, n / 2); }, - [&] { fft(data + n / 2, n / 2); } - ); + return scheduler::par([=]() { + return fft(data, n / 2); + }, [=]() { + return fft(data + n / 2, n / 2); + }).then([=](int, int) { + combine(data, n); + return 0; + }); } - combine(data, n); } complex_vector prepare_input(int input_size) { @@ -73,14 +92,55 @@ complex_vector prepare_input(int input_size) { return data; } +static constexpr int NUM_ITERATIONS = 1000; +constexpr size_t NUM_THREADS = 1; + +constexpr size_t NUM_TASKS = 64; +constexpr size_t MAX_TASK_STACK_SIZE = 0; + +constexpr size_t NUM_CONTS = 64; +constexpr size_t MAX_CONT_SIZE = 192; + int main() { - PROFILE_ENABLE complex_vector initial_input = prepare_input(INPUT_SIZE); - pls::internal::helpers::run_mini_benchmark([&] { - complex_vector input = initial_input; - fft(input.begin(), input.size()); - }, 7, 1000); + static_scheduler_memory static_scheduler_memory; + + scheduler scheduler{static_scheduler_memory, NUM_THREADS}; + + count.store(0); + auto start = std::chrono::steady_clock::now(); + for (int i = 0; i < NUM_ITERATIONS; i++) { + complex_vector input_2(initial_input); + scheduler.perform_work([&]() { + return scheduler::par([&]() { + return fft(input_2.begin(), INPUT_SIZE); + }, []() { + return parallel_result{0}; + }).then([](int, int) { + return 0; + }); + }); + } + auto end = std::chrono::steady_clock::now(); + std::cout << "Count: " << count.load() << std::endl; + std::cout << "Framework: " << std::chrono::duration_cast(end - start).count() + << std::endl; + + count.store(0); + start = std::chrono::steady_clock::now(); + for (int i = 0; i < NUM_ITERATIONS; i++) { + complex_vector input_1(initial_input); + fft_normal(input_1.begin(), INPUT_SIZE); + } + end = std::chrono::steady_clock::now(); + std::cout << "Count: " << count.load() << std::endl; + std::cout << "Normal: " << std::chrono::duration_cast(end - start).count() + << std::endl; - PROFILE_SAVE("test_profile.prof") + return 0; } diff --git a/cmake/SetupOptimizationLevel.cmake b/cmake/SetupOptimizationLevel.cmake index 05c18cc..f2f66ec 100644 --- a/cmake/SetupOptimizationLevel.cmake +++ b/cmake/SetupOptimizationLevel.cmake @@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") # but inlining functions and SIMD/Vectorization is # only enabled by -O3, thus it's way faster in some # array calculations. - set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native") + set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else () set(CMAKE_CXX_FLAGS_DEBUG "-g -O0") diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 12e7e97..9e9dd85 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -34,7 +34,7 @@ add_library(pls STATIC include/pls/internal/base/barrier.h src/internal/base/barrier.cpp include/pls/internal/base/system_details.h include/pls/internal/base/error_handling.h - include/pls/internal/base/alignment.h + include/pls/internal/base/alignment.h src/internal/base/alignment.cpp include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp include/pls/internal/data_structures/aligned_stack_impl.h @@ -58,7 +58,7 @@ add_library(pls STATIC include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp include/pls/internal/scheduling/cont_manager.h include/pls/internal/scheduling/continuation.h - include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h src/internal/base/alignment.cpp src/internal/base/alignment.h) + include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h) # Add everything in `./include` to be in the include path of this project target_include_directories(pls diff --git a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h index ccc6bfa..7a37f7b 100644 --- a/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h +++ b/lib/pls/include/pls/internal/data_structures/bounded_ws_deque.h @@ -36,11 +36,11 @@ class bounded_ws_deque { void push_bottom(T item) { item_array_[bottom_] = item; local_bottom_++; - bottom_.store(local_bottom_); + bottom_.store(local_bottom_, std::memory_order_release); } bool is_empty() { - return top_.load().value < bottom_; + return top_.load().value < bottom_.load(); } optional pop_top() { @@ -66,11 +66,11 @@ class bounded_ws_deque { } local_bottom_--; - bottom_.store(local_bottom_); + bottom_.store(local_bottom_, std::memory_order_seq_cst); optional result(item_array_[local_bottom_]); - stamped_integer old_top = top_.load(); + stamped_integer old_top = top_.load(std::memory_order_acquire); if (local_bottom_ > old_top.value) { // Enough distance to just return the value return result;