Commit 8668cad2 by FritzFlorian

WIP: Add first performance tests of single-threaded execution.

We relaxed some of the memory-ordering constraints in the lock-free deque and still need to verify that this is correct. If it is, single-threaded performance looks very good.
parent c2d4bc25
Pipeline #1337 failed in 38 seconds
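The "memory constraints" mentioned in the commit message are the C++ memory orderings in the bounded work-stealing deque (see the bounded_ws_deque hunks at the end of this diff). As a standalone illustration of the guarantee the release store in push_bottom relies on, here is a minimal acquire/release sketch; it is not part of the commit, and the names owner_push and thief_observe are made up for this example:

// Standalone sketch (not part of this commit): the owner publishes an item
// with a release store on `bottom`; a thief that reads `bottom` with an
// acquire load is then guaranteed to also see the item written before that
// store. This is the pairing a relaxed push_bottom/steal path relies on.
#include <atomic>
#include <cassert>
#include <thread>

static int item_array[8];
static std::atomic<int> bottom{0};

void owner_push(int value) {
  int b = bottom.load(std::memory_order_relaxed);  // only the owner ever writes bottom
  item_array[b] = value;                           // plain (non-atomic) payload write
  bottom.store(b + 1, std::memory_order_release);  // publish: payload happens-before this store
}

void thief_observe() {
  int b = bottom.load(std::memory_order_acquire);  // synchronizes-with the release store
  if (b > 0) {
    assert(item_array[b - 1] == 42);  // visible because of the acquire/release pair
  }
}

int main() {
  std::thread thief(thief_observe);
  owner_push(42);
  thief.join();
  return 0;
}

With a relaxed store on bottom instead of a release store, the assert could fail, because the new counter value could become visible without the payload write. That is the property the release ordering in push_bottom keeps while dropping the full seq_cst barrier of the previously defaulted store; whether the weakened orderings in this commit are sufficient everywhere is the open question the message above raises.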
#include <pls/pls.h>
#include <pls/internal/helpers/profiler.h>
#include <pls/internal/helpers/mini_benchmark.h>
#include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/scheduling/parallel_result.h"
#include "pls/internal/scheduling/scheduler_memory.h"
using namespace pls::internal::scheduling;
#include <iostream>
#include <complex>
#include <vector>
#include <atomic>
#include <chrono>
std::atomic<unsigned long> count;
static constexpr int CUTOFF = 16;
static constexpr int NUM_ITERATIONS = 1000;
static constexpr int INPUT_SIZE = 8192;
typedef std::vector<std::complex<double>> complex_vector;
@@ -39,22 +42,38 @@ void combine(complex_vector::iterator data, int n) {
}
}
-void fft(complex_vector::iterator data, int n) {
+void fft_normal(complex_vector::iterator data, int n) {
if (n < 2) {
// count++;
return;
}
divide(data, n);
fft_normal(data, n / 2);
fft_normal(data + n / 2, n / 2);
combine(data, n);
}
parallel_result<short> fft(complex_vector::iterator data, int n) {
if (n < 2) {
return 0;
}
divide(data, n);
if (n <= CUTOFF) {
-fft(data, n / 2);
-fft(data + n / 2, n / 2);
+fft_normal(data, n / 2);
+fft_normal(data + n / 2, n / 2);
+return 0;
} else {
-pls::invoke(
-[&] { fft(data, n / 2); },
-[&] { fft(data + n / 2, n / 2); }
-);
+return scheduler::par([=]() {
+return fft(data, n / 2);
+}, [=]() {
+return fft(data + n / 2, n / 2);
+}).then([=](int, int) {
+combine(data, n);
+return 0;
+});
}
-combine(data, n);
}
complex_vector prepare_input(int input_size) {
@@ -73,14 +92,55 @@ complex_vector prepare_input(int input_size) {
return data;
}
-static constexpr int NUM_ITERATIONS = 1000;
constexpr size_t NUM_THREADS = 1;
constexpr size_t NUM_TASKS = 64;
constexpr size_t MAX_TASK_STACK_SIZE = 0;
constexpr size_t NUM_CONTS = 64;
constexpr size_t MAX_CONT_SIZE = 192;
int main() {
PROFILE_ENABLE
complex_vector initial_input = prepare_input(INPUT_SIZE);
-pls::internal::helpers::run_mini_benchmark([&] {
-complex_vector input = initial_input;
-fft(input.begin(), input.size());
-}, 7, 1000);
static_scheduler_memory<NUM_THREADS,
NUM_TASKS,
MAX_TASK_STACK_SIZE,
NUM_CONTS,
MAX_CONT_SIZE> static_scheduler_memory;
scheduler scheduler{static_scheduler_memory, NUM_THREADS};
count.store(0);
auto start = std::chrono::steady_clock::now();
for (int i = 0; i < NUM_ITERATIONS; i++) {
complex_vector input_2(initial_input);
scheduler.perform_work([&]() {
return scheduler::par([&]() {
return fft(input_2.begin(), INPUT_SIZE);
}, []() {
return parallel_result<int>{0};
}).then([](int, int) {
return 0;
});
});
}
auto end = std::chrono::steady_clock::now();
std::cout << "Count: " << count.load() << std::endl;
std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
count.store(0);
start = std::chrono::steady_clock::now();
for (int i = 0; i < NUM_ITERATIONS; i++) {
complex_vector input_1(initial_input);
fft_normal(input_1.begin(), INPUT_SIZE);
}
end = std::chrono::steady_clock::now();
std::cout << "Count: " << count.load() << std::endl;
std::cout << "Normal: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()
<< std::endl;
PROFILE_SAVE("test_profile.prof")
return 0;
}
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some
# array calculations.
set(CMAKE_CXX_FLAGS_RELEASE "-O2 -march=native")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else ()
set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
......
@@ -34,7 +34,7 @@ add_library(pls STATIC
include/pls/internal/base/barrier.h src/internal/base/barrier.cpp
include/pls/internal/base/system_details.h
include/pls/internal/base/error_handling.h
-include/pls/internal/base/alignment.h
+include/pls/internal/base/alignment.h src/internal/base/alignment.cpp
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/aligned_stack_impl.h
@@ -58,7 +58,7 @@ add_library(pls STATIC
include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
include/pls/internal/scheduling/cont_manager.h
include/pls/internal/scheduling/continuation.h
-include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h src/internal/base/alignment.cpp src/internal/base/alignment.h)
+include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h)
# Add everything in `./include` to be in the include path of this project
target_include_directories(pls
......
@@ -36,11 +36,11 @@ class bounded_ws_deque {
void push_bottom(T item) {
item_array_[bottom_] = item;
local_bottom_++;
-bottom_.store(local_bottom_);
+bottom_.store(local_bottom_, std::memory_order_release);
}
bool is_empty() {
-return top_.load().value < bottom_;
+return top_.load().value < bottom_.load();
}
optional<T> pop_top() {
@@ -66,11 +66,11 @@ class bounded_ws_deque {
}
local_bottom_--;
-bottom_.store(local_bottom_);
+bottom_.store(local_bottom_, std::memory_order_seq_cst);
optional<T> result(item_array_[local_bottom_]);
-stamped_integer old_top = top_.load();
+stamped_integer old_top = top_.load(std::memory_order_acquire);
if (local_bottom_ > old_top.value) {
// Enough distance to just return the value
return result;
......
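For context on the seq_cst store above: in pop_bottom the owner must publish its decremented bottom_ before it reads top_; if that store could be reordered after the load, the owner and a concurrent thief could each miss the other's progress and both take the last remaining item. Stripped of the deque details, this is the classic store-buffering litmus pattern. A standalone sketch follows (the claim flags are illustrative and not taken from the deque; whether the acquire load on top_ above is strong enough on the stealing side is exactly what the commit message says still needs to be verified):

// Store-buffering (Dekker-style) litmus test: with seq_cst on both the
// stores and the loads, the two threads can never both read 0, so at most
// one of them can "win" the race for the last item.
#include <atomic>
#include <cassert>
#include <thread>

std::atomic<int> owner_claim{0};  // stands in for the decremented bottom_
std::atomic<int> thief_claim{0};  // stands in for the advanced top_
std::atomic<int> winners{0};

void owner_pop_bottom() {
  owner_claim.store(1, std::memory_order_seq_cst);       // announce own claim first
  if (thief_claim.load(std::memory_order_seq_cst) == 0)  // then check for a competitor
    winners.fetch_add(1);
}

void thief_pop_top() {
  thief_claim.store(1, std::memory_order_seq_cst);
  if (owner_claim.load(std::memory_order_seq_cst) == 0)
    winners.fetch_add(1);
}

int main() {
  std::thread a(owner_pop_bottom), b(thief_pop_top);
  a.join();
  b.join();
  assert(winners.load() <= 1);  // both winning would mean the last item was taken twice
  return 0;
}

If the store/load pairs are weakened (for example to relaxed), each store may be reordered after the corresponding load, both threads can read 0, and the assert can fail; in deque terms that would be the last task being executed twice.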