diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1afba5a..be1f6ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ add_subdirectory(lib/pls)
 add_subdirectory(app/playground)
 add_subdirectory(app/test_for_new)
 add_subdirectory(app/invoke_parallel)
+add_subdirectory(app/benchmark_fft)
 
 # Add optional tests
 option(PACKAGE_TESTS "Build the tests" ON)
diff --git a/README.md b/README.md
index 955f49d..d62f410 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,22 @@ After this is done you can use normal `make` commands like
 `make` to build everything, `make <target>` to build a target
 or `make install` to install the library globally.
 
+Available Settings:
+- `-DEASY_PROFILER=ON/OFF`
+  - default OFF
+  - Enabling links the easy_profiler library and enables its macros
+  - Enabling has a performance hit (do not use in releases)
+- `-DADDRESS_SANITIZER=ON/OFF`
+  - default OFF
+  - Links the address sanitizer into the executable
+  - Only one sanitizer can be active at once
+  - Enabling has a performance hit (do not use in releases)
+- `-DTHREAD_SANITIZER=ON/OFF`
+  - default OFF
+  - Links the thread/data-race sanitizer into the executable
+  - Only one sanitizer can be active at once
+  - Enabling has a performance hit (do not use in releases)
+
 ### Testing
 
 Testing is done using [Catch2](https://github.com/catchorg/Catch2/)
diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
new file mode 100644
index 0000000..41591e5
--- /dev/null
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(benchmark_fft main.cpp)
+target_link_libraries(benchmark_fft pls)
+if(EASY_PROFILER)
+    target_link_libraries(benchmark_fft easy_profiler)
+endif()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
new file mode 100644
index 0000000..e5f3453
--- /dev/null
+++ b/app/benchmark_fft/main.cpp
@@ -0,0 +1,87 @@
+#include <pls/pls.h>
+#include <pls/internal/helpers/profiler.h>
+#include <pls/internal/helpers/mini_benchmark.h>
+
+#include <cmath>
+#include <complex>
+#include <vector>
+
+static constexpr int CUTOFF = 10;
+static constexpr int NUM_ITERATIONS = 1000;
+static constexpr int INPUT_SIZE = 2064;
+typedef std::vector<std::complex<double>> complex_vector;
+
+void divide(complex_vector::iterator data, int n) {
+  complex_vector tmp_odd_elements(n / 2);
+  for (int i = 0; i < n / 2; i++) {
+    tmp_odd_elements[i] = data[i * 2 + 1];
+  }
+  for (int i = 0; i < n / 2; i++) {
+    data[i] = data[i * 2];
+  }
+  for (int i = 0; i < n / 2; i++) {
+    data[i + n / 2] = tmp_odd_elements[i];
+  }
+}
+
+void combine(complex_vector::iterator data, int n) {
+  for (int i = 0; i < n / 2; i++) {
+    std::complex<double> even = data[i];
+    std::complex<double> odd = data[i + n / 2];
+
+    // w is the "twiddle-factor".
+    // This could be cached, but we run the same 'base' algorithm parallel/serial,
+    // so it won't impact the performance comparison.
+    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
+
+    data[i] = even + w * odd;
+    data[i + n / 2] = even - w * odd;
+  }
+}
+
+void fft(complex_vector::iterator data, int n) {
+  if (n < 2) {
+    return;
+  }
+
+  divide(data, n);
+  if (n <= CUTOFF) {
+    fft(data, n / 2);
+    fft(data + n / 2, n / 2);
+  } else {
+    pls::invoke_parallel(
+        [&] { fft(data, n / 2); },
+        [&] { fft(data + n / 2, n / 2); }
+    );
+  }
+  combine(data, n);
+}
+
+complex_vector prepare_input(int input_size) {
+  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+  complex_vector data(input_size);
+
+  // Set our input data to match a time series of the known_frequencies.
+  // When applying fft to this time series we should find these frequencies.
+  for (int i = 0; i < input_size; i++) {
+    data[i] = std::complex<double>(0.0, 0.0);
+    for (auto frequency : known_frequencies) {
+      data[i] += sin(2 * M_PI * frequency * i / input_size);
+    }
+  }
+
+  return data;
+}
+
+
+int main() {
+  PROFILE_ENABLE
+  complex_vector initial_input = prepare_input(INPUT_SIZE);
+
+  pls::internal::helpers::run_mini_benchmark([&] {
+    complex_vector input = initial_input;
+    fft(input.begin(), input.size());
+  }, 8, 4000);
+
+  PROFILE_SAVE("test_profile.prof")
+}
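Note on the benchmark above: run_mini_benchmark only measures runtime, so the transform's result is never inspected. A minimal sketch of how the serial path could be sanity-checked by hand; it reuses complex_vector and the fft output from main.cpp above, and the helper name print_strongest_bin is illustrative, not part of the commit:

    #include <algorithm>
    #include <cstdio>

    // Print the dominant frequency bin of an already transformed input.
    // For a real-valued time series only the first half of the bins is unique.
    void print_strongest_bin(const complex_vector& spectrum) {
      auto strongest = std::max_element(
          spectrum.begin(), spectrum.begin() + spectrum.size() / 2,
          [](const std::complex<double>& a, const std::complex<double>& b) {
            return std::abs(a) < std::abs(b);
          });
      std::printf("strongest bin: %td (magnitude %.1f)\n",
                  strongest - spectrum.begin(), std::abs(*strongest));
    }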
diff --git a/app/invoke_parallel/CMakeLists.txt b/app/invoke_parallel/CMakeLists.txt
index adcb6be..944f5ef 100644
--- a/app/invoke_parallel/CMakeLists.txt
+++ b/app/invoke_parallel/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_executable(invoke_parallel main.cpp)
-target_link_libraries(invoke_parallel pls easy_profiler)
+target_link_libraries(invoke_parallel pls)
+if(EASY_PROFILER)
+    target_link_libraries(invoke_parallel easy_profiler)
+endif()
diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp
index e16b8f7..4ae48ef 100644
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -1,7 +1,7 @@
 #include <pls/pls.h>
-#include <easy/profiler.h>
+#include <pls/internal/helpers/profiler.h>
 
 #include <iostream>
 
 static pls::static_scheduler_memory<8, 2 << 14> my_scheduler_memory;
@@ -33,17 +33,19 @@ long fib(long n) {
 }
 
 int main() {
-  EASY_PROFILER_ENABLE;
+  PROFILE_ENABLE
   pls::scheduler scheduler{&my_scheduler_memory, 8};
 
   long result;
   scheduler.perform_work([&] {
-    EASY_MAIN_THREAD;
+    PROFILE_MAIN_THREAD
     // Call looks just the same, only requirement is
     // the enclosure in the perform_work lambda.
-    result = fib(30);
+    for (int i = 0; i < 10; i++) {
+      result = fib(30);
+      std::cout << "Fib(30)=" << result << std::endl;
+    }
   });
 
-  std::cout << "Fib(30)=" << result << std::endl;
-  profiler::dumpBlocksToFile("test_profile.prof");
+  PROFILE_SAVE("test_profile.prof")
 }
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index e9aee51..56c0c8e 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -5,7 +5,7 @@
 #include
 #include
-#include <pls/internal/base/prohibit_new.h>
+#include <pls/internal/helpers/prohibit_new.h>
 
 using namespace pls;
diff --git a/app/test_for_new/main.cpp b/app/test_for_new/main.cpp
index d04050f..2e74529 100644
--- a/app/test_for_new/main.cpp
+++ b/app/test_for_new/main.cpp
@@ -1,5 +1,5 @@
 #include
-#include <pls/internal/base/prohibit_new.h>
+#include <pls/internal/helpers/prohibit_new.h>
 
 using namespace pls::internal::base;
diff --git a/cmake/SetupEasyProfiler.cmake b/cmake/SetupEasyProfiler.cmake
index ddadb5f..745fedc 100644
--- a/cmake/SetupEasyProfiler.cmake
+++ b/cmake/SetupEasyProfiler.cmake
@@ -1,17 +1,19 @@
-# Optional external dependencies
-find_package(easy_profiler)
-
 option(EASY_PROFILER "Enable the profiler" OFF)
 
 if(EASY_PROFILER)
-    if(easy_profiler_FOUND)
+    # Optional external dependency
+    find_package(easy_profiler)
+    if(easy_profiler_FOUND)
+        # Do nothing, definitions are added below
     else()
         message(WARNING "EasyProfiler dependency not found on system, DISABLING it!")
         set(EASY_PROFILER OFF)
     endif()
 endif()
 
-if(NOT EASY_PROFILER)
+if(EASY_PROFILER)
+    add_definitions(-DENABLE_EASY_PROFILER)
+else()
     add_definitions(-DDISABLE_EASY_PROFILER)
 endif()
diff --git a/compare_benchmarks.py b/compare_benchmarks.py
new file mode 100755
index 0000000..56ebddb
--- /dev/null
+++ b/compare_benchmarks.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+import sys
+import os
+
+if len(sys.argv) < 2:
+    print("Please pass the name of the benchmark target as an argument!")
+    exit(1)
+
+target = sys.argv[1]
+print('Comparing current modifications for benchmark target ' + target)
+
+print('Executing current version...')
+print(os.popen('cd cmake-build-release; make ' + target).read())
+current = os.popen('chrt -rr 99 ./cmake-build-release/bin/' + target).read()
+
+print('Executing old version...')
+print(os.popen('git stash push').read())
+print(os.popen('cd cmake-build-release; make ' + target).read())
+before = os.popen('chrt -rr 99 ./cmake-build-release/bin/' + target).read()
+print(os.popen('git stash pop').read())
+
+print('=======================================================')
+current = [float(value) for value in current.split(',')]
+before = [float(value) for value in before.split(',')]
+
+
+def format_change(change):
+    if change > 1.05:
+        color = '31'
+    elif change < 0.95:
+        color = '32'
+    else:
+        color = '30'
+
+    return '\033[1;' + color + ';40m %8.2f' % (change * 100) + ' %'
+
+
+format_string = ' '.join(['%10.2f us'] * len(current))
+print('old: ' + format_string % tuple(before))
+print('new: ' + format_string % tuple(current))
+print('=' * 55)
+change = [c / b for b, c in zip(before, current)]
+formatted_change = ''.join(list(map(format_change, change)))
+print(formatted_change)
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index c90ff97..6cee207 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -3,7 +3,7 @@ add_library(pls STATIC
     src/pls.cpp include/pls/pls.h
     src/internal/base/spin_lock.cpp include/pls/internal/base/spin_lock.h
     src/internal/base/thread.cpp include/pls/internal/base/thread.h
-    include/pls/internal/base/prohibit_new.h
+    include/pls/internal/helpers/prohibit_new.h
     src/internal/scheduling/abstract_task.cpp include/pls/internal/scheduling/abstract_task.h
     src/internal/scheduling/scheduler.cpp include/pls/internal/scheduling/scheduler.h
     src/internal/scheduling/thread_state.cpp include/pls/internal/scheduling/thread_state.h
@@ -14,7 +14,11 @@ add_library(pls STATIC
     src/internal/scheduling/run_on_n_threads_task.cpp include/pls/internal/scheduling/run_on_n_threads_task.h
     src/internal/scheduling/fork_join_task.cpp include/pls/internal/scheduling/fork_join_task.h
     src/internal/base/deque.cpp include/pls/internal/base/deque.h
-    src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h include/pls/internal/base/error_handling.h)
+    src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h
+    include/pls/internal/base/error_handling.h
+    include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
+    include/pls/internal/helpers/profiler.h
+    include/pls/internal/helpers/mini_benchmark.h)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
diff --git a/lib/pls/include/pls/internal/base/aligned_stack.h b/lib/pls/include/pls/internal/base/aligned_stack.h
index c463232..7c16fec 100644
--- a/lib/pls/include/pls/internal/base/aligned_stack.h
+++ b/lib/pls/include/pls/internal/base/aligned_stack.h
@@ -32,13 +32,13 @@ namespace pls {
 
         template<typename T>
         T* push(const T& object) {
-          // Copy-Construct into desired memory location
-          return new (push<T>())T(object);
+          // Placement new into desired memory location
+          return new ((void*)push<T>())T(object);
         }
 
         template<typename T>
-        T* push() {
-          T* result = reinterpret_cast<T*>(head_);
+        void* push() {
+          void* result = reinterpret_cast<void*>(head_);
 
           // Move head to next aligned position after new object
          head_ = next_alignment(head_ + sizeof(T));
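The aligned_stack change above separates raw allocation from construction: push<T>() now hands back untyped memory and the copy-constructing push() placement-news into it. A minimal stand-alone sketch of that pattern (bump_stack, reserve and the alignment choice are illustrative assumptions, not the library's actual implementation):

    #include <cstddef>
    #include <cstdint>
    #include <new>

    struct bump_stack {
      char* head_;

      // Round a pointer up to the next maximally aligned address.
      static char* next_alignment(char* ptr) {
        auto address = reinterpret_cast<std::uintptr_t>(ptr);
        constexpr std::uintptr_t alignment = alignof(std::max_align_t);
        return reinterpret_cast<char*>((address + alignment - 1) & ~(alignment - 1));
      }

      // Reserve untyped, suitably sized storage and advance the head.
      template<typename T>
      void* reserve() {
        void* result = head_;
        head_ = next_alignment(head_ + sizeof(T));
        return result;
      }

      // Copy-construct the object into the reserved slot via placement new.
      template<typename T>
      T* push(const T& object) {
        return new (reserve<T>()) T(object);
      }
    };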
diff --git a/lib/pls/include/pls/internal/helpers/mini_benchmark.h b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
new file mode 100644
index 0000000..d153a32
--- /dev/null
+++ b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
@@ -0,0 +1,53 @@
+
+#ifndef PLS_MINI_BENCHMARK_H
+#define PLS_MINI_BENCHMARK_H
+
+#include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/internal/scheduling/scheduler.h"
+
+#include <chrono>
+#include <iostream>
+
+namespace pls {
+  namespace internal {
+    namespace helpers {
+      // TODO: Clean up (separate into small functions and .cpp file)
+      template<typename Function>
+      void run_mini_benchmark(const Function& lambda, size_t max_threads, long max_runtime_ms=1000) {
+        using namespace std;
+        using namespace pls::internal::scheduling;
+
+        malloc_scheduler_memory scheduler_memory{max_threads};
+        for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
+          scheduler local_scheduler{&scheduler_memory, num_threads};
+
+          chrono::high_resolution_clock::time_point start_time;
+          chrono::high_resolution_clock::time_point end_time;
+          unsigned long iterations = 0;
+          local_scheduler.perform_work([&] {
+            start_time = chrono::high_resolution_clock::now();
+            end_time = start_time;
+            chrono::high_resolution_clock::time_point planned_end_time = start_time + chrono::milliseconds(max_runtime_ms);
+
+            while (end_time < planned_end_time) {
+              lambda();
+              end_time = chrono::high_resolution_clock::now();
+              iterations++;
+            }
+          });
+
+          long time = chrono::duration_cast<chrono::microseconds>(end_time - start_time).count();
+          double time_per_iteration = (double)time / iterations;
+
+          std::cout << time_per_iteration;
+          if (num_threads < max_threads) {
+            std::cout << ",";
+          }
+        }
+        std::cout << std::endl;
+      }
+    }
+  }
+}
+
+#endif //PLS_MINI_BENCHMARK_H
diff --git a/lib/pls/include/pls/internal/helpers/profiler.h b/lib/pls/include/pls/internal/helpers/profiler.h
new file mode 100644
index 0000000..221994d
--- /dev/null
+++ b/lib/pls/include/pls/internal/helpers/profiler.h
@@ -0,0 +1,33 @@
+
+#ifndef PLS_PROFILER_H
+#define PLS_PROFILER_H
+#ifdef ENABLE_EASY_PROFILER
+
+#include <easy/profiler.h>
+
+#define PROFILE_WORK_BLOCK(msg) EASY_BLOCK(msg, profiler::colors::LightGreen)
+#define PROFILE_FORK_JOIN_STEALING(msg) EASY_BLOCK(msg, profiler::colors::LightBlue)
+#define PROFILE_STEALING(msg) EASY_BLOCK(msg, profiler::colors::Blue)
+#define PROFILE_LOCK(msg) EASY_BLOCK(msg, profiler::colors::Red)
+
+#define PROFILE_END_BLOCK EASY_END_BLOCK
+
+#define PROFILE_SAVE(filename) profiler::dumpBlocksToFile(filename);
+#define PROFILE_ENABLE EASY_PROFILER_ENABLE
+#define PROFILE_MAIN_THREAD EASY_MAIN_THREAD
+
+#else //ENABLE_EASY_PROFILER
+
+#define PROFILE_WORK_BLOCK(msg)
+#define PROFILE_FORK_JOIN_STEALING(msg)
+#define PROFILE_STEALING(msg)
+#define PROFILE_LOCK(msg)
+
+#define PROFILE_END_BLOCK
+
+#define PROFILE_SAVE(filename)
+#define PROFILE_ENABLE
+#define PROFILE_MAIN_THREAD
+
+#endif //ENABLE_EASY_PROFILER
+#endif //PLS_PROFILER_H
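Usage sketch for the new profiler.h wrapper above: with -DEASY_PROFILER=ON the PROFILE_* macros expand to the corresponding easy_profiler calls, otherwise they expand to nothing, so instrumented code needs no #ifdef blocks of its own. The function process_chunk and the output file name are made up for illustration:

    #include "pls/internal/helpers/profiler.h"

    void process_chunk() {
      PROFILE_WORK_BLOCK("process chunk");
      // ... real work ...
      PROFILE_END_BLOCK;
    }

    int main() {
      PROFILE_ENABLE;
      PROFILE_MAIN_THREAD;
      process_chunk();
      PROFILE_SAVE("process_chunk.prof");
    }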
EASY_BLOCK("execute fork_join_task", profiler::colors::LightGreen); + PROFILE_WORK_BLOCK("execute fork_join_task"); // Bind this instance to our OS thread my_stack_ = base::this_thread::state()->task_stack_; @@ -102,7 +102,7 @@ namespace pls { template void fork_join_sub_task::spawn_child(const T& task) { - EASY_FUNCTION(profiler::colors::Blue) + PROFILE_FORK_JOIN_STEALING("spawn_child") static_assert(std::is_base_of::value, "Only pass fork_join_sub_task subclasses!"); T* new_task = tbb_task_->my_stack_->push(task); diff --git a/lib/pls/include/pls/internal/scheduling/root_task.h b/lib/pls/include/pls/internal/scheduling/root_task.h index cdb7f0b..6834b6b 100644 --- a/lib/pls/include/pls/internal/scheduling/root_task.h +++ b/lib/pls/include/pls/internal/scheduling/root_task.h @@ -2,12 +2,13 @@ #ifndef PLS_ROOT_MASTER_TASK_H #define PLS_ROOT_MASTER_TASK_H -#include #include -#include "abstract_task.h" +#include "pls/internal/helpers/profiler.h" #include "pls/internal/base/spin_lock.h" +#include "abstract_task.h" + namespace pls { namespace internal { namespace scheduling { @@ -30,7 +31,7 @@ namespace pls { } void execute() override { - EASY_BLOCK("execute root_task", profiler::colors::LightGreen); + PROFILE_WORK_BLOCK("execute root_task"); function_(); finished_ = 1; } @@ -54,7 +55,7 @@ namespace pls { master_task_{master_task} {} void execute() override { - EASY_BLOCK("execute root_task", profiler::colors::LightGreen); + PROFILE_WORK_BLOCK("execute root_task"); do { steal_work(); } while (!master_task_->finished()); diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index 51b5b0c..55e72b5 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -2,55 +2,25 @@ #ifndef PLS_SCHEDULER_H #define PLS_SCHEDULER_H -#include #include #include +#include "pls/internal/helpers/profiler.h" + #include "pls/internal/base/aligned_stack.h" #include "pls/internal/base/thread.h" #include "pls/internal/base/barrier.h" #include "thread_state.h" #include "root_task.h" +#include "scheduler_memory.h" namespace pls { namespace internal { namespace scheduling { - // Upper thread limit for static memory allocation. - // Could be moved to templating if needed. 
-      static constexpr int MAX_THREADS = 32;
-
       void worker_routine();
       using scheduler_thread = base::thread;
 
-      class scheduler_memory {
-      public:
-        virtual size_t max_threads() = 0;
-        virtual thread_state* thread_state_for(size_t id) = 0;
-        virtual scheduler_thread* thread_for(size_t id) = 0;
-        virtual base::aligned_stack* task_stack_for(size_t id) = 0;
-      };
-
-      template<size_t MAX_THREADS, size_t TASK_STACK_SIZE>
-      class static_scheduler_memory: public scheduler_memory {
-        std::array<scheduler_thread, MAX_THREADS> threads_;
-        std::array<thread_state, MAX_THREADS> thread_states_;
-        std::array<std::array<char, TASK_STACK_SIZE>, MAX_THREADS> task_stacks_memory_;
-        std::array<base::aligned_stack, MAX_THREADS> task_stacks_;
-
-      public:
-        static_scheduler_memory() {
-          for (size_t i = 0; i < MAX_THREADS; i++) {
-            task_stacks_[i] = base::aligned_stack(reinterpret_cast<char*>(&task_stacks_memory_[i]), TASK_STACK_SIZE);
-          }
-        }
-
-        size_t max_threads() override { return MAX_THREADS; }
-        thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; }
-        scheduler_thread* thread_for(size_t id) override { return &threads_[id]; }
-        base::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; }
-      };
-
       class scheduler {
         friend void worker_routine();
 
@@ -65,7 +35,7 @@ namespace pls {
 
         template<typename Function>
         void perform_work(Function work_section) {
-          EASY_FUNCTION();
+          PROFILE_WORK_BLOCK("scheduler::perform_work")
 
           root_task<Function> master{work_section};
 
           // Push root task on stacks
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h
new file mode 100644
index 0000000..c9d233d
--- /dev/null
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h
@@ -0,0 +1,63 @@
+#include "pls/internal/base/aligned_stack.h"
+#include "pls/internal/base/thread.h"
+
+#include "thread_state.h"
+
+#ifndef PLS_SCHEDULER_MEMORY_H
+#define PLS_SCHEDULER_MEMORY_H
+
+namespace pls {
+  namespace internal {
+    namespace scheduling {
+      void worker_routine();
+      using scheduler_thread = base::thread;
+
+      class scheduler_memory {
+      public:
+        virtual size_t max_threads() = 0;
+        virtual thread_state* thread_state_for(size_t id) = 0;
+        virtual scheduler_thread* thread_for(size_t id) = 0;
+        virtual base::aligned_stack* task_stack_for(size_t id) = 0;
+      };
+
+      template<size_t MAX_THREADS, size_t TASK_STACK_SIZE>
+      class static_scheduler_memory: public scheduler_memory {
+        std::array<scheduler_thread, MAX_THREADS> threads_;
+        std::array<thread_state, MAX_THREADS> thread_states_;
+        std::array<std::array<char, TASK_STACK_SIZE>, MAX_THREADS> task_stacks_memory_;
+        std::array<base::aligned_stack, MAX_THREADS> task_stacks_;
+
+      public:
+        static_scheduler_memory() {
+          for (size_t i = 0; i < MAX_THREADS; i++) {
+            task_stacks_[i] = base::aligned_stack(task_stacks_memory_[i].data(), TASK_STACK_SIZE);
+          }
+        }
+
+        size_t max_threads() override { return MAX_THREADS; }
+        thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; }
+        scheduler_thread* thread_for(size_t id) override { return &threads_[id]; }
+        base::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; }
+      };
+
+      class malloc_scheduler_memory: public scheduler_memory {
+        size_t num_threads_;
+
+        scheduler_thread* threads_;
+        thread_state* thread_states_;
+        char** task_stacks_memory_;
+        base::aligned_stack* task_stacks_;
+      public:
+        explicit malloc_scheduler_memory(size_t num_threads, size_t memory_per_stack = 2 << 16);
+        ~malloc_scheduler_memory();
+
+        size_t max_threads() override { return num_threads_; }
+        thread_state* thread_state_for(size_t id) override { return &thread_states_[id]; }
+        scheduler_thread* thread_for(size_t id) override { return &threads_[id]; }
+        base::aligned_stack* task_stack_for(size_t id) override { return &task_stacks_[id]; }
+      };
+    }
+  }
+}
+
+#endif //PLS_SCHEDULER_MEMORY_H
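The scheduler only depends on the scheduler_memory interface above, so callers can pick between the compile-time sized storage and the new heap-backed variant. A small usage sketch based on the declarations in this commit; the thread count and stack sizes are arbitrary example values:

    #include <pls/pls.h>

    // Option 1: fully static allocation, 8 threads with 2 << 14 byte task stacks.
    static pls::static_scheduler_memory<8, 2 << 14> static_memory;

    int main() {
      // Option 2: sized at runtime, as used by the tests and the mini benchmark.
      pls::malloc_scheduler_memory heap_memory{8, 2 << 14};

      pls::scheduler scheduler{&heap_memory, 8};
      scheduler.perform_work([&] {
        // parallel work goes here
      });
    }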
diff --git a/lib/pls/include/pls/internal/scheduling/thread_state.h b/lib/pls/include/pls/internal/scheduling/thread_state.h
index 58dcc9b..042c8f8 100644
--- a/lib/pls/include/pls/internal/scheduling/thread_state.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state.h
@@ -32,23 +32,6 @@ namespace pls {
             current_task_{nullptr},
             task_stack_{task_stack},
             id_{id} {}
-
-        thread_state(const thread_state& other):
-            scheduler_{other.scheduler_},
-            root_task_{other.root_task_},
-            current_task_{other.current_task_},
-            task_stack_{other.task_stack_},
-            id_{other.id_} {}
-
-        thread_state& operator=(const thread_state& other) {
-          scheduler_ = other.scheduler_;
-          root_task_ = other.root_task_;
-          current_task_ = other.current_task_;
-          task_stack_ = other.task_stack_;
-          id_ = other.id_;
-
-          return *this;
-        }
       };
     }
   }
diff --git a/lib/pls/include/pls/pls.h b/lib/pls/include/pls/pls.h
index a6354cd..ac5c4ec 100644
--- a/lib/pls/include/pls/pls.h
+++ b/lib/pls/include/pls/pls.h
@@ -7,8 +7,10 @@
 #include "pls/internal/scheduling/scheduler.h"
 
 namespace pls {
-    using internal::scheduling::scheduler;
     using internal::scheduling::static_scheduler_memory;
+    using internal::scheduling::malloc_scheduler_memory;
+
+    using internal::scheduling::scheduler;
     using task_id = internal::scheduling::abstract_task::id;
 
     using internal::scheduling::fork_join_sub_task;
diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp
index 7cf7dca..3b75bd0 100644
--- a/lib/pls/src/internal/scheduling/abstract_task.cpp
+++ b/lib/pls/src/internal/scheduling/abstract_task.cpp
@@ -1,4 +1,4 @@
-#include <easy/profiler.h>
+#include "pls/internal/helpers/profiler.h"
 
 #include "pls/internal/scheduling/thread_state.h"
 #include "pls/internal/scheduling/abstract_task.h"
@@ -8,7 +8,7 @@ namespace pls {
   namespace internal {
     namespace scheduling {
       bool abstract_task::steal_work() {
-        EASY_FUNCTION(profiler::colors::Orange);
+        PROFILE_STEALING("abstract_task::steal_work")
         auto my_state = base::this_thread::state();
         auto my_scheduler = my_state->scheduler_;
 
@@ -18,19 +18,19 @@ namespace pls {
         auto target_state = my_scheduler->thread_state_for(target);
 
         // TODO: Cleaner Locking Using std::guarded_lock
-        EASY_BLOCK("Acquire Thread Lock", profiler::colors::Red)
+        PROFILE_LOCK("Acquire Thread Lock")
         target_state->lock_.lock();
-        EASY_END_BLOCK;
+        PROFILE_END_BLOCK
 
         // Dig down to our level
-        EASY_BLOCK("Go to our level")
+        PROFILE_STEALING("Go to our level")
         abstract_task* current_task = target_state->root_task_;
         while (current_task != nullptr && current_task->depth() < depth()) {
           current_task = current_task->child_task_;
         }
-        EASY_END_BLOCK;
+        PROFILE_END_BLOCK
 
-        EASY_BLOCK("Internal Steal")
+        PROFILE_STEALING("Internal Steal")
         if (current_task != nullptr) {
           // See if it equals our type and depth of task
           if (current_task->unique_id_ == unique_id_ &&
@@ -45,12 +45,12 @@ namespace pls {
             current_task = current_task->child_task_;
           }
         }
-        EASY_END_BLOCK;
+        PROFILE_END_BLOCK;
 
         // Execute 'top level task steal' if possible
         // (only try deeper tasks to keep depth restricted stealing)
-        EASY_BLOCK("Top Level Steal")
+        PROFILE_STEALING("Top Level Steal")
         while (current_task != nullptr) {
           auto lock = &target_state->lock_;
           if (current_task->split_task(lock)) {
@@ -60,7 +60,7 @@ namespace pls {
           current_task = current_task->child_task_;
         }
 
-        EASY_END_BLOCK;
+        PROFILE_END_BLOCK;
 
         target_state->lock_.unlock();
       }
diff --git a/lib/pls/src/internal/scheduling/fork_join_task.cpp b/lib/pls/src/internal/scheduling/fork_join_task.cpp
index 413ea53..1f1360c 100644
--- a/lib/pls/src/internal/scheduling/fork_join_task.cpp
+++ b/lib/pls/src/internal/scheduling/fork_join_task.cpp
@@ -1,4 +1,4 @@
-#include <easy/profiler.h>
+#include "pls/internal/helpers/profiler.h"
 
 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/fork_join_task.h"
@@ -13,16 +13,19 @@ namespace pls {
           tbb_task_{nullptr},
           stack_state_{nullptr} {}
 
-      fork_join_sub_task::fork_join_sub_task(const fork_join_sub_task& other): base::deque_item(other) {
-        // Do Nothing, will be inited after this anyways
-      }
+      fork_join_sub_task::fork_join_sub_task(const fork_join_sub_task& other):
+          base::deque_item(other),
+          ref_count_{0},
+          parent_{nullptr},
+          tbb_task_{nullptr},
+          stack_state_{nullptr} {}
 
       void fork_join_sub_task::execute() {
-        EASY_BLOCK("execute sub_task", profiler::colors::Green);
+        PROFILE_WORK_BLOCK("execute sub_task")
         tbb_task_->currently_executing_ = this;
         execute_internal();
         tbb_task_->currently_executing_ = nullptr;
-        EASY_END_BLOCK;
+        PROFILE_END_BLOCK
         wait_for_all();
 
         if (parent_ != nullptr) {
@@ -44,17 +47,17 @@ namespace pls {
 
       void fork_join_sub_task::wait_for_all() {
         while (ref_count_ > 0) {
-          EASY_BLOCK("get local sub task", profiler::colors::Blue)
+          PROFILE_STEALING("get local sub task")
           fork_join_sub_task* local_task = tbb_task_->get_local_sub_task();
-          EASY_END_BLOCK
+          PROFILE_END_BLOCK
           if (local_task != nullptr) {
             local_task->execute();
          } else {
            // Try to steal work.
            // External steal will be executed implicitly if success
-            EASY_BLOCK("steal work", profiler::colors::Blue)
+            PROFILE_STEALING("steal work")
            bool internal_steal_success = tbb_task_->steal_work();
-            EASY_END_BLOCK
+            PROFILE_END_BLOCK
            if (internal_steal_success) {
              tbb_task_->last_stolen_->execute();
            }
@@ -72,7 +75,7 @@ namespace pls {
       }
 
       bool fork_join_task::internal_stealing(abstract_task* other_task) {
-        EASY_FUNCTION(profiler::colors::Blue);
+        PROFILE_STEALING("fork_join_task::internal_stealing")
         auto cast_other_task = reinterpret_cast<fork_join_task*>(other_task);
 
         auto stolen_sub_task = cast_other_task->get_stolen_sub_task();
@@ -90,7 +93,7 @@ namespace pls {
       }
 
       bool fork_join_task::split_task(base::spin_lock* lock) {
-        EASY_FUNCTION(profiler::colors::Blue);
+        PROFILE_STEALING("fork_join_task::split_task")
         fork_join_sub_task* stolen_sub_task = get_stolen_sub_task();
         if (stolen_sub_task == nullptr) {
           return false;
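The scheduler constructor change below stops assigning into the storage handed out by scheduler_memory and placement-news the objects instead, since malloc_scheduler_memory returns raw, uninitialized memory. A minimal illustration of why that matters; widget is a made-up type:

    #include <cstdlib>
    #include <new>

    struct widget {
      int id;
      explicit widget(int id): id{id} {}
    };

    int main() {
      // Raw bytes from malloc do not contain a constructed widget yet; assigning
      // through a widget* would call operator= on a non-existent object (UB).
      void* raw = std::malloc(sizeof(widget));

      widget* w = new (raw) widget{42};  // construct in place instead

      w->~widget();  // destroy explicitly before releasing the raw storage
      std::free(raw);
    }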
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index b32907f..dd06768 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -9,13 +9,14 @@ namespace pls {
           memory_{memory},
           sync_barrier_{num_threads + 1},
           terminated_{false} {
-        if (num_threads > MAX_THREADS) {
+        if (num_threads_ > memory_->max_threads()) {
           PLS_ERROR("Tried to create scheduler with more OS threads than pre-allocated memory.");
         }
 
-        for (unsigned int i = 0; i < num_threads; i++) {
-          *memory_->thread_state_for(i) = thread_state{this, memory_->task_stack_for(i), i};
-          *memory_->thread_for(i) = base::start_thread(&worker_routine, memory_->thread_state_for(i));
+        for (unsigned int i = 0; i < num_threads_; i++) {
+          // Placement new is required, as the memory of `memory_` is not required to be initialized.
+          new ((void*)memory_->thread_state_for(i)) thread_state{this, memory_->task_stack_for(i), i};
+          new ((void*)memory_->thread_for(i)) base::thread(&worker_routine, memory_->thread_state_for(i));
         }
       }
diff --git a/lib/pls/src/internal/scheduling/scheduler_memory.cpp b/lib/pls/src/internal/scheduling/scheduler_memory.cpp
new file mode 100644
index 0000000..9018be9
--- /dev/null
+++ b/lib/pls/src/internal/scheduling/scheduler_memory.cpp
@@ -0,0 +1,31 @@
+#include "pls/internal/scheduling/scheduler_memory.h"
+
+namespace pls {
+  namespace internal {
+    namespace scheduling {
+      malloc_scheduler_memory::malloc_scheduler_memory(const size_t num_threads, const size_t memory_per_stack):
+          num_threads_{num_threads} {
+        threads_ = reinterpret_cast<scheduler_thread*>(malloc(num_threads * sizeof(scheduler_thread)));
+        thread_states_ = reinterpret_cast<thread_state*>(malloc(num_threads * sizeof(thread_state)));
+
+        task_stacks_ = reinterpret_cast<base::aligned_stack*>(malloc(num_threads * sizeof(base::aligned_stack)));
+        task_stacks_memory_ = reinterpret_cast<char**>(malloc(num_threads * sizeof(char*)));
+        for (size_t i = 0; i < num_threads_; i++) {
+          task_stacks_memory_[i] = reinterpret_cast<char*>(malloc(memory_per_stack));
+          task_stacks_[i] = base::aligned_stack(task_stacks_memory_[i], memory_per_stack);
+        }
+      }
+
+      malloc_scheduler_memory::~malloc_scheduler_memory() {
+        free(threads_);
+        free(thread_states_);
+
+        for (size_t i = 0; i < num_threads_; i++) {
+          free(task_stacks_memory_[i]);
+        }
+        free(task_stacks_);
+        free(task_stacks_memory_);
+      }
+    }
+  }
+}
diff --git a/test/scheduling_tests.cpp b/test/scheduling_tests.cpp
index f94f860..f116f1b 100644
--- a/test/scheduling_tests.cpp
+++ b/test/scheduling_tests.cpp
@@ -48,7 +48,7 @@ public:
 };
 
 TEST_CASE( "tbb task are scheduled correctly", "[internal/scheduling/fork_join_task.h]") {
-    static static_scheduler_memory<8, 2 << 12> my_scheduler_memory;
+    malloc_scheduler_memory my_scheduler_memory{8, 2 << 12};
 
     SECTION("tasks are executed exactly once") {
         scheduler my_scheduler{&my_scheduler_memory, 2};