diff --git a/CMakeLists.txt b/CMakeLists.txt index 8178a05..1afba5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ include(cmake/SetupOptimizationLevel.cmake) include(cmake/SetupThreadingSupport.cmake) include(cmake/SetupThreadSanitizer.cmake) include(cmake/SetupAddressSanitizer.cmake) +include(cmake/SetupEasyProfiler.cmake) # make our internal cmake script collection avaliable in the build process. list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/NOTES.md b/NOTES.md index 3e66c3d..e771ad0 100644 --- a/NOTES.md +++ b/NOTES.md @@ -4,7 +4,16 @@ A collection of stuff that we noticed during development. Useful later on two write a project report and to go back in time to find out why certain decisions where made. -## 28.03.2018 - custom new operators +## 02.04.2019 - CMake Export + +We built our project using CMake to make it portable and easy to set up. +To allow others to use our library we need to make it installable on +other systems. For this we use CMake's install feature and +a [tutorial](https://pabloariasal.github.io/2018/02/19/its-time-to-do-cmake-right/) +on how to correctly configure a CMake library to be included by other +projects. + +## 28.03.2019 - custom new operators When initializing sub_tasks we want to place them on our custom 'stack like' data structure per thread. We looked at TBB's API diff --git a/README.md b/README.md index daa680b..955f49d 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,26 @@ keep the repository in a state where the sanitizer reports errors. Consider reading [the section on common data races](https://github.com/google/sanitizers/wiki/ThreadSanitizerPopularDataRaces) to get an idea of what we try to avoid in our code. + +### Profiling + +To make profiling portable and allow us to later analyze the logs +programmatically we use [easy_profiler](https://github.com/yse/easy_profiler) +for capturing data. 
To enable profiling install the library on your system +(best building it and then running `make install`) and set the +cmake option `-DEASY_PROFILER=ON`. + +After that see the `invoke_parallel` example app for activating the +profiler. This will generate a trace file that can be viewed with +the `profiler_gui ` command. + +Please note that the profiler adds overhead when looking at sub-millisecond +method invocations as we do and it cannot replace a separate +profiler like `gperf` or `valgrind` for detailed analysis. +We still think it makes sense to add it in as an optional feature, +as the customizable colors and fine-grained events (including collection +of variables) can be used to visualize the `big picture` of +program execution. Also, we hope to use it to log 'events' like +successful and failed steals in the future, as the general idea of logging +information per thread efficiently might be helpful for further +analysis. diff --git a/app/invoke_parallel/CMakeLists.txt b/app/invoke_parallel/CMakeLists.txt index 435b2d3..adcb6be 100644 --- a/app/invoke_parallel/CMakeLists.txt +++ b/app/invoke_parallel/CMakeLists.txt @@ -1,2 +1,2 @@ add_executable(invoke_parallel main.cpp) -target_link_libraries(invoke_parallel pls) +target_link_libraries(invoke_parallel pls easy_profiler) diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp index 9a17139..e16b8f7 100644 --- a/app/invoke_parallel/main.cpp +++ b/app/invoke_parallel/main.cpp @@ -1,9 +1,11 @@ #include #include -static pls::static_scheduler_memory<8, 2 << 10> my_scheduler_memory; +#include -static constexpr int CUTOFF = 20; +static pls::static_scheduler_memory<8, 2 << 14> my_scheduler_memory; + +static constexpr int CUTOFF = 10; long fib_serial(long n) { if (n == 0) { @@ -31,19 +33,17 @@ long fib(long n) { } int main() { + EASY_PROFILER_ENABLE; pls::scheduler scheduler{&my_scheduler_memory, 8}; - scheduler.perform_work([] { - auto start = std::chrono::high_resolution_clock::now(); - + long 
result; + scheduler.perform_work([&] { + EASY_MAIN_THREAD; // Call looks just the same, only requirement is // the enclosure in the perform_work lambda. - long result = fib(30); - - auto end = std::chrono::high_resolution_clock::now(); - long time = std::chrono::duration_cast(end - start).count(); - - std::cout << "Fib(30)=" << result << std::endl; - std::cout << "Execution time in us: " << time << std::endl; + result = fib(30); }); + std::cout << "Fib(30)=" << result << std::endl; + + profiler::dumpBlocksToFile("test_profile.prof"); } diff --git a/cmake/SetupEasyProfiler.cmake b/cmake/SetupEasyProfiler.cmake new file mode 100644 index 0000000..ddadb5f --- /dev/null +++ b/cmake/SetupEasyProfiler.cmake @@ -0,0 +1,18 @@ +# Optional external dependencies +find_package(easy_profiler) + +option(EASY_PROFILER "Enable the profiler" OFF) +if(EASY_PROFILER) + if(easy_profiler_FOUND) + + else() + message(WARNING "EasyProfiler dependency not found on system, DISABLING it!") + set(EASY_PROFILER OFF) + endif() +endif() + +if(NOT EASY_PROFILER) + add_definitions(-DDISABLE_EASY_PROFILER) +endif() + +message("-- Easy Profiler: ${EASY_PROFILER}") diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt index 078f698..c90ff97 100644 --- a/lib/pls/CMakeLists.txt +++ b/lib/pls/CMakeLists.txt @@ -29,6 +29,9 @@ target_include_directories(pls target_link_libraries(pls Threads::Threads # pthread support ) +if(EASY_PROFILER) + target_link_libraries(pls easy_profiler) +endif() # Rules for istalling the library on a system # ...binaries diff --git a/lib/pls/include/pls/algorithms/invoke_parallel.h b/lib/pls/include/pls/algorithms/invoke_parallel.h index aa82aa8..21dec7e 100644 --- a/lib/pls/include/pls/algorithms/invoke_parallel.h +++ b/lib/pls/include/pls/algorithms/invoke_parallel.h @@ -33,7 +33,7 @@ namespace pls { auto internal_body = [&] (fork_join_sub_task* this_task){ auto sub_task_body_1 = [&] (fork_join_sub_task*){ function1(); }; - fork_join_lambda sub_task_1(&sub_task_body_1); 
+ auto sub_task_1 = fork_join_lambda(&sub_task_body_1); this_task->spawn_child(sub_task_1); function2(); // Execute last function 'inline' without spawning a sub_task object diff --git a/lib/pls/include/pls/internal/scheduling/fork_join_task.h b/lib/pls/include/pls/internal/scheduling/fork_join_task.h index 0248d4e..36f5ccd 100644 --- a/lib/pls/include/pls/internal/scheduling/fork_join_task.h +++ b/lib/pls/include/pls/internal/scheduling/fork_join_task.h @@ -2,6 +2,8 @@ #ifndef PLS_TBB_LIKE_TASK_H #define PLS_TBB_LIKE_TASK_H +#include + #include "pls/internal/base/aligned_stack.h" #include "pls/internal/base/deque.h" @@ -84,6 +86,8 @@ namespace pls { last_stolen_{nullptr} {}; void execute() override { + EASY_BLOCK("execute fork_join_task", profiler::colors::LightGreen); + // Bind this instance to our OS thread my_stack_ = base::this_thread::state()->task_stack_; root_task_->tbb_task_ = this; @@ -97,7 +101,8 @@ namespace pls { }; template - void fork_join_sub_task::spawn_child(const T& task) { + void fork_join_sub_task::spawn_child(const T& task) { + EASY_FUNCTION(profiler::colors::Blue) static_assert(std::is_base_of::value, "Only pass fork_join_sub_task subclasses!"); T* new_task = tbb_task_->my_stack_->push(task); diff --git a/lib/pls/include/pls/internal/scheduling/root_task.h b/lib/pls/include/pls/internal/scheduling/root_task.h index 57a4dca..cdb7f0b 100644 --- a/lib/pls/include/pls/internal/scheduling/root_task.h +++ b/lib/pls/include/pls/internal/scheduling/root_task.h @@ -2,6 +2,7 @@ #ifndef PLS_ROOT_MASTER_TASK_H #define PLS_ROOT_MASTER_TASK_H +#include #include #include "abstract_task.h" @@ -13,27 +14,25 @@ namespace pls { template class root_task : public abstract_task { Function function_; - bool finished_; - - // Improvement: Remove lock and replace by atomic variable (performance) - base::spin_lock finished_lock_; + std::atomic_uint8_t finished_; public: explicit root_task(Function function): abstract_task{0, id{0}}, function_{function}, - 
finished_{false} {} + finished_{0} {} + root_task(const root_task& other): + abstract_task{0, id{0}}, + function_{other.function_}, + finished_{0} {} bool finished() { - std::lock_guard lock{finished_lock_}; return finished_; } void execute() override { + EASY_BLOCK("execute root_task", profiler::colors::LightGreen); function_(); - { - std::lock_guard lock{finished_lock_}; - finished_ = true; - } + finished_ = 1; } bool internal_stealing(abstract_task* /*other_task*/) override { @@ -55,6 +54,7 @@ namespace pls { master_task_{master_task} {} void execute() override { + EASY_BLOCK("execute root_task", profiler::colors::LightGreen); do { steal_work(); } while (!master_task_->finished()); diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h index 59f9440..51b5b0c 100644 --- a/lib/pls/include/pls/internal/scheduling/scheduler.h +++ b/lib/pls/include/pls/internal/scheduling/scheduler.h @@ -2,6 +2,7 @@ #ifndef PLS_SCHEDULER_H #define PLS_SCHEDULER_H +#include #include #include @@ -64,6 +65,7 @@ namespace pls { template void perform_work(Function work_section) { + EASY_FUNCTION(); root_task master{work_section}; // Push root task on stacks diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp index 0694f6f..7cf7dca 100644 --- a/lib/pls/src/internal/scheduling/abstract_task.cpp +++ b/lib/pls/src/internal/scheduling/abstract_task.cpp @@ -1,3 +1,5 @@ +#include + #include "pls/internal/scheduling/thread_state.h" #include "pls/internal/scheduling/abstract_task.h" #include "pls/internal/scheduling/scheduler.h" @@ -6,6 +8,7 @@ namespace pls { namespace internal { namespace scheduling { bool abstract_task::steal_work() { + EASY_FUNCTION(profiler::colors::Orange); auto my_state = base::this_thread::state(); auto my_scheduler = my_state->scheduler_; @@ -15,14 +18,19 @@ namespace pls { auto target_state = my_scheduler->thread_state_for(target); // TODO: 
Cleaner Locking Using std::guarded_lock + EASY_BLOCK("Acquire Thread Lock", profiler::colors::Red) target_state->lock_.lock(); + EASY_END_BLOCK; // Dig down to our level + EASY_BLOCK("Go to our level") abstract_task* current_task = target_state->root_task_; while (current_task != nullptr && current_task->depth() < depth()) { current_task = current_task->child_task_; } + EASY_END_BLOCK; + EASY_BLOCK("Internal Steal") if (current_task != nullptr) { // See if it equals our type and depth of task if (current_task->unique_id_ == unique_id_ && @@ -37,10 +45,12 @@ namespace pls { current_task = current_task->child_task_; } } + EASY_END_BLOCK; // Execute 'top level task steal' if possible // (only try deeper tasks to keep depth restricted stealing) + EASY_BLOCK("Top Level Steal") while (current_task != nullptr) { auto lock = &target_state->lock_; if (current_task->split_task(lock)) { @@ -50,6 +60,7 @@ namespace pls { current_task = current_task->child_task_; } + EASY_END_BLOCK; target_state->lock_.unlock(); } diff --git a/lib/pls/src/internal/scheduling/fork_join_task.cpp b/lib/pls/src/internal/scheduling/fork_join_task.cpp index b0bba90..413ea53 100644 --- a/lib/pls/src/internal/scheduling/fork_join_task.cpp +++ b/lib/pls/src/internal/scheduling/fork_join_task.cpp @@ -1,3 +1,5 @@ +#include + #include "pls/internal/scheduling/scheduler.h" #include "pls/internal/scheduling/fork_join_task.h" @@ -16,9 +18,11 @@ namespace pls { } void fork_join_sub_task::execute() { + EASY_BLOCK("execute sub_task", profiler::colors::Green); tbb_task_->currently_executing_ = this; execute_internal(); tbb_task_->currently_executing_ = nullptr; + EASY_END_BLOCK; wait_for_all(); if (parent_ != nullptr) { @@ -40,13 +44,18 @@ namespace pls { void fork_join_sub_task::wait_for_all() { while (ref_count_ > 0) { + EASY_BLOCK("get local sub task", profiler::colors::Blue) fork_join_sub_task* local_task = tbb_task_->get_local_sub_task(); + EASY_END_BLOCK if (local_task != nullptr) { local_task->execute(); } 
else { // Try to steal work. // External steal will be executed implicitly if success - if (tbb_task_->steal_work()) { + EASY_BLOCK("steal work", profiler::colors::Blue) + bool internal_steal_success = tbb_task_->steal_work(); + EASY_END_BLOCK + if (internal_steal_success) { tbb_task_->last_stolen_->execute(); } } @@ -63,6 +72,7 @@ namespace pls { } bool fork_join_task::internal_stealing(abstract_task* other_task) { + EASY_FUNCTION(profiler::colors::Blue); auto cast_other_task = reinterpret_cast(other_task); auto stolen_sub_task = cast_other_task->get_stolen_sub_task(); @@ -80,6 +90,7 @@ namespace pls { } bool fork_join_task::split_task(base::spin_lock* lock) { + EASY_FUNCTION(profiler::colors::Blue); fork_join_sub_task* stolen_sub_task = get_stolen_sub_task(); if (stolen_sub_task == nullptr) { return false;