diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index fcdb2ac..e23fe54 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -39,7 +39,7 @@ add_library(pls STATIC
         include/pls/internal/scheduling/lock_free/task.h
         include/pls/internal/scheduling/lock_free/task_manager.h src/internal/scheduling/lock_free/task_manager.cpp
         include/pls/internal/scheduling/lock_free/external_trading_deque.h src/internal/scheduling/lock_free/external_trading_deque.cpp
-        include/pls/internal/scheduling/lock_free/traded_cas_field.h src/internal/scheduling/lock_free/task.cpp)
+        include/pls/internal/scheduling/lock_free/traded_cas_field.h src/internal/scheduling/lock_free/task.cpp include/pls/internal/profiling/dag_node.h include/pls/internal/profiling/profiler.h include/pls/internal/profiling/thread_stats.h)
 
 # Dependencies for pls
 target_link_libraries(pls Threads::Threads)
diff --git a/lib/pls/include/pls/internal/profiling/dag_node.h b/lib/pls/include/pls/internal/profiling/dag_node.h
new file mode 100644
index 0000000..f3c6d0e
--- /dev/null
+++ b/lib/pls/include/pls/internal/profiling/dag_node.h
@@ -0,0 +1,45 @@
+#ifndef PLS_INTERNAL_PROFILING_DAG_NODE_H_
+#define PLS_INTERNAL_PROFILING_DAG_NODE_H_
+
+#include <memory>
+#include <list>
+#include <algorithm>
+
+namespace pls::internal::profiling {
+
+struct dag_node {
+  dag_node(unsigned spawning_thread_id) : spawning_thread_id_{spawning_thread_id} {};
+
+  unsigned spawning_thread_id_;
+  unsigned long max_memory_{0};
+  unsigned long total_runtime_{0};
+
+  std::unique_ptr<dag_node> next_node_;
+  std::list<dag_node> child_nodes_;
+
+  unsigned long dag_max_memory() {
+    unsigned long max = max_memory_;
+    if (next_node_) {
+      max = std::max(max, next_node_->dag_max_memory());
+    }
+    for (auto &child : child_nodes_) {
+      max = std::max(max, child.dag_max_memory());
+    }
+    return max;
+  }
+
+  unsigned long dag_total_user_time() {
+    unsigned long total_user_time = total_runtime_;
+    if (next_node_) {
+      total_user_time += next_node_->dag_total_user_time();
+    }
+    for (auto &child : child_nodes_) {
+      total_user_time += child.dag_total_user_time();
+    }
+    return total_user_time;
+  }
+};
+
+}
+
+#endif //PLS_INTERNAL_PROFILING_DAG_NODE_H_
diff --git a/lib/pls/include/pls/internal/profiling/profiler.h b/lib/pls/include/pls/internal/profiling/profiler.h
new file mode 100644
index 0000000..883b29e
--- /dev/null
+++ b/lib/pls/include/pls/internal/profiling/profiler.h
@@ -0,0 +1,193 @@
+
+#ifndef PLS_INTERNAL_PROFILING_PROFILER_H_
+#define PLS_INTERNAL_PROFILING_PROFILER_H_
+
+#define PLS_PROFILING_ENABLED true
+
+#include <memory>
+#include <chrono>
+#include <vector>
+
+#include <iostream>
+
+#include "dag_node.h"
+#include "thread_stats.h"
+
+namespace pls::internal::profiling {
+class profiler {
+  using clock = std::chrono::steady_clock;
+  using resolution = std::chrono::nanoseconds;
+
+  struct profiler_run {
+    profiler_run(unsigned num_threads) : start_time_{},
+                                         end_time_{},
+                                         root_node_{std::make_unique<dag_node>(0)},
+                                         per_thread_stats_(num_threads) {}
+
+    clock::time_point start_time_;
+    clock::time_point end_time_;
+    std::unique_ptr<dag_node> root_node_;
+    std::vector<thread_stats> per_thread_stats_;
+
+    void print_stats(unsigned num_threads) const {
+      auto run_duration = std::chrono::duration_cast<resolution>(end_time_ - start_time_).count();
+      std::cout << "===========================" << std::endl;
+      std::cout << "WALL TIME: " << run_duration << std::endl;
+      unsigned long total_user_time = root_node_->dag_total_user_time();
+      std::cout << "USER TIME: " << total_user_time << std::endl;
+
+      unsigned long total_failed_steals = 0;
+      unsigned long total_successful_steals = 0;
+      unsigned long total_steal_time = 0;
+      for (auto &thread_stats : per_thread_stats_) {
+        total_failed_steals += thread_stats.failed_steals_;
+        total_successful_steals += thread_stats.successful_steals_;
+        total_steal_time += thread_stats.total_time_stealing_;
+      }
+      std::cout << "STEALS: (Time " << total_steal_time
+                << ", Total: " << (total_successful_steals + total_failed_steals)
+                << ", Success: " << total_successful_steals
+                << ", Failed: " << total_failed_steals << ")" << std::endl;
+
+      unsigned long total_measured = total_steal_time + total_user_time;
+      unsigned long total_wall = run_duration * num_threads;
+      std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;
+
+      std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl;
+    }
+  };
+
+ public:
+  profiler(unsigned num_threads) : num_threads_{num_threads},
+                                   profiler_runs_() {}
+
+  dag_node *start_profiler_run() {
+    profiler_run &current_run = profiler_runs_.emplace_back(num_threads_);
+    current_run.start_time_ = clock::now();
+    return current_run.root_node_.get();
+  }
+
+  void stop_profiler_run() {
+    current_run().end_time_ = clock::now();
+    current_run().print_stats(num_threads_);
+  }
+
+  void stealing_start(unsigned thread_id) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      thread_stats.stealing_start_time_ = clock::now();
+      thread_stats.total_steals_++;
+    });
+  }
+
+  void stealing_end(unsigned thread_id, bool success) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      thread_stats.failed_steals_ += !success;
+      thread_stats.successful_steals_ += success;
+
+      auto end_time = clock::now();
+      auto
+          steal_duration = std::chrono::duration_cast<resolution>(end_time - thread_stats.stealing_start_time_).count();
+      thread_stats.total_time_stealing_ += steal_duration;
+    });
+  }
+
+  void stealing_cas_op(unsigned thread_id) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      thread_stats.steal_cas_ops_++;
+    });
+  }
+
+  dag_node *task_spawn_child(unsigned thread_id, dag_node *parent) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    dag_node *result;
+    thread_stats.run_on_stack([&] {
+      result = &parent->child_nodes_.emplace_back(thread_id);
+    });
+    return result;
+  }
+
+  dag_node *task_sync(unsigned thread_id, dag_node *synced) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    dag_node *result;
+    thread_stats.run_on_stack([&] {
+      synced->next_node_ = std::make_unique<dag_node>(thread_id);
+      result = synced->next_node_.get();
+    });
+    return result;
+  }
+
+  void task_start_running(unsigned thread_id, dag_node *in_node) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      thread_stats.task_run_start_time = clock::now();
+    });
+  }
+
+  void task_stop_running(unsigned thread_id, dag_node *in_node) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      auto end_time = clock::now();
+      auto user_code_duration =
+          std::chrono::duration_cast<resolution>(end_time - thread_stats.task_run_start_time).count();
+      in_node->total_runtime_ += user_code_duration;
+    });
+  }
+
+  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};
+  void task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          stack_memory[i + j] = MAGIC_BYTES[j];
+        }
+      }
+    });
+  }
+
+  void task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) {
+    auto &thread_stats = thread_stats_for(thread_id);
+
+    thread_stats.run_on_stack([&] {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        bool section_clean = true;
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          if (stack_memory[i + j] != MAGIC_BYTES[j]) {
+            section_clean = false;
+          }
+        }
+
+        in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
+        if (!section_clean) {
+          return;
+        }
+      }
+    });
+  }
+
+ private:
+  profiler_run &current_run() {
+    return profiler_runs_[profiler_runs_.size() - 1];
+  }
+
+  thread_stats &thread_stats_for(unsigned thread_id) {
+    return current_run().per_thread_stats_[thread_id];
+  }
+
+  unsigned num_threads_;
+  std::vector<profiler_run> profiler_runs_;
+};
+}
+
+#endif //PLS_INTERNAL_PROFILING_PROFILER_H_
diff --git a/lib/pls/include/pls/internal/profiling/thread_stats.h b/lib/pls/include/pls/internal/profiling/thread_stats.h
new file mode 100644
index 0000000..b41168b
--- /dev/null
+++ b/lib/pls/include/pls/internal/profiling/thread_stats.h
@@ -0,0 +1,51 @@
+
+#ifndef PLS_INTERNAL_PROFILING_THREAD_STATS_H_
+#define PLS_INTERNAL_PROFILING_THREAD_STATS_H_
+
+#include <chrono>
+#include <utility>
+
+#include "context_switcher/context_switcher.h"
+
+#include "pls/internal/base/system_details.h"
+#include "pls/internal/base/stack_allocator.h"
+
+namespace pls::internal::profiling {
+
+struct PLS_CACHE_ALIGN thread_stats {
+  static constexpr size_t STACK_SIZE = 4096 * 4;
+
+  thread_stats() {
+    stack_ = stack_allocator_.allocate_stack(STACK_SIZE);
+  }
+  ~thread_stats() {
+    stack_allocator_.free_stack(STACK_SIZE, stack_);
+  }
+
+  template<typename Function>
+  void run_on_stack(const Function function) {
+    context_switcher::enter_context(stack_, STACK_SIZE, [function](auto cont) {
+      function();
+      return std::move(cont);
+    });
+  }
+
+  using clock = std::chrono::steady_clock;
+
+  unsigned long total_steals_{0};
+  unsigned long successful_steals_{0};
+  unsigned long failed_steals_{0};
+  unsigned long steal_cas_ops_{0};
+
+  unsigned long total_time_stealing_{0};
+
+  clock::time_point stealing_start_time_;
+  clock::time_point task_run_start_time;
+
+  base::mmap_stack_allocator stack_allocator_;
+  char *stack_;
+};
+
+}
+
+#endif //PLS_INTERNAL_PROFILING_THREAD_STATS_H_
diff --git a/lib/pls/include/pls/internal/scheduling/base_task.h b/lib/pls/include/pls/internal/scheduling/base_task.h
index 5cbe16e..4f78fa9 100644
--- a/lib/pls/include/pls/internal/scheduling/base_task.h
+++ b/lib/pls/include/pls/internal/scheduling/base_task.h
@@ -7,6 +7,9 @@
 #include "context_switcher/continuation.h"
 #include "context_switcher/context_switcher.h"
 
+#include "pls/internal/profiling/profiler.h"
+#include "pls/internal/profiling/dag_node.h"
+
 namespace pls::internal::scheduling {
 /**
  * A task is the smallest unit of execution seen by the runtime system.
@@ -59,6 +62,10 @@ struct base_task {
   // Linked list for trading/memory management
   base_task *prev_;
   base_task *next_;
+
+#if PLS_PROFILING_ENABLED
+  profiling::dag_node *profiling_node_;
+#endif
 };
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/lock_free/task_manager.h b/lib/pls/include/pls/internal/scheduling/lock_free/task_manager.h
index f4b8505..6f39341 100644
--- a/lib/pls/include/pls/internal/scheduling/lock_free/task_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/lock_free/task_manager.h
@@ -42,7 +42,8 @@ class task_manager {
   base_task *pop_local_task();
 
   // Stealing work, automatically trades in another task
-  std::tuple<base_task *, base_task *> steal_task(thread_state &stealing_state);
+  // Return: stolen_task, traded_task, cas_success
+  std::tuple<base_task *, base_task *, bool> steal_task(thread_state &stealing_state);
 
   // Sync/memory management
   base_task *pop_clean_task_chain(base_task *task);
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h
index 3588643..edb95d2 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -13,6 +13,8 @@
 #include "pls/internal/scheduling/thread_state.h"
 #include "pls/internal/scheduling/task_manager.h"
 
+#include "pls/internal/profiling/profiler.h"
+
 namespace pls::internal::scheduling {
 /**
  * The scheduler is the central part of the dispatching-framework.
@@ -121,6 +123,10 @@ class scheduler {
   bool terminated_;
 
   std::shared_ptr<base::stack_allocator> stack_allocator_;
+
+#if PLS_PROFILING_ENABLED
+  profiling::profiler profiler_;
+#endif
 };
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index aceda9b..101821b 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -11,6 +11,8 @@
 #include "pls/internal/scheduling/base_task.h"
 #include "base_task.h"
 
+#include "pls/internal/profiling/dag_node.h"
+
 namespace pls::internal::scheduling {
 
 template <typename ALLOC>
@@ -27,7 +29,11 @@ scheduler::scheduler(unsigned int num_threads,
                      main_thread_starter_function_{nullptr},
                      work_section_done_{false},
                      terminated_{false},
-                     stack_allocator_{std::make_shared<ALLOC>(std::forward<ALLOC>(stack_allocator))} {
+                     stack_allocator_{std::make_shared<ALLOC>(std::forward<ALLOC>(stack_allocator))}
+#if PLS_PROFILING_ENABLED
+    , profiler_{num_threads}
+#endif
+{
   worker_threads_.reserve(num_threads);
   task_managers_.reserve(num_threads);
 
@@ -63,15 +69,36 @@ class scheduler::init_function_impl : public init_function {
   explicit init_function_impl(F &function) : function_{function} {}
   void run() override {
     base_task *root_task = thread_state::get().get_active_task();
+
+#if PLS_PROFILING_ENABLED
+    thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
+                                                                     root_task->profiling_node_);
+    thread_state::get().get_scheduler().profiler_.task_prepare_stack_measure(thread_state::get().get_thread_id(),
+                                                                             root_task->stack_memory_,
+                                                                             root_task->stack_size_);
+#endif
+
     root_task->run_as_task([root_task, this](auto cont) {
       root_task->is_synchronized_ = true;
      thread_state::get().main_continuation() = std::move(cont);
       function_();
+
       thread_state::get().get_scheduler().work_section_done_.store(true);
       PLS_ASSERT(thread_state::get().main_continuation().valid(),
                  "Must return valid continuation from main task.");
+
+#if PLS_PROFILING_ENABLED
+      thread_state::get().get_scheduler().profiler_.task_stop_running(thread_state::get().get_thread_id(),
+                                                                      root_task->profiling_node_);
+#endif
       return std::move(thread_state::get().main_continuation());
     });
+#if PLS_PROFILING_ENABLED
+    thread_state::get().get_scheduler().profiler_.task_finish_stack_measure(thread_state::get().get_thread_id(),
+                                                                            root_task->stack_memory_,
+                                                                            root_task->stack_size_,
+                                                                            root_task->profiling_node_);
+#endif
   }
  private:
   F &function_;
@@ -83,6 +110,11 @@ void scheduler::perform_work(Function work_section) {
   init_function_impl<Function> starter_function{work_section};
   main_thread_starter_function_ = &starter_function;
 
+#if PLS_PROFILING_ENABLED
+  auto *root_task = thread_state_for(0).get_active_task();
+  auto *root_node = profiler_.start_profiler_run();
+  root_task->profiling_node_ = root_node;
+#endif
   work_section_done_ = false;
   if (reuse_thread_) {
     auto &my_state = thread_state_for(0);
@@ -96,6 +128,9 @@ void scheduler::perform_work(Function work_section) {
     sync_barrier_.wait(); // Trigger threads to wake up
     sync_barrier_.wait(); // Wait for threads to finish
   }
+#if PLS_PROFILING_ENABLED
+  profiler_.stop_profiler_run();
+#endif
 }
 
 template <typename Function>
@@ -105,6 +140,16 @@ void scheduler::spawn(Function &&lambda) {
   base_task *last_task = spawning_state.get_active_task();
   base_task *spawned_task = last_task->next_;
 
+#if PLS_PROFILING_ENABLED
+  // Memory and DAG nodes
+  spawning_state.get_scheduler().profiler_.task_prepare_stack_measure(spawning_state.get_thread_id(),
+                                                                      spawned_task->stack_memory_,
+                                                                      spawned_task->stack_size_);
+  auto *child_dag_node = spawning_state.get_scheduler().profiler_.task_spawn_child(spawning_state.get_thread_id(),
+                                                                                   last_task->profiling_node_);
+  spawned_task->profiling_node_ = child_dag_node;
+#endif
+
   auto continuation = spawned_task->run_as_task([last_task, spawned_task, lambda, &spawning_state](auto cont) {
     // allow stealing threads to continue the last task.
     last_task->continuation_ = std::move(cont);
@@ -115,7 +160,14 @@ void scheduler::spawn(Function &&lambda) {
     spawning_state.get_task_manager().push_local_task(last_task);
 
     // execute the lambda itself, which could lead to a different thread returning.
+#if PLS_PROFILING_ENABLED
+    spawning_state.get_scheduler().profiler_.task_stop_running(spawning_state.get_thread_id(),
+                                                               last_task->profiling_node_);
+    spawning_state.get_scheduler().profiler_.task_start_running(spawning_state.get_thread_id(),
+                                                                spawned_task->profiling_node_);
+#endif
     lambda();
+
     thread_state &syncing_state = thread_state::get();
     PLS_ASSERT(syncing_state.get_active_task() == spawned_task,
                "Task manager must always point its active task onto whats executing.");
@@ -136,13 +188,27 @@ void scheduler::spawn(Function &&lambda) {
                  "Fast path, no one can have continued working on the last task.");
       syncing_state.set_active_task(last_task);
 
+#if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
+                                                                spawned_task->profiling_node_);
+#endif
       return std::move(last_task->continuation_);
     } else {
       // Slow path, the last task was stolen. This path is common to sync() events.
-      return slow_return(syncing_state);
+      auto continuation = slow_return(syncing_state);
+#if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
+                                                                spawned_task->profiling_node_);
+#endif
+      return std::move(continuation);
     }
   });
 
+#if PLS_PROFILING_ENABLED
+  thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
+                                                                   thread_state::get().get_active_task()->profiling_node_);
+#endif
+
   if (continuation.valid()) {
     // We jumped in here from the main loop, keep track!
     thread_state::get().main_continuation() = std::move(continuation);
diff --git a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
index b306bca..67e1b66 100644
--- a/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
+++ b/lib/pls/src/internal/scheduling/lock_free/task_manager.cpp
@@ -39,7 +39,7 @@ base_task *task_manager::pop_local_task() {
   return deque_.pop_bot();
 }
 
-std::tuple<base_task *, base_task *> task_manager::steal_task(thread_state &stealing_state) {
+std::tuple<base_task *, base_task *, bool> task_manager::steal_task(thread_state &stealing_state) {
   PLS_ASSERT(stealing_state.get_active_task()->depth_ == 0, "Must only steal with clean task chain.");
   PLS_ASSERT(scheduler::check_task_chain(*stealing_state.get_active_task()), "Must only steal with clean task chain.");
 
@@ -52,7 +52,7 @@ std::tuple<base_task *, base_task *> task_manager::steal_task(thread_state &stea
     base_task *chain_after_stolen_task = traded_task->next_;
 
     // perform the actual pop operation
-    task* pop_result_task = deque_.pop_top(traded_task, peek);
+    task *pop_result_task = deque_.pop_top(traded_task, peek);
     if (pop_result_task) {
       PLS_ASSERT(stolen_task->thread_id_ != traded_task->thread_id_,
                  "It is impossible to steal an task we already own!");
@@ -62,7 +62,7 @@ std::tuple<base_task *, base_task *> task_manager::steal_task(thread_state &stea
       // update the resource stack associated with the stolen task
       stolen_task->push_task_chain(traded_task);
 
-      task* optional_exchanged_task = external_trading_deque::get_trade_object(stolen_task);
+      task *optional_exchanged_task = external_trading_deque::get_trade_object(stolen_task);
       if (optional_exchanged_task) {
         // All good, we pushed the task over to the stack, nothing more to do
         PLS_ASSERT(optional_exchanged_task == traded_task,
@@ -73,12 +73,12 @@ std::tuple<base_task *, base_task *> task_manager::steal_task(thread_state &stea
        stolen_task->reset_task_chain();
       }
 
-      return std::pair{stolen_task, chain_after_stolen_task};
+      return std::tuple{stolen_task, chain_after_stolen_task, true};
     } else {
-      return std::pair{nullptr, nullptr};
+      return std::tuple{nullptr, nullptr, false};
     }
   } else {
-    return std::pair{nullptr, nullptr};
+    return std::tuple{nullptr, nullptr, true};
   }
 }
 
@@ -88,7 +88,7 @@ base_task *task_manager::pop_clean_task_chain(base_task *base_task) {
   task *clean_chain = popped_task->pop_task_chain();
   if (clean_chain == nullptr) {
     // double-check if we are really last one or we only have unlucky timing
-    task* optional_cas_task = external_trading_deque::get_trade_object(popped_task);
+    task *optional_cas_task = external_trading_deque::get_trade_object(popped_task);
     if (optional_cas_task) {
       clean_chain = optional_cas_task;
     } else {
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index 2ab7454..6ab57b2 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -48,6 +48,9 @@ void scheduler::work_thread_work_section() {
 
   unsigned int failed_steals = 0;
   while (!work_section_done_) {
+#if PLS_PROFILING_ENABLED
+    my_state.get_scheduler().profiler_.stealing_start(my_state.get_thread_id());
+#endif
     PLS_ASSERT(check_task_chain(*my_state.get_active_task()), "Must start stealing with a clean task chain.");
 
     size_t target;
@@ -56,7 +59,18 @@ void scheduler::work_thread_work_section() {
     } while (target == my_state.get_thread_id());
 
     thread_state &target_state = my_state.get_scheduler().thread_state_for(target);
-    auto[stolen_task, chain_after_stolen_task] = target_state.get_task_manager().steal_task(my_state);
+
+    base_task *stolen_task;
+    base_task *chain_after_stolen_task;
+    bool cas_success;
+    do {
+#if PLS_PROFILING_ENABLED
+      my_state.get_scheduler().profiler_.stealing_cas_op(my_state.get_thread_id());
+#endif
+      std::tie(stolen_task, chain_after_stolen_task, cas_success) =
+          target_state.get_task_manager().steal_task(my_state);
+    } while (!cas_success);
+
     if (stolen_task) {
       // Keep task chain consistent. We want to appear as if we are working an a branch upwards of the stolen task.
       stolen_task->next_ = chain_after_stolen_task;
@@ -70,6 +84,11 @@ void scheduler::work_thread_work_section() {
       PLS_ASSERT(stolen_task->continuation_.valid(),
                  "A task that we can steal must have a valid continuation for us to start working.");
       stolen_task->is_synchronized_ = false;
+#if PLS_PROFILING_ENABLED
+      my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), true);
+      my_state.get_scheduler().profiler_.task_start_running(my_state.get_thread_id(),
+                                                            stolen_task->profiling_node_);
+#endif
       context_switcher::switch_context(std::move(stolen_task->continuation_));
       // We will continue execution in this line when we finished the stolen work.
       failed_steals = 0;
@@ -78,6 +97,9 @@ void scheduler::work_thread_work_section() {
       if (failed_steals >= num_threads) {
         std::this_thread::yield();
       }
+#if PLS_PROFILING_ENABLED
+      my_state.get_scheduler().profiler_.stealing_end(my_state.get_thread_id(), false);
+#endif
     }
   }
 }
@@ -88,7 +110,23 @@ void scheduler::sync() {
   base_task *active_task = syncing_state.get_active_task();
   base_task *spawned_task = active_task->next_;
 
+#if PLS_PROFILING_ENABLED
+  syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
+                                                                    active_task->stack_memory_,
+                                                                    active_task->stack_size_,
+                                                                    active_task->profiling_node_);
+  syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
+                                                            active_task->profiling_node_);
+  auto *next_dag_node =
+      syncing_state.get_scheduler().profiler_.task_sync(syncing_state.get_thread_id(), active_task->profiling_node_);
+  active_task->profiling_node_ = next_dag_node;
+#endif
+
   if (active_task->is_synchronized_) {
+#if PLS_PROFILING_ENABLED
+    syncing_state.get_scheduler().profiler_.task_start_running(syncing_state.get_thread_id(),
+                                                               active_task->profiling_node_);
+#endif
     return; // We are already the sole owner of last_task
   } else {
     auto continuation =
@@ -101,6 +139,10 @@ void scheduler::sync() {
     PLS_ASSERT(!continuation.valid(),
               "We only return to a sync point, never jump to it directly."
               "This must therefore never return an unfinished fiber/continuation.");
+#if PLS_PROFILING_ENABLED
+    thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
+                                                                     thread_state::get().get_active_task()->profiling_node_);
+#endif
     return; // We cleanly synced to the last one finishing work on last_task
   }
 }
@@ -140,7 +182,7 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat
     // Jump back to the continuation in main scheduling loop.
     context_switcher::continuation result_cont = std::move(thread_state::get().main_continuation());
     PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
-    return result_cont;
+    return std::move(result_cont);
   } else {
     // Make sure that we are owner of this full continuation/task chain.
     last_task->next_ = this_task;
@@ -152,7 +194,7 @@ context_switcher::continuation scheduler::slow_return(thread_state &calling_stat
 
     // Jump to parent task and continue working on it.
     context_switcher::continuation result_cont = std::move(last_task->continuation_);
     PLS_ASSERT(result_cont.valid(), "Must return a valid continuation.");
-    return result_cont;
+    return std::move(result_cont);
   }
 }