#include "pls/internal/profiling/profiler.h" namespace pls::internal::profiling { void profiler::profiler_run::print_stats(unsigned num_threads) const { root_node_->dag_compact(); auto run_duration = std::chrono::duration_cast(end_time_ - start_time_).count(); std::cout << "===========================" << std::endl; std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl; unsigned long total_user_time = root_node_->dag_total_user_time(); std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl; unsigned long critical_path_time = root_node_->dag_critical_path(); std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl; std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl; unsigned long total_failed_steals = 0; unsigned long total_successful_steals = 0; unsigned long total_steal_time = 0; for (auto &thread_stats : per_thread_stats_) { total_failed_steals += thread_stats.failed_steals_; total_successful_steals += thread_stats.successful_steals_; total_steal_time += thread_stats.total_time_stealing_; } std::cout << "STEALS: (Time " << m_to_d(total_steal_time) << ", Total: " << (total_successful_steals + total_failed_steals) << ", Success: " << total_successful_steals << ", Failed: " << total_failed_steals << ")" << std::endl; unsigned long total_measured = total_steal_time + total_user_time; unsigned long total_wall = run_duration * num_threads; std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl; std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl; // TODO: re-add graph printing // std::cout << "digraph {" << std::endl; // root_node_->dag_print(std::cout, 0); // std::cout << "}" << std::endl; } dag_node *profiler::start_profiler_run() { profiler_run ¤t_run = profiler_runs_.emplace_back(num_threads_); current_run.start_time_ = clock::now(); return current_run.root_node_.get(); } void profiler::stop_profiler_run() { current_run().end_time_ = clock::now(); current_run().print_stats(num_threads_); } void profiler::stealing_start(unsigned thread_id) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.stealing_start_time_ = clock::now(); thread_stats.total_steals_++; }); } void profiler::stealing_end(unsigned thread_id, bool success) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.failed_steals_ += !success; thread_stats.successful_steals_ += success; auto end_time = clock::now(); auto steal_duration = std::chrono::duration_cast(end_time - thread_stats.stealing_start_time_).count(); thread_stats.total_time_stealing_ += steal_duration; }); } void profiler::stealing_cas_op(unsigned thread_id) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.steal_cas_ops_++; }); } dag_node *profiler::task_spawn_child(unsigned thread_id, dag_node *parent) { auto &thread_stats = thread_stats_for(thread_id); dag_node *result; thread_stats.run_on_stack([&] { result = &parent->child_nodes_.emplace_back(thread_id); }); return result; } dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) { auto &thread_stats = thread_stats_for(thread_id); dag_node *result; thread_stats.run_on_stack([&] { synced->next_node_ = std::make_unique(thread_id); result = synced->next_node_.get(); }); return result; } void profiler::task_start_running(unsigned thread_id, dag_node *in_node) { (void) in_node; // unused, keep for 'symmetric' API auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.task_run_start_time = clock::now(); }); } void profiler::task_stop_running(unsigned thread_id, dag_node *in_node) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { auto end_time = clock::now(); auto user_code_duration = std::chrono::duration_cast(end_time - thread_stats.task_run_start_time).count(); in_node->total_runtime_ += user_code_duration; }); } void profiler::task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { stack_memory[i + j] = MAGIC_BYTES[j]; } } }); } void profiler::task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { bool section_clean = true; for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { if (stack_memory[i + j] != MAGIC_BYTES[j]) { section_clean = false; } } in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES); if (!section_clean) { return; } } }); } }