#include "pls/internal/profiling/profiler.h" namespace pls::internal::profiling { void profiler::profiler_run::calculate_stats() { root_node_->dag_compact(); wall_time_ = std::chrono::duration_cast(end_time_ - start_time_).count(); t_1_ = root_node_->dag_total_user_time(); t_inf_ = root_node_->dag_critical_path(); steals_failed_ = 0; steals_successful_ = 0; steals_cas_ops_ = 0; steals_time_ = 0; for (auto &thread_stats : per_thread_stats_) { steals_failed_ += thread_stats.failed_steals_; steals_successful_ += thread_stats.successful_steals_; steals_cas_ops_ += thread_stats.steal_cas_ops_; steals_time_ += thread_stats.total_time_stealing_; } max_memory_per_stack_ = root_node_->dag_max_memory(); spawn_depth_ = root_node_->dag_depth(); } void profiler::profiler_run::print_stats() const { std::cout << "===========================" << std::endl; std::cout << "WALL TIME: " << m_to_d(wall_time_) << std::endl; std::cout << "USER TIME: " << m_to_d(t_1_) << std::endl; std::cout << "CRITICAL TIME: " << m_to_d(t_inf_) << std::endl; std::cout << "MAX SPEEDUP:" << (double) t_1_ / (double) t_inf_ << std::endl; std::cout << "STEALS: (Time " << m_to_d(steals_time_) << ", Total: " << (steals_failed_ + steals_successful_) << ", Success: " << steals_successful_ << ", Failed: " << steals_failed_ << ", CAS: " << steals_cas_ops_ << ")" << std::endl; unsigned long total_measured = steals_time_ + t_1_; unsigned long total_wall = wall_time_ * num_threads_; std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl; std::cout << "MEMORY: " << max_memory_per_stack_ << " bytes per stack" << std::endl; std::cout << "DEPTH:" << spawn_depth_ << std::endl; } void profiler::profiler_run::print_dag(std::ostream &stream) { stream << "digraph {" << std::endl; root_node_->dag_print(std::cout, 0); stream << "}" << std::endl; } dag_node *profiler::start_profiler_run() { profiler_run ¤t_run = profiler_runs_.emplace_back(num_threads_); current_run.start_time_ = clock::now(); return current_run.root_node_.get(); } void profiler::stop_profiler_run() { current_run().end_time_ = clock::now(); current_run().root_node_->dag_compact(); current_run().calculate_stats(); } void profiler::stealing_start(unsigned thread_id) { run_on_stack(thread_id, [&](thread_stats &thread_stats) { thread_stats.total_steals_++; if (capture_time_) { thread_stats.stealing_start_time_ = clock::now(); } }); } void profiler::stealing_end(unsigned thread_id, bool success) { run_on_stack(thread_id, [&](thread_stats &thread_stats) { thread_stats.failed_steals_ += !success; thread_stats.successful_steals_ += success; if (capture_time_) { auto end_time = clock::now(); auto steal_duration = std::chrono::duration_cast(end_time - thread_stats.stealing_start_time_).count(); thread_stats.total_time_stealing_ += steal_duration; } }); } void profiler::stealing_cas_op(unsigned thread_id) { run_on_stack(thread_id, [&](thread_stats &thread_stats) { thread_stats.steal_cas_ops_++; }); } dag_node *profiler::task_spawn_child(unsigned thread_id, dag_node *parent) { dag_node *result; run_on_stack(thread_id, [&](thread_stats &) { result = &parent->child_nodes_.emplace_back(thread_id); }); return result; } dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) { dag_node *result; run_on_stack(thread_id, [&](thread_stats &) { synced->next_node_ = std::make_unique(thread_id); result = synced->next_node_.get(); }); return result; } void profiler::task_start_running(unsigned thread_id, dag_node *in_node) { if (capture_time_) { (void) in_node; // unused, keep for 'symmetric' API run_on_stack(thread_id, [&](thread_stats &thread_stats) { thread_stats.task_run_start_time_ = clock::now(); }); } } void profiler::task_stop_running(unsigned thread_id, dag_node *in_node) { if (capture_time_) { run_on_stack(thread_id, [&](thread_stats &thread_stats) { auto end_time = clock::now(); auto user_code_duration = std::chrono::duration_cast(end_time - thread_stats.task_run_start_time_).count(); in_node->total_runtime_ += user_code_duration; }); } } void profiler::task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) { if (capture_memory_) { run_on_stack(thread_id, [&](thread_stats &) { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { stack_memory[i + j] = MAGIC_BYTES[j]; } } }); } } void profiler::task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) { if (capture_memory_) { run_on_stack(thread_id, [&](thread_stats &) { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { bool section_clean = true; for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { if (stack_memory[i + j] != MAGIC_BYTES[j]) { section_clean = false; } } in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES); if (!section_clean) { return; } } }); } } }