diff --git a/lib/pls/include/pls/internal/profiling/dag_node.h b/lib/pls/include/pls/internal/profiling/dag_node.h index f3c6d0e..8254969 100644 --- a/lib/pls/include/pls/internal/profiling/dag_node.h +++ b/lib/pls/include/pls/internal/profiling/dag_node.h @@ -1,9 +1,12 @@ #ifndef PLS_INTERNAL_PROFILING_DAG_NODE_H_ #define PLS_INTERNAL_PROFILING_DAG_NODE_H_ +#include "pls/internal/base/system_details.h" + #include #include #include +#include namespace pls::internal::profiling { @@ -17,6 +20,37 @@ struct dag_node { std::unique_ptr next_node_; std::list child_nodes_; + base::system_details::pointer_t node_print_id() { + return reinterpret_cast(this); + } + + void dag_compact() { + while (next_node_ && next_node_->child_nodes_.empty()) { + max_memory_ = std::max(max_memory_, next_node_->max_memory_); + total_runtime_ += next_node_->total_runtime_; + next_node_ = std::move(next_node_->next_node_); + } + if (next_node_) { + next_node_->dag_compact(); + } + + for (auto &child : child_nodes_) { + child.dag_compact(); + } + } + + void dag_print(std::ostream &stream, unsigned rank) { + stream << node_print_id() << " [label=" << spawning_thread_id_ << ", rank=" << rank << "];" << std::endl; + for (auto &child : child_nodes_) { + child.dag_print(stream, rank + 1); + stream << node_print_id() << " -> " << child.node_print_id() << ";" << std::endl; + } + if (next_node_) { + next_node_->dag_print(stream, rank); + stream << node_print_id() << " -> " << next_node_->node_print_id() << ";" << std::endl; + } + } + unsigned dag_max_memory() { unsigned max = max_memory_; if (next_node_) { @@ -29,7 +63,7 @@ struct dag_node { } unsigned long dag_total_user_time() { - unsigned total_user_time = total_runtime_; + unsigned long total_user_time = total_runtime_; if (next_node_) { total_user_time += next_node_->dag_total_user_time(); } @@ -38,6 +72,21 @@ struct dag_node { } return total_user_time; } + + unsigned long dag_critical_path() { + unsigned long critical_path = total_runtime_; + if (next_node_) { + critical_path += next_node_->dag_critical_path(); + } + + unsigned long slowest_child = 0; + for (auto &child : child_nodes_) { + slowest_child = std::max(slowest_child, child.dag_critical_path()); + } + critical_path += slowest_child; + + return critical_path; + } }; } diff --git a/lib/pls/include/pls/internal/profiling/profiler.h b/lib/pls/include/pls/internal/profiling/profiler.h index 883b29e..29972a0 100644 --- a/lib/pls/include/pls/internal/profiling/profiler.h +++ b/lib/pls/include/pls/internal/profiling/profiler.h @@ -2,7 +2,7 @@ #ifndef PLS_INTERNAL_PROFILING_PROFILER_H_ #define PLS_INTERNAL_PROFILING_PROFILER_H_ -#define PLS_PROFILING_ENABLED true +#define PLS_PROFILING_ENABLED false #include #include @@ -16,7 +16,8 @@ namespace pls::internal::profiling { class profiler { using clock = std::chrono::steady_clock; - using resolution = std::chrono::nanoseconds; + using measurement_resolution = std::chrono::nanoseconds; + using display_resolution = std::chrono::microseconds; struct profiler_run { profiler_run(unsigned num_threads) : start_time_{}, @@ -29,12 +30,22 @@ class profiler { std::unique_ptr root_node_; std::vector per_thread_stats_; + static unsigned long m_to_d(unsigned long duration) { + measurement_resolution measurement_duration{duration}; + return std::chrono::duration_cast(measurement_duration).count(); + } + void print_stats(unsigned num_threads) const { - auto run_duration = std::chrono::duration_cast(end_time_ - start_time_).count(); + root_node_->dag_compact(); + + auto run_duration = std::chrono::duration_cast(end_time_ - start_time_).count(); std::cout << "===========================" << std::endl; - std::cout << "WALL TIME: " << run_duration << std::endl; + std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl; unsigned long total_user_time = root_node_->dag_total_user_time(); - std::cout << "USER TIME: " << total_user_time << std::endl; + std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl; + unsigned long critical_path_time = root_node_->dag_critical_path(); + std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl; + std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl; unsigned long total_failed_steals = 0; unsigned long total_successful_steals = 0; @@ -44,7 +55,7 @@ class profiler { total_successful_steals += thread_stats.successful_steals_; total_steal_time += thread_stats.total_time_stealing_; } - std::cout << "STEALS: (Time " << total_steal_time + std::cout << "STEALS: (Time " << m_to_d(total_steal_time) << ", Total: " << (total_successful_steals + total_failed_steals) << ", Success: " << total_successful_steals << ", Failed: " << total_failed_steals << ")" << std::endl; @@ -54,6 +65,11 @@ class profiler { std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl; std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl; + + // TODO: re-add graph printing +// std::cout << "digraph {" << std::endl; +// root_node_->dag_print(std::cout, 0); +// std::cout << "}" << std::endl; } }; @@ -90,7 +106,8 @@ class profiler { auto end_time = clock::now(); auto - steal_duration = std::chrono::duration_cast(end_time - thread_stats.stealing_start_time_).count(); + steal_duration = + std::chrono::duration_cast(end_time - thread_stats.stealing_start_time_).count(); thread_stats.total_time_stealing_ += steal_duration; }); } @@ -138,7 +155,7 @@ class profiler { thread_stats.run_on_stack([&] { auto end_time = clock::now(); auto user_code_duration = - std::chrono::duration_cast(end_time - thread_stats.task_run_start_time).count(); + std::chrono::duration_cast(end_time - thread_stats.task_run_start_time).count(); in_node->total_runtime_ += user_code_duration; }); }