#ifndef PLS_INTERNAL_PROFILING_PROFILER_H_ #define PLS_INTERNAL_PROFILING_PROFILER_H_ #define PLS_PROFILING_ENABLED false #include #include #include #include #include "dag_node.h" #include "thread_stats.h" namespace pls::internal::profiling { class profiler { using clock = std::chrono::steady_clock; using measurement_resolution = std::chrono::nanoseconds; using display_resolution = std::chrono::microseconds; struct profiler_run { profiler_run(unsigned num_threads) : start_time_{}, end_time_{}, root_node_{std::make_unique(0)}, per_thread_stats_(num_threads) {} clock::time_point start_time_; clock::time_point end_time_; std::unique_ptr root_node_; std::vector per_thread_stats_; static unsigned long m_to_d(unsigned long duration) { measurement_resolution measurement_duration{duration}; return std::chrono::duration_cast(measurement_duration).count(); } void print_stats(unsigned num_threads) const { root_node_->dag_compact(); auto run_duration = std::chrono::duration_cast(end_time_ - start_time_).count(); std::cout << "===========================" << std::endl; std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl; unsigned long total_user_time = root_node_->dag_total_user_time(); std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl; unsigned long critical_path_time = root_node_->dag_critical_path(); std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl; std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl; unsigned long total_failed_steals = 0; unsigned long total_successful_steals = 0; unsigned long total_steal_time = 0; for (auto &thread_stats : per_thread_stats_) { total_failed_steals += thread_stats.failed_steals_; total_successful_steals += thread_stats.successful_steals_; total_steal_time += thread_stats.total_time_stealing_; } std::cout << "STEALS: (Time " << m_to_d(total_steal_time) << ", Total: " << (total_successful_steals + total_failed_steals) << ", Success: " << total_successful_steals << ", Failed: " << total_failed_steals << ")" << std::endl; unsigned long total_measured = total_steal_time + total_user_time; unsigned long total_wall = run_duration * num_threads; std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl; std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl; // TODO: re-add graph printing // std::cout << "digraph {" << std::endl; // root_node_->dag_print(std::cout, 0); // std::cout << "}" << std::endl; } }; public: profiler(unsigned num_threads) : num_threads_{num_threads}, profiler_runs_() {} dag_node *start_profiler_run() { profiler_run ¤t_run = profiler_runs_.emplace_back(num_threads_); current_run.start_time_ = clock::now(); return current_run.root_node_.get(); } void stop_profiler_run() { current_run().end_time_ = clock::now(); current_run().print_stats(num_threads_); } void stealing_start(unsigned thread_id) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.stealing_start_time_ = clock::now(); thread_stats.total_steals_++; }); } void stealing_end(unsigned thread_id, bool success) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.failed_steals_ += !success; thread_stats.successful_steals_ += success; auto end_time = clock::now(); auto steal_duration = std::chrono::duration_cast(end_time - thread_stats.stealing_start_time_).count(); thread_stats.total_time_stealing_ += steal_duration; }); } void stealing_cas_op(unsigned thread_id) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.steal_cas_ops_++; }); } dag_node *task_spawn_child(unsigned thread_id, dag_node *parent) { auto &thread_stats = thread_stats_for(thread_id); dag_node *result; thread_stats.run_on_stack([&] { result = &parent->child_nodes_.emplace_back(thread_id); }); return result; } dag_node *task_sync(unsigned thread_id, dag_node *synced) { auto &thread_stats = thread_stats_for(thread_id); dag_node *result; thread_stats.run_on_stack([&] { synced->next_node_ = std::make_unique(thread_id); result = synced->next_node_.get(); }); return result; } void task_start_running(unsigned thread_id, dag_node *in_node) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { thread_stats.task_run_start_time = clock::now(); }); } void task_stop_running(unsigned thread_id, dag_node *in_node) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { auto end_time = clock::now(); auto user_code_duration = std::chrono::duration_cast(end_time - thread_stats.task_run_start_time).count(); in_node->total_runtime_ += user_code_duration; }); } static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'}; void task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { stack_memory[i + j] = MAGIC_BYTES[j]; } } }); } void task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) { auto &thread_stats = thread_stats_for(thread_id); thread_stats.run_on_stack([&] { for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) { bool section_clean = true; for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) { if (stack_memory[i + j] != MAGIC_BYTES[j]) { section_clean = false; } } in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES); if (!section_clean) { return; } } }); } private: profiler_run ¤t_run() { return profiler_runs_[profiler_runs_.size() - 1]; } thread_stats &thread_stats_for(unsigned thread_id) { return current_run().per_thread_stats_[thread_id]; } unsigned num_threads_; std::vector profiler_runs_; }; } #endif //PLS_INTERNAL_PROFILING_PROFILER_H_