#ifndef PLS_INTERNAL_PROFILING_PROFILER_H_
#define PLS_INTERNAL_PROFILING_PROFILER_H_

#ifndef PLS_PROFILING_ENABLED
#define PLS_PROFILING_ENABLED true
#endif

#include <chrono>
#include <cstddef>
#include <memory>
#include <ostream>
#include <vector>

#include "dag_node.h"
#include "thread_stats.h"

namespace pls::internal::profiling {

class profiler {
  using clock = std::chrono::steady_clock;
  using measurement_resolution = std::chrono::nanoseconds;
  using display_resolution = std::chrono::microseconds;

  // Convert a raw measurement (nanoseconds) to the display resolution (microseconds).
  static unsigned long m_to_d(unsigned long duration) {
    measurement_resolution measurement_duration{duration};
    return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
  }

  struct profiler_run {
    explicit profiler_run(unsigned num_threads)
        : start_time_{},
          end_time_{},
          root_node_{std::make_unique<dag_node>(0)},
          per_thread_stats_(num_threads),
          num_threads_{num_threads} {}

    // Runtime stats
    clock::time_point start_time_;
    clock::time_point end_time_;
    std::unique_ptr<dag_node> root_node_;
    std::vector<thread_stats> per_thread_stats_;

    // Collective stats
    unsigned num_threads_;
    unsigned long wall_time_;
    unsigned long t_1_;
    unsigned long t_inf_;
    unsigned long steals_failed_;
    unsigned long steals_successful_;
    unsigned long steals_cas_ops_;
    unsigned long steals_time_;
    unsigned long max_memory_per_stack_;
    unsigned long spawn_depth_;

    void calculate_stats();
    void print_stats() const;
    void print_dag(std::ostream &stream);
  };

 public:
  explicit profiler(unsigned num_threads)
      : num_threads_{num_threads},
        capture_memory_{true},
        capture_time_{true},
        profiler_runs_() {
    for (unsigned i = 0; i < num_threads_; i++) {
      stacks_.push_back(stack_allocator_.allocate_stack(STACK_SIZE));
    }
  }

  ~profiler() {
    for (unsigned i = 0; i < num_threads_; i++) {
      stack_allocator_.free_stack(STACK_SIZE, stacks_[i]);
    }
  }

  dag_node *start_profiler_run();
  void stop_profiler_run();

  void stealing_start(unsigned thread_id);
  void stealing_end(unsigned thread_id, bool success);
  void stealing_cas_op(unsigned thread_id);

  dag_node *task_spawn_child(unsigned thread_id, dag_node *parent);
  dag_node *task_sync(unsigned thread_id, dag_node *synced);
  void task_start_running(unsigned thread_id, dag_node *in_node);
  void task_stop_running(unsigned thread_id, dag_node *in_node);

  void task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size);
  void task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node);

  profiler_run &current_run() { return profiler_runs_[profiler_runs_.size() - 1]; }
  thread_stats &thread_stats_for(unsigned thread_id) { return current_run().per_thread_stats_[thread_id]; }

  void disable_time_measure() { capture_time_ = false; }
  void enable_time_measure() { capture_time_ = true; }
  void disable_memory_measure() { capture_memory_ = false; }
  void enable_memory_measure() { capture_memory_ = true; }

 private:
  // Byte pattern written into task stacks before measurement; checking how much
  // of it was overwritten afterwards yields the stack's high-water mark.
  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};

  unsigned num_threads_;
  bool capture_memory_;
  bool capture_time_;
  std::vector<profiler_run> profiler_runs_;

  // Separate stacks to run the profiler code on, so it does not influence the profiled stacks.
  static constexpr size_t STACK_SIZE = 4096 * 4;
  base::mmap_stack_allocator stack_allocator_;
  std::vector<char *> stacks_;

  // Execute function on the dedicated profiler stack of the given thread,
  // handing it that thread's stats object.
  template <typename Function>
  void run_on_stack(unsigned thread_id, const Function function) {
    context_switcher::enter_context(stacks_[thread_id], STACK_SIZE, [this, thread_id, function](auto cont) {
      function(thread_stats_for(thread_id));
      return cont;
    });
  }
};

}  // namespace pls::internal::profiling

#endif  // PLS_INTERNAL_PROFILING_PROFILER_H_
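
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this header). It assumes a
// scheduler integration where each worker reports its events to the profiler;
// the surrounding scheduler code (worker loop, thread_id, parent node) is
// hypothetical, while the profiler calls match the interface declared above.
//
//   profiler prof{/*num_threads=*/4};
//
//   dag_node *root = prof.start_profiler_run();   // begin a new run, get DAG root
//
//   // Inside a worker (thread_id known, parent = currently running node):
//   dag_node *child = prof.task_spawn_child(thread_id, parent);
//   prof.task_start_running(thread_id, child);
//   // ... execute the task ...
//   prof.task_stop_running(thread_id, child);
//   prof.task_sync(thread_id, child);
//
//   // When a worker attempts a steal:
//   prof.stealing_start(thread_id);
//   prof.stealing_cas_op(thread_id);              // one CAS attempt
//   prof.stealing_end(thread_id, /*success=*/true);
//
//   prof.stop_profiler_run();                     // finish run, aggregate stats
// ---------------------------------------------------------------------------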