Commit aeb55d12 by FritzFlorian

Add output for basic stats: T_1, T_inf, T_wall, memory and basic graph drawing.

Basic profiling works. Next steps are to 'refine' the interface, i.e. allow to disable parts of the profiling and better output for some stats. Maybe look at storing/loading the DAG's with stats to/from disk.
parent 41809925
Pipeline #1474 passed with stages
in 3 minutes 46 seconds
#ifndef PLS_INTERNAL_PROFILING_DAG_NODE_H_ #ifndef PLS_INTERNAL_PROFILING_DAG_NODE_H_
#define PLS_INTERNAL_PROFILING_DAG_NODE_H_ #define PLS_INTERNAL_PROFILING_DAG_NODE_H_
#include "pls/internal/base/system_details.h"
#include <memory> #include <memory>
#include <list> #include <list>
#include <algorithm> #include <algorithm>
#include <iostream>
namespace pls::internal::profiling { namespace pls::internal::profiling {
...@@ -17,6 +20,37 @@ struct dag_node { ...@@ -17,6 +20,37 @@ struct dag_node {
std::unique_ptr<dag_node> next_node_; std::unique_ptr<dag_node> next_node_;
std::list<dag_node> child_nodes_; std::list<dag_node> child_nodes_;
base::system_details::pointer_t node_print_id() {
return reinterpret_cast<base::system_details::pointer_t >(this);
}
void dag_compact() {
while (next_node_ && next_node_->child_nodes_.empty()) {
max_memory_ = std::max(max_memory_, next_node_->max_memory_);
total_runtime_ += next_node_->total_runtime_;
next_node_ = std::move(next_node_->next_node_);
}
if (next_node_) {
next_node_->dag_compact();
}
for (auto &child : child_nodes_) {
child.dag_compact();
}
}
void dag_print(std::ostream &stream, unsigned rank) {
stream << node_print_id() << " [label=" << spawning_thread_id_ << ", rank=" << rank << "];" << std::endl;
for (auto &child : child_nodes_) {
child.dag_print(stream, rank + 1);
stream << node_print_id() << " -> " << child.node_print_id() << ";" << std::endl;
}
if (next_node_) {
next_node_->dag_print(stream, rank);
stream << node_print_id() << " -> " << next_node_->node_print_id() << ";" << std::endl;
}
}
unsigned dag_max_memory() { unsigned dag_max_memory() {
unsigned max = max_memory_; unsigned max = max_memory_;
if (next_node_) { if (next_node_) {
...@@ -29,7 +63,7 @@ struct dag_node { ...@@ -29,7 +63,7 @@ struct dag_node {
} }
unsigned long dag_total_user_time() { unsigned long dag_total_user_time() {
unsigned total_user_time = total_runtime_; unsigned long total_user_time = total_runtime_;
if (next_node_) { if (next_node_) {
total_user_time += next_node_->dag_total_user_time(); total_user_time += next_node_->dag_total_user_time();
} }
...@@ -38,6 +72,21 @@ struct dag_node { ...@@ -38,6 +72,21 @@ struct dag_node {
} }
return total_user_time; return total_user_time;
} }
unsigned long dag_critical_path() {
unsigned long critical_path = total_runtime_;
if (next_node_) {
critical_path += next_node_->dag_critical_path();
}
unsigned long slowest_child = 0;
for (auto &child : child_nodes_) {
slowest_child = std::max(slowest_child, child.dag_critical_path());
}
critical_path += slowest_child;
return critical_path;
}
}; };
} }
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
#ifndef PLS_INTERNAL_PROFILING_PROFILER_H_ #ifndef PLS_INTERNAL_PROFILING_PROFILER_H_
#define PLS_INTERNAL_PROFILING_PROFILER_H_ #define PLS_INTERNAL_PROFILING_PROFILER_H_
#define PLS_PROFILING_ENABLED true #define PLS_PROFILING_ENABLED false
#include <memory> #include <memory>
#include <chrono> #include <chrono>
...@@ -16,7 +16,8 @@ ...@@ -16,7 +16,8 @@
namespace pls::internal::profiling { namespace pls::internal::profiling {
class profiler { class profiler {
using clock = std::chrono::steady_clock; using clock = std::chrono::steady_clock;
using resolution = std::chrono::nanoseconds; using measurement_resolution = std::chrono::nanoseconds;
using display_resolution = std::chrono::microseconds;
struct profiler_run { struct profiler_run {
profiler_run(unsigned num_threads) : start_time_{}, profiler_run(unsigned num_threads) : start_time_{},
...@@ -29,12 +30,22 @@ class profiler { ...@@ -29,12 +30,22 @@ class profiler {
std::unique_ptr<dag_node> root_node_; std::unique_ptr<dag_node> root_node_;
std::vector<thread_stats> per_thread_stats_; std::vector<thread_stats> per_thread_stats_;
static unsigned long m_to_d(unsigned long duration) {
measurement_resolution measurement_duration{duration};
return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
}
void print_stats(unsigned num_threads) const { void print_stats(unsigned num_threads) const {
auto run_duration = std::chrono::duration_cast<resolution>(end_time_ - start_time_).count(); root_node_->dag_compact();
auto run_duration = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
std::cout << "===========================" << std::endl; std::cout << "===========================" << std::endl;
std::cout << "WALL TIME: " << run_duration << std::endl; std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl;
unsigned long total_user_time = root_node_->dag_total_user_time(); unsigned long total_user_time = root_node_->dag_total_user_time();
std::cout << "USER TIME: " << total_user_time << std::endl; std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl;
unsigned long critical_path_time = root_node_->dag_critical_path();
std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl;
std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl;
unsigned long total_failed_steals = 0; unsigned long total_failed_steals = 0;
unsigned long total_successful_steals = 0; unsigned long total_successful_steals = 0;
...@@ -44,7 +55,7 @@ class profiler { ...@@ -44,7 +55,7 @@ class profiler {
total_successful_steals += thread_stats.successful_steals_; total_successful_steals += thread_stats.successful_steals_;
total_steal_time += thread_stats.total_time_stealing_; total_steal_time += thread_stats.total_time_stealing_;
} }
std::cout << "STEALS: (Time " << total_steal_time std::cout << "STEALS: (Time " << m_to_d(total_steal_time)
<< ", Total: " << (total_successful_steals + total_failed_steals) << ", Total: " << (total_successful_steals + total_failed_steals)
<< ", Success: " << total_successful_steals << ", Success: " << total_successful_steals
<< ", Failed: " << total_failed_steals << ")" << std::endl; << ", Failed: " << total_failed_steals << ")" << std::endl;
...@@ -54,6 +65,11 @@ class profiler { ...@@ -54,6 +65,11 @@ class profiler {
std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl; std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;
std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl; std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl;
// TODO: re-add graph printing
// std::cout << "digraph {" << std::endl;
// root_node_->dag_print(std::cout, 0);
// std::cout << "}" << std::endl;
} }
}; };
...@@ -90,7 +106,8 @@ class profiler { ...@@ -90,7 +106,8 @@ class profiler {
auto end_time = clock::now(); auto end_time = clock::now();
auto auto
steal_duration = std::chrono::duration_cast<resolution>(end_time - thread_stats.stealing_start_time_).count(); steal_duration =
std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
thread_stats.total_time_stealing_ += steal_duration; thread_stats.total_time_stealing_ += steal_duration;
}); });
} }
...@@ -138,7 +155,7 @@ class profiler { ...@@ -138,7 +155,7 @@ class profiler {
thread_stats.run_on_stack([&] { thread_stats.run_on_stack([&] {
auto end_time = clock::now(); auto end_time = clock::now();
auto user_code_duration = auto user_code_duration =
std::chrono::duration_cast<resolution>(end_time - thread_stats.task_run_start_time).count(); std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time).count();
in_node->total_runtime_ += user_code_duration; in_node->total_runtime_ += user_code_duration;
}); });
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment