profiler.cpp 5.31 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
#include "pls/internal/profiling/profiler.h"

namespace pls::internal::profiling {
void profiler::profiler_run::print_stats(unsigned num_threads) const {
  root_node_->dag_compact();

  auto run_duration = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
  std::cout << "===========================" << std::endl;
  std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl;
  unsigned long total_user_time = root_node_->dag_total_user_time();
  std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl;
  unsigned long critical_path_time = root_node_->dag_critical_path();
  std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl;
  std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl;

  unsigned long total_failed_steals = 0;
  unsigned long total_successful_steals = 0;
  unsigned long total_steal_time = 0;
  for (auto &thread_stats : per_thread_stats_) {
    total_failed_steals += thread_stats.failed_steals_;
    total_successful_steals += thread_stats.successful_steals_;
    total_steal_time += thread_stats.total_time_stealing_;
  }
  std::cout << "STEALS: (Time " << m_to_d(total_steal_time)
            << ", Total: " << (total_successful_steals + total_failed_steals)
            << ", Success: " << total_successful_steals
            << ", Failed: " << total_failed_steals << ")" << std::endl;

  unsigned long total_measured = total_steal_time + total_user_time;
  unsigned long total_wall = run_duration * num_threads;
  std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;

  std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl;

  // TODO: re-add graph printing

//      std::cout << "digraph {" << std::endl;
//      root_node_->dag_print(std::cout, 0);
//      std::cout << "}" << std::endl;
}

dag_node *profiler::start_profiler_run() {
  profiler_run &current_run = profiler_runs_.emplace_back(num_threads_);
  current_run.start_time_ = clock::now();
  return current_run.root_node_.get();
}

void profiler::stop_profiler_run() {
  current_run().end_time_ = clock::now();
  current_run().print_stats(num_threads_);
}

void profiler::stealing_start(unsigned thread_id) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    thread_stats.stealing_start_time_ = clock::now();
    thread_stats.total_steals_++;
  });
}

void profiler::stealing_end(unsigned thread_id, bool success) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    thread_stats.failed_steals_ += !success;
    thread_stats.successful_steals_ += success;

    auto end_time = clock::now();
    auto
        steal_duration =
        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
    thread_stats.total_time_stealing_ += steal_duration;
  });
}

void profiler::stealing_cas_op(unsigned thread_id) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    thread_stats.steal_cas_ops_++;
  });
}

dag_node *profiler::task_spawn_child(unsigned thread_id, dag_node *parent) {
  auto &thread_stats = thread_stats_for(thread_id);

  dag_node *result;
  thread_stats.run_on_stack([&] {
    result = &parent->child_nodes_.emplace_back(thread_id);
  });
  return result;
}

dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) {
  auto &thread_stats = thread_stats_for(thread_id);

  dag_node *result;
  thread_stats.run_on_stack([&] {
    synced->next_node_ = std::make_unique<dag_node>(thread_id);
    result = synced->next_node_.get();
  });
  return result;
}

void profiler::task_start_running(unsigned thread_id, dag_node *in_node) {
  (void) in_node; // unused, keep for 'symmetric' API
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    thread_stats.task_run_start_time = clock::now();
  });
}

void profiler::task_stop_running(unsigned thread_id, dag_node *in_node) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    auto end_time = clock::now();
    auto user_code_duration =
        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time).count();
    in_node->total_runtime_ += user_code_duration;
  });
}

void profiler::task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
        stack_memory[i + j] = MAGIC_BYTES[j];
      }
    }
  });
}

void profiler::task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) {
  auto &thread_stats = thread_stats_for(thread_id);

  thread_stats.run_on_stack([&] {
    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
      bool section_clean = true;
      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
        if (stack_memory[i + j] != MAGIC_BYTES[j]) {
          section_clean = false;
        }
      }

      in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
      if (!section_clean) {
        return;
      }
    }
  });
}

}