From 3202323c6ec38f0e9883baba4bf0cdd4ccf8f546 Mon Sep 17 00:00:00 2001
From: FritzFlorian <flo.fritz@t-online.de>
Date: Tue, 19 May 2020 21:20:43 +0200
Subject: [PATCH] Finish up DAG profiler.

---
 app/benchmark_fib/main.cpp                               |   4 ++++
 app/benchmark_matrix/main.cpp                            |   5 +++++
 lib/pls/include/pls/internal/profiling/dag_node.h        |  10 ++++++++++
 lib/pls/include/pls/internal/profiling/profiler.h        |  82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
 lib/pls/include/pls/internal/profiling/thread_stats.h    |  24 +++---------------------
 lib/pls/include/pls/internal/scheduling/scheduler.h      |   6 ++++++
 lib/pls/include/pls/internal/scheduling/scheduler_impl.h |  27 ++++++++++++++++-----------
 lib/pls/src/internal/profiling/dag_node.cpp              |  20 +++++++++++++++++++-
 lib/pls/src/internal/profiling/profiler.cpp              | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------------------------------
 lib/pls/src/internal/scheduling/scheduler.cpp            |   4 ++--
 10 files changed, 232 insertions(+), 132 deletions(-)
diff --git a/app/benchmark_fib/main.cpp b/app/benchmark_fib/main.cpp
index fdfe815..5f97c24 100644
--- a/app/benchmark_fib/main.cpp
+++ b/app/benchmark_fib/main.cpp
@@ -44,11 +44,15 @@ int main(int argc, char **argv) {
   scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
 
   volatile int res;
+  scheduler.get_profiler().disable_memory_measure();
   runner.run_iterations(fib::NUM_ITERATIONS, [&]() {
     scheduler.perform_work([&]() {
       res = pls_fib(fib::INPUT_N);
     });
   }, fib::NUM_WARMUP_ITERATIONS);
+  scheduler.get_profiler().current_run().print_dag(std::cout);
+  scheduler.get_profiler().current_run().print_stats();
+
   runner.commit_results(true);
 
   return 0;
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index 9ba1612..2896a03 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -4,6 +4,7 @@ using namespace pls;
 
 #include "benchmark_runner.h"
 #include "benchmark_base/matrix.h"
+#include <iostream>
 
 using namespace comparison_benchmarks::base;
 
@@ -37,10 +38,14 @@ int main(int argc, char **argv) {
 
   scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};
 
+  scheduler.get_profiler().disable_memory_measure();
   runner.run_iterations(matrix::NUM_ITERATIONS, [&]() {
     scheduler.perform_work([&]() {
       result.multiply(a, b);
     });
   }, matrix::WARMUP_ITERATIONS);
+  scheduler.get_profiler().current_run().print_dag(std::cout);
+  scheduler.get_profiler().current_run().print_stats();
+
   runner.commit_results(true);
 }
diff --git a/lib/pls/include/pls/internal/profiling/dag_node.h b/lib/pls/include/pls/internal/profiling/dag_node.h
index eabc44e..6edbbb7 100644
--- a/lib/pls/include/pls/internal/profiling/dag_node.h
+++ b/lib/pls/include/pls/internal/profiling/dag_node.h
@@ -6,10 +6,19 @@
 #include <memory>
 #include <list>
 #include <iostream>
+#include <chrono>
 
 namespace pls::internal::profiling {
 
 struct dag_node {
+  using clock = std::chrono::steady_clock;
+  using measurement_resolution = std::chrono::nanoseconds;
+  using display_resolution = std::chrono::microseconds;
+  static unsigned long m_to_d(unsigned long duration) {
+    measurement_resolution measurement_duration{duration};
+    return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
+  }
+
   dag_node(unsigned spawning_thread_id) : spawning_thread_id_{spawning_thread_id} {};
 
   unsigned spawning_thread_id_;
@@ -28,6 +37,7 @@ struct dag_node {
   unsigned dag_max_memory();
   unsigned long dag_total_user_time();
   unsigned long dag_critical_path();
+  unsigned long dag_depth();
 };
 
 }
diff --git a/lib/pls/include/pls/internal/profiling/profiler.h b/lib/pls/include/pls/internal/profiling/profiler.h
index 2a18d5a..fcae1f9 100644
--- a/lib/pls/include/pls/internal/profiling/profiler.h
+++ b/lib/pls/include/pls/internal/profiling/profiler.h
@@ -3,7 +3,7 @@
 #define PLS_INTERNAL_PROFILING_PROFILER_H_
 
 #ifndef PLS_PROFILING_ENABLED
-#define PLS_PROFILING_ENABLED false
+#define PLS_PROFILING_ENABLED true
 #endif
 
 #include <memory>
@@ -20,29 +20,60 @@ class profiler {
   using clock = std::chrono::steady_clock;
   using measurement_resolution = std::chrono::nanoseconds;
   using display_resolution = std::chrono::microseconds;
+  static unsigned long m_to_d(unsigned long duration) {
+    measurement_resolution measurement_duration{duration};
+    return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
+  }
 
   struct profiler_run {
     profiler_run(unsigned num_threads) : start_time_{},
                                          end_time_{},
                                          root_node_{std::make_unique<dag_node>(0)},
-                                         per_thread_stats_(num_threads) {}
+                                         per_thread_stats_(num_threads),
+                                         num_threads_{num_threads} {}
 
+    // Runtime stats
     clock::time_point start_time_;
     clock::time_point end_time_;
     std::unique_ptr<dag_node> root_node_;
     std::vector<thread_stats> per_thread_stats_;
 
-    static unsigned long m_to_d(unsigned long duration) {
-      measurement_resolution measurement_duration{duration};
-      return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
-    }
+    // Collective stats
+    unsigned num_threads_;
+
+    unsigned long wall_time_;
+
+    unsigned long t_1_;
+    unsigned long t_inf_;
+
+    unsigned long steals_failed_;
+    unsigned long steals_successful_;
+    unsigned long steals_cas_ops_;
+    unsigned long steals_time_;
+
+    unsigned long max_memory_per_stack_;
+    unsigned long spawn_depth_;
 
-    void print_stats(unsigned num_threads) const;
+    void calculate_stats();
+    void print_stats() const;
+    void print_dag(std::ostream &stream);
   };
 
  public:
-  profiler(unsigned num_threads) : num_threads_{num_threads},
-                                   profiler_runs_() {}
+  explicit profiler(unsigned num_threads) : num_threads_{num_threads},
+                                            capture_memory_{true},
+                                            capture_time_{true},
+                                            profiler_runs_() {
+    for (unsigned i = 0; i < num_threads_; i++) {
+      stacks_.push_back(stack_allocator_.allocate_stack(STACK_SIZE));
+    }
+  }
+
+  ~profiler() {
+    for (unsigned i = 0; i < num_threads_; i++) {
+      stack_allocator_.free_stack(STACK_SIZE, stacks_[i]);
+    }
+  }
 
   dag_node *start_profiler_run();
   void stop_profiler_run();
@@ -58,8 +89,6 @@ class profiler {
   void task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size);
   void task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node);
 
- private:
-  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};
   profiler_run &current_run() {
     return profiler_runs_[profiler_runs_.size() - 1];
   }
@@ -67,8 +96,39 @@ class profiler {
     return current_run().per_thread_stats_[thread_id];
   }
 
+  void disable_time_measure() {
+    capture_time_ = false;
+  }
+  void enable_time_measure() {
+    capture_time_ = true;
+  }
+  void disable_memory_measure() {
+    capture_memory_ = false;
+  }
+  void enable_memory_measure() {
+    capture_memory_ = true;
+  }
+
+ private:
+  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};
+
   unsigned num_threads_;
+  bool capture_memory_;
+  bool capture_time_;
   std::vector<profiler_run> profiler_runs_;
+
+  // Stacks to run the profiler code to not influence profiled stacks.
+  static constexpr size_t STACK_SIZE = 4096 * 4;
+  base::mmap_stack_allocator stack_allocator_;
+  std::vector<char *> stacks_;
+
+  template<typename Function>
+  void run_on_stack(unsigned thread_id, const Function function) {
+    context_switcher::enter_context(stacks_[thread_id], STACK_SIZE, [this, thread_id, function](auto cont) {
+      function(thread_stats_for(thread_id));
+      return cont;
+    });
+  }
 };
 }
 
diff --git a/lib/pls/include/pls/internal/profiling/thread_stats.h b/lib/pls/include/pls/internal/profiling/thread_stats.h
index 0249ed4..7b3fb7a 100644
--- a/lib/pls/include/pls/internal/profiling/thread_stats.h
+++ b/lib/pls/include/pls/internal/profiling/thread_stats.h
@@ -10,26 +10,11 @@
 #include "pls/internal/base/system_details.h"
 #include "pls/internal/base/stack_allocator.h"
 
+#include "pls/internal/profiling/dag_node.h"
+
 namespace pls::internal::profiling {
 
 struct PLS_CACHE_ALIGN thread_stats {
-  static constexpr size_t STACK_SIZE = 4096 * 4;
-
-  thread_stats() {
-    stack_ = stack_allocator_.allocate_stack(STACK_SIZE);
-  }
-  ~thread_stats() {
-    stack_allocator_.free_stack(STACK_SIZE, stack_);
-  }
-
-  template<typename Function>
-  void run_on_stack(const Function function) {
-    context_switcher::enter_context(stack_, STACK_SIZE, [function](auto cont) {
-      function();
-      return cont;
-    });
-  }
-
   using clock = std::chrono::steady_clock;
 
   unsigned long total_steals_{0};
@@ -40,10 +25,7 @@ struct PLS_CACHE_ALIGN thread_stats {
   unsigned long total_time_stealing_{0};
 
   clock::time_point stealing_start_time_;
-  clock::time_point task_run_start_time;
-
-  base::mmap_stack_allocator stack_allocator_;
-  char *stack_;
+  clock::time_point task_run_start_time_;
 };
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h
index edb95d2..b9affc5 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -99,6 +99,12 @@ class scheduler {
   thread_state &thread_state_for(unsigned int thread_id) { return *thread_states_[thread_id]; }
   task_manager &task_manager_for(unsigned int thread_id) { return *task_managers_[thread_id]; }
 
+#if PLS_PROFILING_ENABLED
+  profiling::profiler &get_profiler() {
+    return profiler_;
+  }
+#endif
+
  private:
   static context_switcher::continuation slow_return(thread_state &calling_state);
 
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index 5e4dee3..55bb8e3 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -31,7 +31,7 @@ scheduler::scheduler(unsigned int num_threads,
     terminated_{false},
     stack_allocator_{std::make_shared<ALLOC>(std::forward<ALLOC>(stack_allocator))}
 #if PLS_PROFILING_ENABLED
-, profiler_{num_threads}
+    , profiler_{num_threads}
 #endif
 {
 
@@ -87,18 +87,16 @@ class scheduler::init_function_impl : public init_function {
       PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");
 
 #if PLS_PROFILING_ENABLED
+      thread_state::get().get_scheduler().profiler_.task_finish_stack_measure(thread_state::get().get_thread_id(),
+                                                                              root_task->stack_memory_,
+                                                                              root_task->stack_size_,
+                                                                              root_task->profiling_node_);
       thread_state::get().get_scheduler().profiler_.task_stop_running(thread_state::get().get_thread_id(),
                                                                       root_task->profiling_node_);
 #endif
+
       return std::move(thread_state::get().main_continuation());
     });
-
-#if PLS_PROFILING_ENABLED
-    thread_state::get().get_scheduler().profiler_.task_finish_stack_measure(thread_state::get().get_thread_id(),
-                                                                            root_task->stack_memory_,
-                                                                            root_task->stack_size_,
-                                                                            root_task->profiling_node_);
-#endif
   }
  private:
   F &function_;
@@ -141,7 +139,6 @@ void scheduler::spawn(Function &&lambda) {
   base_task *spawned_task = last_task->next_;
 
 #if PLS_PROFILING_ENABLED
-  // Memory and DAG nodes
   spawning_state.get_scheduler().profiler_.task_prepare_stack_measure(spawning_state.get_thread_id(),
                                                                       spawned_task->stack_memory_,
                                                                       spawned_task->stack_size_);
@@ -189,24 +186,32 @@ void scheduler::spawn(Function &&lambda) {
 
       syncing_state.set_active_task(last_task);
 #if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
+                                                                        spawned_task->stack_memory_,
+                                                                        spawned_task->stack_size_,
+                                                                        spawned_task->profiling_node_);
       syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
                                                                 spawned_task->profiling_node_);
 #endif
       return std::move(last_task->continuation_);
     } else {
       // Slow path, the last task was stolen. This path is common to sync() events.
-      auto continuation = slow_return(syncing_state);
 #if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
+                                                                        spawned_task->stack_memory_,
+                                                                        spawned_task->stack_size_,
+                                                                        spawned_task->profiling_node_);
       syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
                                                                 spawned_task->profiling_node_);
 #endif
+      auto continuation = slow_return(syncing_state);
       return continuation;
     }
   });
 
 #if PLS_PROFILING_ENABLED
   thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
-                                                                   thread_state::get().get_active_task()->profiling_node_);
+                                                                   last_task->profiling_node_);
 #endif
 
   if (continuation.valid()) {
diff --git a/lib/pls/src/internal/profiling/dag_node.cpp b/lib/pls/src/internal/profiling/dag_node.cpp
index 55f2b30..0c3d611 100644
--- a/lib/pls/src/internal/profiling/dag_node.cpp
+++ b/lib/pls/src/internal/profiling/dag_node.cpp
@@ -20,7 +20,11 @@ void dag_node::dag_compact() {
 }
 
 void dag_node::dag_print(std::ostream &stream, unsigned rank) {
-  stream << node_print_id() << " [label=" << spawning_thread_id_ << ", rank=" << rank << "];" << std::endl;
+  stream << node_print_id()
+         << " [label=\"" << spawning_thread_id_ << "\n"
+         << max_memory_ << " bytes\n"
+         << m_to_d(total_runtime_) << " us\""
+         << " ,rank=" << rank << "];" << std::endl;
   for (auto &child : child_nodes_) {
     child.dag_print(stream, rank + 1);
     stream << node_print_id() << " -> " << child.node_print_id() << ";" << std::endl;
@@ -68,4 +72,18 @@ unsigned long dag_node::dag_critical_path() {
   return critical_path;
 }
 
+unsigned long dag_node::dag_depth() {
+  unsigned long deepest_at_level = 1;
+  if (next_node_) {
+    deepest_at_level = std::max(deepest_at_level, next_node_->dag_depth());
+  }
+
+  unsigned long deepest_child = 0;
+  for (auto &child : child_nodes_) {
+    deepest_child = std::max(deepest_child, child.dag_depth());
+  }
+
+  return std::max(deepest_child + 1, deepest_at_level);
+}
+
 }
diff --git a/lib/pls/src/internal/profiling/profiler.cpp b/lib/pls/src/internal/profiling/profiler.cpp
index aa9ec65..74b514c 100644
--- a/lib/pls/src/internal/profiling/profiler.cpp
+++ b/lib/pls/src/internal/profiling/profiler.cpp
@@ -1,42 +1,56 @@
 #include "pls/internal/profiling/profiler.h"
 
 namespace pls::internal::profiling {
-void profiler::profiler_run::print_stats(unsigned num_threads) const {
+
+void profiler::profiler_run::calculate_stats() {
   root_node_->dag_compact();
 
-  auto run_duration = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
-  std::cout << "===========================" << std::endl;
-  std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl;
-  unsigned long total_user_time = root_node_->dag_total_user_time();
-  std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl;
-  unsigned long critical_path_time = root_node_->dag_critical_path();
-  std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl;
-  std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl;
-
-  unsigned long total_failed_steals = 0;
-  unsigned long total_successful_steals = 0;
-  unsigned long total_steal_time = 0;
+  wall_time_ = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
+
+  t_1_ = root_node_->dag_total_user_time();
+  t_inf_ = root_node_->dag_critical_path();
+
+  steals_failed_ = 0;
+  steals_successful_ = 0;
+  steals_cas_ops_ = 0;
+  steals_time_ = 0;
   for (auto &thread_stats : per_thread_stats_) {
-    total_failed_steals += thread_stats.failed_steals_;
-    total_successful_steals += thread_stats.successful_steals_;
-    total_steal_time += thread_stats.total_time_stealing_;
+    steals_failed_ += thread_stats.failed_steals_;
+    steals_successful_ += thread_stats.successful_steals_;
+    steals_cas_ops_ += thread_stats.steal_cas_ops_;
+    steals_time_ += thread_stats.total_time_stealing_;
   }
-  std::cout << "STEALS: (Time " << m_to_d(total_steal_time)
-            << ", Total: " << (total_successful_steals + total_failed_steals)
-            << ", Success: " << total_successful_steals
-            << ", Failed: " << total_failed_steals << ")" << std::endl;
 
-  unsigned long total_measured = total_steal_time + total_user_time;
-  unsigned long total_wall = run_duration * num_threads;
-  std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;
+  max_memory_per_stack_ = root_node_->dag_max_memory();
+  spawn_depth_ = root_node_->dag_depth();
+}
 
-  std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl;
+void profiler::profiler_run::print_stats() const {
 
-  // TODO: re-add graph printing
+  std::cout << "===========================" << std::endl;
+  std::cout << "WALL TIME: " << m_to_d(wall_time_) << std::endl;
+  std::cout << "USER TIME: " << m_to_d(t_1_) << std::endl;
+  std::cout << "CRITICAL TIME: " << m_to_d(t_inf_) << std::endl;
+  std::cout << "MAX SPEEDUP:" << (double) t_1_ / (double) t_inf_ << std::endl;
+
+  std::cout << "STEALS: (Time " << m_to_d(steals_time_)
+            << ", Total: " << (steals_failed_ + steals_successful_)
+            << ", Success: " << steals_successful_
+            << ", Failed: " << steals_failed_
+            << ", CAS: " << steals_cas_ops_ << ")" << std::endl;
+
+  unsigned long total_measured = steals_time_ + t_1_;
+  unsigned long total_wall = wall_time_ * num_threads_;
+  std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;
 
-//      std::cout << "digraph {" << std::endl;
-//      root_node_->dag_print(std::cout, 0);
-//      std::cout << "}" << std::endl;
+  std::cout << "MEMORY: " << max_memory_per_stack_ << " bytes per stack" << std::endl;
+  std::cout << "DEPTH:" << spawn_depth_ << std::endl;
+}
+
+void profiler::profiler_run::print_dag(std::ostream &stream) {
+  stream << "digraph {" << std::endl;
+  root_node_->dag_print(std::cout, 0);
+  stream << "}" << std::endl;
 }
 
 dag_node *profiler::start_profiler_run() {
@@ -47,56 +61,52 @@ dag_node *profiler::start_profiler_run() {
 
 void profiler::stop_profiler_run() {
   current_run().end_time_ = clock::now();
-  current_run().print_stats(num_threads_);
+  current_run().root_node_->dag_compact();
+  current_run().calculate_stats();
 }
 
 void profiler::stealing_start(unsigned thread_id) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    thread_stats.stealing_start_time_ = clock::now();
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
     thread_stats.total_steals_++;
+
+    if (capture_time_) {
+      thread_stats.stealing_start_time_ = clock::now();
+    }
   });
 }
 
 void profiler::stealing_end(unsigned thread_id, bool success) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
     thread_stats.failed_steals_ += !success;
     thread_stats.successful_steals_ += success;
 
-    auto end_time = clock::now();
-    auto
-        steal_duration =
-        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
-    thread_stats.total_time_stealing_ += steal_duration;
+    if (capture_time_) {
+      auto end_time = clock::now();
+      auto
+          steal_duration =
+          std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
+      thread_stats.total_time_stealing_ += steal_duration;
+    }
   });
 }
 
 void profiler::stealing_cas_op(unsigned thread_id) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
     thread_stats.steal_cas_ops_++;
   });
 }
 
 dag_node *profiler::task_spawn_child(unsigned thread_id, dag_node *parent) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
   dag_node *result;
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &) {
     result = &parent->child_nodes_.emplace_back(thread_id);
   });
   return result;
 }
 
 dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
   dag_node *result;
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &) {
     synced->next_node_ = std::make_unique<dag_node>(thread_id);
     result = synced->next_node_.get();
   });
@@ -104,55 +114,55 @@ dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) {
 }
 
 void profiler::task_start_running(unsigned thread_id, dag_node *in_node) {
-  (void) in_node; // unused, keep for 'symmetric' API
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    thread_stats.task_run_start_time = clock::now();
-  });
+  if (capture_time_) {
+    (void) in_node; // unused, keep for 'symmetric' API
+    run_on_stack(thread_id, [&](thread_stats &thread_stats) {
+      thread_stats.task_run_start_time_ = clock::now();
+    });
+  }
 }
 
 void profiler::task_stop_running(unsigned thread_id, dag_node *in_node) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    auto end_time = clock::now();
-    auto user_code_duration =
-        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time).count();
-    in_node->total_runtime_ += user_code_duration;
-  });
+  if (capture_time_) {
+    run_on_stack(thread_id, [&](thread_stats &thread_stats) {
+      auto end_time = clock::now();
+      auto user_code_duration =
+          std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time_).count();
+      in_node->total_runtime_ += user_code_duration;
+    });
+  }
 }
 
 void profiler::task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
-      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
-        stack_memory[i + j] = MAGIC_BYTES[j];
+  if (capture_memory_) {
+    run_on_stack(thread_id, [&](thread_stats &) {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          stack_memory[i + j] = MAGIC_BYTES[j];
+        }
       }
-    }
-  });
+    });
+  }
 }
 
 void profiler::task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
-      bool section_clean = true;
-      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
-        if (stack_memory[i + j] != MAGIC_BYTES[j]) {
-          section_clean = false;
+  if (capture_memory_) {
+    run_on_stack(thread_id, [&](thread_stats &) {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        bool section_clean = true;
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          if (stack_memory[i + j] != MAGIC_BYTES[j]) {
+            section_clean = false;
+          }
         }
-      }
 
-      in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
-      if (!section_clean) {
-        return;
+        in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
+        if (!section_clean) {
+          return;
+        }
       }
-    }
-  });
+    });
+  }
 }
 
 }
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index 3c418c3..e682e32 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -124,8 +124,8 @@ void scheduler::sync() {
 
   if (active_task->is_synchronized_) {
 #if PLS_PROFILING_ENABLED
-    syncing_state.get_scheduler().profiler_.task_start_running(syncing_state.get_thread_id(),
-                                                               active_task->profiling_node_);
+    thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
+                                                                     thread_state::get().get_active_task()->profiling_node_);
 #endif
     return; // We are already the sole owner of last_task
   } else {
--
libgit2 0.26.0