Finish up DAG profiler.

3202323c · FritzFlorian · 6148bd1e · 3202323c · 3202323c · 3202323c
Commit 3202323c authored May 19, 2020 by FritzFlorian
10 changed files
--- a/app/benchmark_fib/main.cpp
+++ b/app/benchmark_fib/main.cpp
@@ -44,11 +44,15 @@ int main(int argc, char **argv) {
  scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};

  volatile int res;
+  scheduler.get_profiler().disable_memory_measure();
  runner.run_iterations(fib::NUM_ITERATIONS, [&]() {
    scheduler.perform_work([&]() {
      res = pls_fib(fib::INPUT_N);
    });
  }, fib::NUM_WARMUP_ITERATIONS);
+  scheduler.get_profiler().current_run().print_dag(std::cout);
+  scheduler.get_profiler().current_run().print_stats();
+
  runner.commit_results(true);

  return 0;

--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -4,6 +4,7 @@ using namespace pls;

 #include "benchmark_runner.h"
 #include "benchmark_base/matrix.h"
+#include <iostream>

 using namespace comparison_benchmarks::base;

@@ -37,10 +38,14 @@ int main(int argc, char **argv) {

  scheduler scheduler{(unsigned) num_threads, MAX_NUM_TASKS, MAX_STACK_SIZE};

+  scheduler.get_profiler().disable_memory_measure();
  runner.run_iterations(matrix::NUM_ITERATIONS, [&]() {
    scheduler.perform_work([&]() {
      result.multiply(a, b);
    });
  }, matrix::WARMUP_ITERATIONS);
+  scheduler.get_profiler().current_run().print_dag(std::cout);
+  scheduler.get_profiler().current_run().print_stats();
+
  runner.commit_results(true);
 }
--- a/lib/pls/include/pls/internal/profiling/dag_node.h
+++ b/lib/pls/include/pls/internal/profiling/dag_node.h
@@ -6,10 +6,19 @@
 #include <memory>
 #include <list>
 #include <iostream>
+#include <chrono>

 namespace pls::internal::profiling {

 struct dag_node {
+  using clock = std::chrono::steady_clock;
+  using measurement_resolution = std::chrono::nanoseconds;
+  using display_resolution = std::chrono::microseconds;
+  static unsigned long m_to_d(unsigned long duration) {
+    measurement_resolution measurement_duration{duration};
+    return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
+  }
+
  dag_node(unsigned spawning_thread_id) : spawning_thread_id_{spawning_thread_id} {};

  unsigned spawning_thread_id_;
@@ -28,6 +37,7 @@ struct dag_node {
  unsigned dag_max_memory();
  unsigned long dag_total_user_time();
  unsigned long dag_critical_path();
+  unsigned long dag_depth();
 };

 }

--- a/lib/pls/include/pls/internal/profiling/profiler.h
+++ b/lib/pls/include/pls/internal/profiling/profiler.h
@@ -3,7 +3,7 @@
 #define PLS_INTERNAL_PROFILING_PROFILER_H_

 #ifndef PLS_PROFILING_ENABLED
-#define PLS_PROFILING_ENABLED false
+#define PLS_PROFILING_ENABLED true
 #endif

 #include <memory>
@@ -20,29 +20,60 @@ class profiler {
  using clock = std::chrono::steady_clock;
  using measurement_resolution = std::chrono::nanoseconds;
  using display_resolution = std::chrono::microseconds;
+  static unsigned long m_to_d(unsigned long duration) {
+    measurement_resolution measurement_duration{duration};
+    return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
+  }

  struct profiler_run {
    profiler_run(unsigned num_threads) : start_time_{},
                                         end_time_{},
                                         root_node_{std::make_unique<dag_node>(0)},
-                                         per_thread_stats_(num_threads) {}
+                                         per_thread_stats_(num_threads),
+                                         num_threads_{num_threads} {}

+    // Runtime stats
    clock::time_point start_time_;
    clock::time_point end_time_;
    std::unique_ptr<dag_node> root_node_;
    std::vector<thread_stats> per_thread_stats_;

-    static unsigned long m_to_d(unsigned long duration) {
-      measurement_resolution measurement_duration{duration};
-      return std::chrono::duration_cast<display_resolution>(measurement_duration).count();
-    }
+    // Collective stats
+    unsigned num_threads_;
+
+    unsigned long wall_time_;
+
+    unsigned long t_1_;
+    unsigned long t_inf_;
+
+    unsigned long steals_failed_;
+    unsigned long steals_successful_;
+    unsigned long steals_cas_ops_;
+    unsigned long steals_time_;
+
+    unsigned long max_memory_per_stack_;
+    unsigned long spawn_depth_;

-    void print_stats(unsigned num_threads) const;
+    void calculate_stats();
+    void print_stats() const;
+    void print_dag(std::ostream &stream);
  };

 public:
-  profiler(unsigned num_threads) : num_threads_{num_threads},
-                                   profiler_runs_() {}
+  explicit profiler(unsigned num_threads) : num_threads_{num_threads},
+                                            capture_memory_{true},
+                                            capture_time_{true},
+                                            profiler_runs_() {
+    for (unsigned i = 0; i < num_threads_; i++) {
+      stacks_.push_back(stack_allocator_.allocate_stack(STACK_SIZE));
+    }
+  }
+
+  ~profiler() {
+    for (unsigned i = 0; i < num_threads_; i++) {
+      stack_allocator_.free_stack(STACK_SIZE, stacks_[i]);
+    }
+  }

  dag_node *start_profiler_run();
  void stop_profiler_run();
@@ -58,8 +89,6 @@ class profiler {
  void task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size);
  void task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node);

- private:
-  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};
  profiler_run &current_run() {
    return profiler_runs_[profiler_runs_.size() - 1];
  }
@@ -67,8 +96,39 @@ class profiler {
    return current_run().per_thread_stats_[thread_id];
  }

+  void disable_time_measure() {
+    capture_time_ = false;
+  }
+  void enable_time_measure() {
+    capture_time_ = true;
+  }
+  void disable_memory_measure() {
+    capture_memory_ = false;
+  }
+  void enable_memory_measure() {
+    capture_memory_ = true;
+  }
+
+ private:
+  static constexpr char MAGIC_BYTES[] = {'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'};
+
  unsigned num_threads_;
+  bool capture_memory_;
+  bool capture_time_;
  std::vector<profiler_run> profiler_runs_;
+
+  // Stacks to run the profiler code to not influence profiled stacks.
+  static constexpr size_t STACK_SIZE = 4096 * 4;
+  base::mmap_stack_allocator stack_allocator_;
+  std::vector<char *> stacks_;
+
+  template<typename Function>
+  void run_on_stack(unsigned thread_id, const Function function) {
+    context_switcher::enter_context(stacks_[thread_id], STACK_SIZE, [this, thread_id, function](auto cont) {
+      function(thread_stats_for(thread_id));
+      return cont;
+    });
+  }
 };
 }


--- a/lib/pls/include/pls/internal/profiling/thread_stats.h
+++ b/lib/pls/include/pls/internal/profiling/thread_stats.h
@@ -10,26 +10,11 @@
 #include "pls/internal/base/system_details.h"
 #include "pls/internal/base/stack_allocator.h"

+#include "pls/internal/profiling/dag_node.h"
+
 namespace pls::internal::profiling {

 struct PLS_CACHE_ALIGN thread_stats {
-  static constexpr size_t STACK_SIZE = 4096 * 4;
-
-  thread_stats() {
-    stack_ = stack_allocator_.allocate_stack(STACK_SIZE);
-  }
-  ~thread_stats() {
-    stack_allocator_.free_stack(STACK_SIZE, stack_);
-  }
-
-  template<typename Function>
-  void run_on_stack(const Function function) {
-    context_switcher::enter_context(stack_, STACK_SIZE, [function](auto cont) {
-      function();
-      return cont;
-    });
-  }
-
  using clock = std::chrono::steady_clock;

  unsigned long total_steals_{0};
@@ -40,10 +25,7 @@ struct PLS_CACHE_ALIGN thread_stats {
  unsigned long total_time_stealing_{0};

  clock::time_point stealing_start_time_;
-  clock::time_point task_run_start_time;
-
-  base::mmap_stack_allocator stack_allocator_;
-  char *stack_;
+  clock::time_point task_run_start_time_;
 };

 }

--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -99,6 +99,12 @@ class scheduler {
  thread_state &thread_state_for(unsigned int thread_id) { return *thread_states_[thread_id]; }
  task_manager &task_manager_for(unsigned int thread_id) { return *task_managers_[thread_id]; }

+#if PLS_PROFILING_ENABLED
+  profiling::profiler &get_profiler() {
+    return profiler_;
+  }
+#endif
+
 private:
  static context_switcher::continuation slow_return(thread_state &calling_state);


--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -31,7 +31,7 @@ scheduler::scheduler(unsigned int num_threads,
    terminated_{false},
    stack_allocator_{std::make_shared<ALLOC>(std::forward<ALLOC>(stack_allocator))}
 #if PLS_PROFILING_ENABLED
-, profiler_{num_threads}
+    , profiler_{num_threads}
 #endif
 {

@@ -87,18 +87,16 @@ class scheduler::init_function_impl : public init_function {
      PLS_ASSERT(thread_state::get().main_continuation().valid(), "Must return valid continuation from main task.");

 #if PLS_PROFILING_ENABLED
+      thread_state::get().get_scheduler().profiler_.task_finish_stack_measure(thread_state::get().get_thread_id(),
+                                                                              root_task->stack_memory_,
+                                                                              root_task->stack_size_,
+                                                                              root_task->profiling_node_);
      thread_state::get().get_scheduler().profiler_.task_stop_running(thread_state::get().get_thread_id(),
                                                                      root_task->profiling_node_);
 #endif
+
      return std::move(thread_state::get().main_continuation());
    });
-
-#if PLS_PROFILING_ENABLED
-    thread_state::get().get_scheduler().profiler_.task_finish_stack_measure(thread_state::get().get_thread_id(),
-                                                                            root_task->stack_memory_,
-                                                                            root_task->stack_size_,
-                                                                            root_task->profiling_node_);
-#endif
  }
 private:
  F &function_;
@@ -141,7 +139,6 @@ void scheduler::spawn(Function &&lambda) {
  base_task *spawned_task = last_task->next_;

 #if PLS_PROFILING_ENABLED
-  // Memory and DAG nodes
  spawning_state.get_scheduler().profiler_.task_prepare_stack_measure(spawning_state.get_thread_id(),
                                                                      spawned_task->stack_memory_,
                                                                      spawned_task->stack_size_);
@@ -189,24 +186,32 @@ void scheduler::spawn(Function &&lambda) {

      syncing_state.set_active_task(last_task);
 #if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
+                                                                        spawned_task->stack_memory_,
+                                                                        spawned_task->stack_size_,
+                                                                        spawned_task->profiling_node_);
      syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
                                                                spawned_task->profiling_node_);
 #endif
      return std::move(last_task->continuation_);
    } else {
      // Slow path, the last task was stolen. This path is common to sync() events.
-      auto continuation = slow_return(syncing_state);
 #if PLS_PROFILING_ENABLED
+      syncing_state.get_scheduler().profiler_.task_finish_stack_measure(syncing_state.get_thread_id(),
+                                                                        spawned_task->stack_memory_,
+                                                                        spawned_task->stack_size_,
+                                                                        spawned_task->profiling_node_);
      syncing_state.get_scheduler().profiler_.task_stop_running(syncing_state.get_thread_id(),
                                                                spawned_task->profiling_node_);
 #endif
+      auto continuation = slow_return(syncing_state);
      return continuation;
    }
  });

 #if PLS_PROFILING_ENABLED
  thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
-                                                                   thread_state::get().get_active_task()->profiling_node_);
+                                                                   last_task->profiling_node_);
 #endif

  if (continuation.valid()) {

--- a/lib/pls/src/internal/profiling/dag_node.cpp
+++ b/lib/pls/src/internal/profiling/dag_node.cpp
@@ -20,7 +20,11 @@ void dag_node::dag_compact() {
 }

 void dag_node::dag_print(std::ostream &stream, unsigned rank) {
-  stream << node_print_id() << " [label=" << spawning_thread_id_ << ", rank=" << rank << "];" << std::endl;
+  stream << node_print_id()
+         << " [label=\"" << spawning_thread_id_ << "\n"
+         << max_memory_ << " bytes\n"
+         << m_to_d(total_runtime_) << " us\""
+         << " ,rank=" << rank << "];" << std::endl;
  for (auto &child : child_nodes_) {
    child.dag_print(stream, rank + 1);
    stream << node_print_id() << " -> " << child.node_print_id() << ";" << std::endl;
@@ -68,4 +72,18 @@ unsigned long dag_node::dag_critical_path() {
  return critical_path;
 }

+unsigned long dag_node::dag_depth() {
+  unsigned long deepest_at_level = 1;
+  if (next_node_) {
+    deepest_at_level = std::max(deepest_at_level, next_node_->dag_depth());
+  }
+
+  unsigned long deepest_child = 0;
+  for (auto &child : child_nodes_) {
+    deepest_child = std::max(deepest_child, child.dag_depth());
+  }
+
+  return std::max(deepest_child + 1, deepest_at_level);
+}
+
 }
--- a/lib/pls/src/internal/profiling/profiler.cpp
+++ b/lib/pls/src/internal/profiling/profiler.cpp
 #include "pls/internal/profiling/profiler.h"

 namespace pls::internal::profiling {
-void profiler::profiler_run::print_stats(unsigned num_threads) const {
+
+void profiler::profiler_run::calculate_stats() {
  root_node_->dag_compact();

-  auto run_duration = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
-  std::cout << "===========================" << std::endl;
-  std::cout << "WALL TIME: " << m_to_d(run_duration) << std::endl;
-  unsigned long total_user_time = root_node_->dag_total_user_time();
-  std::cout << "USER TIME: " << m_to_d(total_user_time) << std::endl;
-  unsigned long critical_path_time = root_node_->dag_critical_path();
-  std::cout << "CRITICAL TIME: " << m_to_d(critical_path_time) << std::endl;
-  std::cout << "MAX SPEEDUP:" << (double) total_user_time / (double) critical_path_time << std::endl;
-
-  unsigned long total_failed_steals = 0;
-  unsigned long total_successful_steals = 0;
-  unsigned long total_steal_time = 0;
+  wall_time_ = std::chrono::duration_cast<measurement_resolution>(end_time_ - start_time_).count();
+
+  t_1_ = root_node_->dag_total_user_time();
+  t_inf_ = root_node_->dag_critical_path();
+
+  steals_failed_ = 0;
+  steals_successful_ = 0;
+  steals_cas_ops_ = 0;
+  steals_time_ = 0;
  for (auto &thread_stats : per_thread_stats_) {
-    total_failed_steals += thread_stats.failed_steals_;
-    total_successful_steals += thread_stats.successful_steals_;
-    total_steal_time += thread_stats.total_time_stealing_;
+    steals_failed_ += thread_stats.failed_steals_;
+    steals_successful_ += thread_stats.successful_steals_;
+    steals_cas_ops_ += thread_stats.steal_cas_ops_;
+    steals_time_ += thread_stats.total_time_stealing_;
  }
-  std::cout << "STEALS: (Time " << m_to_d(total_steal_time)
-            << ", Total: " << (total_successful_steals + total_failed_steals)
-            << ", Success: " << total_successful_steals
-            << ", Failed: " << total_failed_steals << ")" << std::endl;

-  unsigned long total_measured = total_steal_time + total_user_time;
-  unsigned long total_wall = run_duration * num_threads;
-  std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;
+  max_memory_per_stack_ = root_node_->dag_max_memory();
+  spawn_depth_ = root_node_->dag_depth();
+}

-  std::cout << "MEMORY: " << root_node_->dag_max_memory() << " bytes per stack" << std::endl;
+void profiler::profiler_run::print_stats() const {

-  // TODO: re-add graph printing
+  std::cout << "===========================" << std::endl;
+  std::cout << "WALL TIME: " << m_to_d(wall_time_) << std::endl;
+  std::cout << "USER TIME: " << m_to_d(t_1_) << std::endl;
+  std::cout << "CRITICAL TIME: " << m_to_d(t_inf_) << std::endl;
+  std::cout << "MAX SPEEDUP:" << (double) t_1_ / (double) t_inf_ << std::endl;
+
+  std::cout << "STEALS: (Time " << m_to_d(steals_time_)
+            << ", Total: " << (steals_failed_ + steals_successful_)
+            << ", Success: " << steals_successful_
+            << ", Failed: " << steals_failed_
+            << ", CAS: " << steals_cas_ops_ << ")" << std::endl;
+
+  unsigned long total_measured = steals_time_ + t_1_;
+  unsigned long total_wall = wall_time_ * num_threads_;
+  std::cout << "Wall Time vs. Measured: " << 100.0 * total_measured / total_wall << std::endl;

-//      std::cout << "digraph {" << std::endl;
-//      root_node_->dag_print(std::cout, 0);
-//      std::cout << "}" << std::endl;
+  std::cout << "MEMORY: " << max_memory_per_stack_ << " bytes per stack" << std::endl;
+  std::cout << "DEPTH:" << spawn_depth_ << std::endl;
+}
+
+void profiler::profiler_run::print_dag(std::ostream &stream) {
+  stream << "digraph {" << std::endl;
+  root_node_->dag_print(std::cout, 0);
+  stream << "}" << std::endl;
 }

 dag_node *profiler::start_profiler_run() {
@@ -47,56 +61,52 @@ dag_node *profiler::start_profiler_run() {

 void profiler::stop_profiler_run() {
  current_run().end_time_ = clock::now();
-  current_run().print_stats(num_threads_);
+  current_run().root_node_->dag_compact();
+  current_run().calculate_stats();
 }

 void profiler::stealing_start(unsigned thread_id) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    thread_stats.stealing_start_time_ = clock::now();
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
    thread_stats.total_steals_++;
+
+    if (capture_time_) {
+      thread_stats.stealing_start_time_ = clock::now();
+    }
  });
 }

 void profiler::stealing_end(unsigned thread_id, bool success) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
    thread_stats.failed_steals_ += !success;
    thread_stats.successful_steals_ += success;

-    auto end_time = clock::now();
-    auto
-        steal_duration =
-        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
-    thread_stats.total_time_stealing_ += steal_duration;
+    if (capture_time_) {
+      auto end_time = clock::now();
+      auto
+          steal_duration =
+          std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.stealing_start_time_).count();
+      thread_stats.total_time_stealing_ += steal_duration;
+    }
  });
 }

 void profiler::stealing_cas_op(unsigned thread_id) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &thread_stats) {
    thread_stats.steal_cas_ops_++;
  });
 }

 dag_node *profiler::task_spawn_child(unsigned thread_id, dag_node *parent) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
  dag_node *result;
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &) {
    result = &parent->child_nodes_.emplace_back(thread_id);
  });
  return result;
 }

 dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
  dag_node *result;
-  thread_stats.run_on_stack([&] {
+  run_on_stack(thread_id, [&](thread_stats &) {
    synced->next_node_ = std::make_unique<dag_node>(thread_id);
    result = synced->next_node_.get();
  });
@@ -104,55 +114,55 @@ dag_node *profiler::task_sync(unsigned thread_id, dag_node *synced) {
 }

 void profiler::task_start_running(unsigned thread_id, dag_node *in_node) {
-  (void) in_node; // unused, keep for 'symmetric' API
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    thread_stats.task_run_start_time = clock::now();
-  });
+  if (capture_time_) {
+    (void) in_node; // unused, keep for 'symmetric' API
+    run_on_stack(thread_id, [&](thread_stats &thread_stats) {
+      thread_stats.task_run_start_time_ = clock::now();
+    });
+  }
 }

 void profiler::task_stop_running(unsigned thread_id, dag_node *in_node) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    auto end_time = clock::now();
-    auto user_code_duration =
-        std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time).count();
-    in_node->total_runtime_ += user_code_duration;
-  });
+  if (capture_time_) {
+    run_on_stack(thread_id, [&](thread_stats &thread_stats) {
+      auto end_time = clock::now();
+      auto user_code_duration =
+          std::chrono::duration_cast<measurement_resolution>(end_time - thread_stats.task_run_start_time_).count();
+      in_node->total_runtime_ += user_code_duration;
+    });
+  }
 }

 void profiler::task_prepare_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
-      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
-        stack_memory[i + j] = MAGIC_BYTES[j];
+  if (capture_memory_) {
+    run_on_stack(thread_id, [&](thread_stats &) {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          stack_memory[i + j] = MAGIC_BYTES[j];
+        }
      }
-    }
-  });
+    });
+  }
 }

 void profiler::task_finish_stack_measure(unsigned thread_id, char *stack_memory, size_t stack_size, dag_node *in_node) {
-  auto &thread_stats = thread_stats_for(thread_id);
-
-  thread_stats.run_on_stack([&] {
-    for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
-      bool section_clean = true;
-      for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
-        if (stack_memory[i + j] != MAGIC_BYTES[j]) {
-          section_clean = false;
+  if (capture_memory_) {
+    run_on_stack(thread_id, [&](thread_stats &) {
+      for (size_t i = 0; i < stack_size - sizeof(MAGIC_BYTES); i += sizeof(MAGIC_BYTES)) {
+        bool section_clean = true;
+        for (size_t j = 0; j < sizeof(MAGIC_BYTES); j++) {
+          if (stack_memory[i + j] != MAGIC_BYTES[j]) {
+            section_clean = false;
+          }
        }
-      }

-      in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
-      if (!section_clean) {
-        return;
+        in_node->max_memory_ = stack_size - i + sizeof(MAGIC_BYTES);
+        if (!section_clean) {
+          return;
+        }
      }
-    }
-  });
+    });
+  }
 }

 }
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -124,8 +124,8 @@ void scheduler::sync() {

  if (active_task->is_synchronized_) {
 #if PLS_PROFILING_ENABLED
-    syncing_state.get_scheduler().profiler_.task_start_running(syncing_state.get_thread_id(),
-                                                               active_task->profiling_node_);
+    thread_state::get().get_scheduler().profiler_.task_start_running(thread_state::get().get_thread_id(),
+                                                                     thread_state::get().get_active_task()->profiling_node_);
 #endif
    return; // We are already the sole owner of last_task
  } else {