From 2f539691980e478ab6fd2bd451d5abcddac3555e Mon Sep 17 00:00:00 2001
From: FritzFlorian
Date: Thu, 5 Dec 2019 15:32:20 +0100
Subject: [PATCH] Minor changes for profiling and more alignment.

The idea is to exclude as many sources as possible that could lead to
issues with contention and cache misses. After some experimentation, we
think that hyperthreading simply does not work very well with our kind
of workload. In the future we might test on other hardware.
---
 PERFORMANCE-v1.md                                                    | 384 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 PERFORMANCE-v2.md                                                    |  22 +++
 PERFORMANCE.md                                                       | 384 --------------------------------------------------------------------
 app/benchmark_fft/main.cpp                                           |  11 +-
 app/benchmark_matrix/main.cpp                                        |  46 ++++--
 app/benchmark_unbalanced/main.cpp                                    |  39 +++--
 app/playground/main.cpp                                              |  18 +-
 lib/pls/include/pls/algorithms/for_each.h                            |  23 ++-
 lib/pls/include/pls/algorithms/for_each_impl.h                       |  71 +++++-----
 lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h |   4 +-
 lib/pls/include/pls/internal/helpers/profiler.h                      |  12 +-
 lib/pls/include/pls/internal/scheduling/cont.h                       |   3 +
 lib/pls/include/pls/internal/scheduling/memory_block.h               |  13 +-
 lib/pls/include/pls/internal/scheduling/scheduler_impl.h             |  12 +-
 lib/pls/include/pls/internal/scheduling/scheduler_memory.h           |  35 +++--
 lib/pls/include/pls/internal/scheduling/task.h                       |   3 +
 lib/pls/include/pls/internal/scheduling/task_manager.h               |   4 -
 lib/pls/include/pls/internal/scheduling/thread_state_static.h        |  10 +-
 lib/pls/src/internal/scheduling/scheduler.cpp                        |  14 +-
 media/e34ea267_fft_execution_pattern.png                             | Bin 0 -> 87487 bytes
 media/e34ea267_thread_state_for.png                                  | Bin 0 -> 95831 bytes
 21 files changed, 599 insertions(+), 509 deletions(-)
 create mode 100644 PERFORMANCE-v1.md
 create mode 100644 PERFORMANCE-v2.md
 delete mode 100644 PERFORMANCE.md
 create mode 100644 media/e34ea267_fft_execution_pattern.png
 create mode 100644 media/e34ea267_thread_state_for.png

diff --git a/PERFORMANCE-v1.md b/PERFORMANCE-v1.md
new file mode 100644
index 0000000..7112439
--- /dev/null
+++ b/PERFORMANCE-v1.md
@@ -0,0 +1,384 @@
+# Notes on performance measures during development
+
+#### Commit 52fcb51f - Add basic random stealing
+
+Slight improvement, needs further measurement after removing more important bottlenecks.
+Below are three individual measurements of the difference.
+Overall the trend (sum of all numbers / last number)
+goes down (98.7 %, 96.9 % and 100.6 %), but with one measurement
+above 100 % we think the improvements are minor.
+
+| | #1 | #2 | #3 | #4 | #5 | #6 | #7 | #8 | sum |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+old | 1659.01 us| 967.19 us| 830.08 us| 682.69 us| 737.71 us| 747.92 us| 749.37 us| 829.75 us| 7203.73 us
+new | 1676.06 us| 981.56 us| 814.71 us| 698.72 us| 680.87 us| 737.68 us| 756.91 us| 764.71 us| 7111.22 us
+change | 101.03 %| 101.49 %| 98.15 %| 102.35 %| 92.30 %| 98.63 %| 101.01 %| 92.16 %| 98.72 %
+
+| | #1 | #2 | #3 | #4 | #5 | #6 | #7 | #8 | sum |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+old | 1648.65 us| 973.33 us| 820.18 us| 678.80 us| 746.21 us| 767.63 us| 747.17 us| 1025.35 us| 7407.32 us
+new | 1655.09 us| 964.99 us| 807.57 us| 731.34 us| 747.47 us| 714.71 us| 794.35 us| 760.28 us| 7175.80 us
+change | 100.39 %| 99.14 %| 98.46 %| 107.74 %| 100.17 %| 93.11 %| 106.31 %| 74.15 %| 96.87 %
+
+| | #1 | #2 | #3 | #4 | #5 | #6 | #7 | #8 | sum |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+old | 1654.26 us| 969.12 us| 832.13 us| 680.69 us| 718.70 us| 750.80 us| 744.12 us| 775.24 us| 7125.07 us
+new | 1637.04 us| 978.09 us| 799.93 us| 709.33 us| 746.42 us| 684.87 us| 822.30 us| 787.61 us| 7165.59 us
+change | 98.96 %| 100.93 %| 96.13 %| 104.21 %| 103.86 %| 91.22 %| 110.51 %| 101.60 %| 100.57 %
+
+#### Commit 3535cbd8 - Cache Align scheduler_memory
+
+A big improvement of about 6 % in our test. This seems like little,
+but 6 % from the scheduler is a lot, as the 'main work' is the tasks
+themselves, not the scheduler.
+
+This change unsurprisingly yields the biggest improvement yet.
+
+#### Commit b9bb90a4 - Try to figure out the 'high thread bottleneck'
+
+We are currently seeing good performance on low core counts
+(up to 1/2 of the machine's cores), but after that performance
+plummets:
+
+Banana-Pi Best-Case:
+
+
+
+Banana-Pi Average-Case:
+
+
+
+Laptop Best-Case:
+
+
+
+Laptop Average-Case:
+
+
+
+
+As we can see, on average the performance of PLS gets
+way worse than TBB and EMBB after 4 cores. We suspect this is due
+to contention, but could not resolve it with any combination
+of `tas_spinlock` vs `ttas_spinlock` and `lock` vs `try_lock`.
+
+This issue clearly needs further investigation.
+
+### Commit aa27064 - Performance with ttas spinlocks (and 'full blocking' top level)
+
+
+
+### Commit d16ad3e - Performance with rw-lock and backoff
+
+
+
+### Commit 18b2d744 - Performance with lock-free deque
+
+After much tinkering we still have performance problems with higher
+thread counts in the FFT benchmark. Upwards of 4/5 threads the
+performance gains start to saturate (before removing the top level
+locks we even saw a slight drop in performance).
+
+Currently the FFT benchmark shows the following results (average):
+
+
+
+We want to positively note that the overall trend of 'performance drops'
+at the hyperthreading mark is not really bad anymore; it rather
+seems similar to EMBB now (with backoff + lock-free deque + top level
+readers-writer lock). This comes partly because the spike at 4 threads
+is lower (less performance at 4 threads). We also see better times
+on the multiprogrammed system with the lock-free deque.
+
+This is discouraging after many tests. To see where the overhead lies
+we also implemented the unbalanced tree search benchmark,
+resulting in the following, surprisingly good, results (average):
+
+
+
+The main difference between the two benchmarks is that the second
+one has more work and the work is relatively independent.
+Additionally, the first one uses our high level API (parallel invoke),
+while the second one uses our low level API.
+It is worth investigating whether the high level API or the structure
+of the memory accesses in FFT is the problem.
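+
+For reference, a minimal sketch of how such a high level parallel invoke can
+map onto the low level spawn primitives (the wrapper name `invoke_parallel`
+is illustrative; `spawn_child` and `spawn_child_and_wait` are the primitives
+that appear in the commits below):
+
+```c++
+#include <utility>
+
+// Sketch: the high level API wraps two low level spawns. The second
+// function is published so other workers can steal it, the first one is
+// executed right away; the call returns once both halves are done.
+template <typename F1, typename F2>
+void invoke_parallel(F1 &&function_1, F2 &&function_2) {
+  scheduler::spawn_child(std::forward<F2>(function_2));
+  scheduler::spawn_child_and_wait(std::forward<F1>(function_1));
+}
+```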
+
+### Commit cf056856 - Remove two-level scheduler
+
+In this test we replace the two-level scheduler with ONLY fork_join
+tasks. This removes the top level steal overhead and performs only
+internal stealing. For this we set the fork_join task as the only
+possible task type, removed the top level rw-lock and the digging
+down to our level, and solely use internal stealing.
+
+Average results FFT:
+
+
+
+Average results Unbalanced:
+
+
+
+There seems to be only a minor performance difference between the two,
+suggesting that our two-level approach is not the part causing our
+weaker performance.
+
+### Commit afd0331b - Some notes on scaling problems
+
+After tweaking individual values and parameters we still cannot find
+the main cause of our slowdown on multiple processors.
+We also use Intel's VTune Amplifier to measure performance on our runs
+and find that we always spend way too much time 'waiting for work',
+e.g. in the backoff mechanism when enabled or in the locks for stealing
+work when backoff is disabled. This leads us to believe that our problems
+might be connected to some issue with work distribution in the FFT case,
+as the unbalanced tree search (with a lot of 'local' work) performs well.
+
+To get more data we added benchmarks on matrix multiplication implemented
+in two fashions: once with a 'native' array stealing task and once with
+a fork-join task. Both implementations use the same minimum array
+sub-size of 4 elements, so we can hopefully see if they have any
+performance differences.
+
+Best case fork-join:
+
+
+
+Average case fork-join:
+
+
+
+Best case Native:
+
+
+
+Average case Native:
+
+
+
+What we find very interesting is that the best case times of our
+pls library are very fast (as good as TBB), but the average times
+drop badly. We currently do not know why this is the case.
+
+### Commit afd0331b - Intel VTune Amplifier
+
+We did several measurements with Intel's VTune Amplifier profiling
+tool. The main thing we notice is that the cycles per instruction
+for our useful work blocks increase, thus requiring more CPU time
+for the actual useful work.
+
+We also measured an implementation using TBB and found no significant
+difference, e.g. TBB also has a higher CPI with 8 threads.
+Our conclusion after this long hunt for performance is that we
+might just be bound by some general performance issues with our code.
+The next step will therefore be to read the other frameworks' code and
+ours carefully, trying to find potential issues.
+
+### Commit 116cf4af - Removing Exponential Backoff
+
+In the steal loop we first had a backoff mechanism as often seen in
+locks (spin with a relaxed CPU, then sleep/yield after too many backoffs).
+The rationale behind this is to relax the memory bus by not busily
+working on atomic variables. We introduced it first out of fear that
+keeping the CPU busy with spinning would degrade the performance of the
+other working threads. However, the above examination with Intel VTune
+showed that this seems not to be the main problem of our implementation
+(TBB shows the same CPI increases with more threads, so our implementation
+seems fine in this regard).
+
+To further reduce elements that could cause performance problems, we
+therefore decided to perform one more measurement without this backoff.
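+
+For context, the removed mechanism followed the usual exponential backoff
+pattern, roughly like this sketch (illustrative only, not the exact code
+that was removed):
+
+```c++
+#include <thread>
+
+// Sketch: busy-spin with a growing iteration count first, then yield to
+// the OS once too many consecutive steal attempts have failed.
+void backoff_after_failed_steal(unsigned &failed_steals) {
+  constexpr unsigned SPIN_LIMIT = 10;
+  if (++failed_steals < SPIN_LIMIT) {
+    for (volatile unsigned i = 0; i < (1u << failed_steals); i++) {
+      // a CPU relax/'pause' instruction would go here
+    }
+  } else {
+    std::this_thread::yield(); // hand the core to a thread with actual work
+  }
+}
+```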
+
+#### Results of FFT
+
+The first measurement is on the FFT. Here we tested two variants:
+one with a 'yield/sleep' statement after a worker thread has failed
+to steal any work on its first pass over all other threads, and
+one without this sleep. The rationale behind the sleep is that
+it relaxes the CPU (it is also found in EMBB).
+
+Average with sleep:
+
+
+
+
+Average without sleep:
+
+
+
+
+We clearly observe that the version without a sleep statement
+is faster, and we will thus exclude this statement in future
+experiments/measurements. This also makes sense, as our
+steal loop can fail even though there potentially is work
+(because of our lock-free deque implementation).
+
+#### Results Matrix
+
+We re-ran our benchmarks on the fork-join and native matrix
+multiplication implementations to see how those change without
+the backoff. We expect good results, as the matrix multiplication
+mostly has enough work to keep all threads busy, so workers
+spend less time spinning in the steal loop.
+
+Average Fork-Join Matrix:
+
+
+
+
+Average Native Matrix:
+
+
+
+The results are far better than the last ones and indicate that
+removing the backoff can drastically improve performance.
+
+#### Conclusion
+
+We will exclude the backoff mechanism from further tests, as this
+seems to generally improve performance (or at least, in the case
+of FFT, not harm it).
+
+We also want to note that all these measurements are not very
+controlled/scientific, but simply run on our notebook for
+fast iteration over different potential issues with our scheduler.
+
+
+### Commit 116cf4af - VTune Amplifier and MRSW top level lock
+
+When looking at why our code works quite well on problems with
+mostly busy workers and not so well on code with spinning/waiting
+workers (like in the FFT), we take a closer look at the FFT and
+matrix multiplication in VTune.
+
+FFT:
+
+
+
+Matrix:
+
+
+
+The sections highlighted in red represent time spent
+spinning in the work-stealing loop.
+We can see that as long as our workers are mostly busy/find work
+in the stealing loop, the overhead spent on spinning is minimal.
+We can also see that in the FFT considerable amounts of time are
+spent spinning.
+
+A general observation is the high CPI rate of our spinning code.
+This makes sense, as we are currently spinning on locks that share
+atomic variables in order to work, thus leading to cache misses.
+
+### Commit 116cf4af - 2D Heat Diffusion
+
+As a last test of our current performance we implemented the
+2D heat diffusion benchmark using our framework (using a fork-join
+based parallel_for, 512 heat array size):
+
+
+
+We observe solid performance from our implementation.
+(Again, not a very scientific test environment, but good enough for
+our general direction.)
+
+### Commit 3bdaba42 - Move to pure fork-join tasks (remove two level)
+
+We moved away from our two-level scheduler approach towards a
+pure fork-join task model (in order to remove any locks in the
+code more easily and to make further tests simpler/more focused
+on one specific aspect).
+These are the measurements made after the change
+(without any performance optimizations done):
+
+FFT Average:
+
+
+
+Heat Diffusion Average:
+
+
+
+Matrix Multiplication Average:
+
+
+
+Unbalanced Tree Search Average:
+
+
+
+
+We note that in heat diffusion, matrix multiplication and unbalanced
+tree search - all three benchmarks with mostly enough work available at
+all times - our implementation performs head to head with Intel's
+TBB. Only the FFT benchmark is a major problem for our library.
+We notice a MAJOR drop in performance exactly at the hyperthreading
+mark, indicating problems with resources shared between the spinning
+threads (threads without any actual work) and the threads actually
+performing work. Most likely some resource on the same cache line
+is hindering the working threads, but we cannot really
+figure out which one it is.
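+
+To illustrate the kind of problem we suspect, consider this generic
+false-sharing sketch (not a confirmed diagnosis of a specific variable in
+our code; the cache line size of 64 bytes is an assumption for typical
+x86 hardware):
+
+```c++
+#include <atomic>
+#include <cstddef>
+
+constexpr std::size_t CACHE_LINE_SIZE = 64;
+
+// Bad: a counter hammered by spinning threads shares a cache line with
+// data the working thread reads, so every update invalidates that line.
+struct shared_line {
+  std::atomic<int> finishing_counter;
+  int working_data[8];
+};
+
+// Better: alignment/padding gives each hot member its own cache line.
+struct separate_lines {
+  alignas(CACHE_LINE_SIZE) std::atomic<int> finishing_counter;
+  alignas(CACHE_LINE_SIZE) int working_data[8];
+};
+```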
+
+### Commit be2cdbfe - Locking Deque
+
+Switching to a locking deque has not improved performance (it even
+slightly hurt it); we therefore think that the deque itself is not the
+part slowing down our execution.
+
+### Commit 5044f0a1 - Performance Bottleneck in FFT FIXED
+
+By moving from directly calling one of the parallel invocations
+
+```c++
+scheduler::spawn_child(sub_task_2);
+function1(); // Execute first function 'inline' without spawning a sub_task object
+```
+
+to spawning two tasks
+
+```c++
+scheduler::spawn_child(sub_task_2);
+scheduler::spawn_child(sub_task_1);
+```
+
+we were able to fix the bad performance of our framework in the
+FFT benchmark (where there is a lot of spinning/idling of some
+worker threads).
+
+We think this is due to some sort of cache misses/bus contention
+on the finishing counters. This would make sense, as the drop
+at the hyperthreading mark indicates problems with this part of the
+CPU pipeline (although it did not show clearly in our profiling runs).
+We will now try to find the exact spot where the problem originates and
+fix the source rather than 'circumventing' it with these extra tasks.
+(This, again, should hopefully even boost the performance of all other
+workloads, as contention on the bus/cache is always bad.)
+
+
+After some research we think that the issue is down to many threads
+referencing the same atomic reference counter. We think so because
+even cache aligning the shared reference count does not fix the issue
+when using the direct function call. Also, forcing a new method call
+(going down one function call in the call stack) does not solve the
+issue (thus making sure that it is not related to some caching issue
+in the call itself).
+
+In conclusion, there seems to be a hyperthreading issue with this
+shared reference count. We keep this in mind in case we eventually get
+tasks with changing data members (this problem could reappear there,
+as the ref_count would then actually be in the same memory region as our
+'user variables'). For now we leave the code as it is.
+
+
+FFT Average with new call method:
+
+
+
+The performance of our new call method looks strikingly similar
+to TBB, with a slight, constant performance drop behind it.
+This makes sense, as the basic principles (a lock-free, classic
+work-stealing deque and the parallel call structure) are nearly the same.
+
+We will see if minor optimizations can close even this last gap.
+Overall the performance at this point is good enough to move on
+to implementing more functionality and to running tests on different
+queues/stealing tactics etc.
diff --git a/PERFORMANCE-v2.md b/PERFORMANCE-v2.md
new file mode 100644
index 0000000..b8a27df
--- /dev/null
+++ b/PERFORMANCE-v2.md
@@ -0,0 +1,22 @@
+# Notes on performance measures during development
+
+#### Commit e34ea267 - 05.12.2019 - First Version of new Algorithm - Scaling Problems
+
+The first version of our memory-trading work stealing algorithm works. It still shows scaling issues above
+the hyperthreading mark, very similar to what we have seen in version 1. This indicates some sort of
+contention between the threads when running the FFT algorithm.
+
+Analyzing the current version we find an issue with the frequent call to `thread_state_for(id)` in
+the stealing loop.
+
+![](./media/e34ea267_thread_state_for.png)
+
+It is obvious that the method takes a noticeable amount of runtime, as FFT has a structure that tends to only
+work on the continuations at the end of the computation (the critical path of FFT can only be executed
+after most parallel tasks are done).
+
+![](./media/e34ea267_fft_execution_pattern.png)
+
+What we can see here is the long tail of continuations running at the end of the computation. During
+this time the non-working threads constantly steal, and thus constantly invoke the virtual
+`thread_state_for(id)` method, potentially hindering the other threads from doing their work properly.
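+
+The `scheduler_memory` change in this patch addresses exactly this: the base
+class now resolves the lookup through a plain pointer array that the concrete
+(static/heap) memory implementations fill in during construction, so the hot
+path in the steal loop no longer needs a virtual call. In essence (simplified
+from the diff further down):
+
+```c++
+#include <cstddef>
+
+class thread_state;
+
+class scheduler_memory {
+ protected:
+  // Filled in by the static/heap subclasses during construction.
+  thread_state **thread_states_array_{nullptr};
+
+ public:
+  virtual std::size_t max_threads() const = 0;  // cold path, stays virtual
+
+  // Hot path in the steal loop: now a plain array lookup.
+  thread_state &thread_state_for(std::size_t id) {
+    return *thread_states_array_[id];
+  }
+};
+```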
diff --git a/PERFORMANCE.md b/PERFORMANCE.md
deleted file mode 100644
index 7112439..0000000
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
index e46370d..68a587c 100644
--- a/app/benchmark_fft/main.cpp
+++ b/app/benchmark_fft/main.cpp
@@ -1,6 +1,7 @@
 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/internal/helpers/profiler.h"
 using namespace pls::internal::scheduling;
 
 #include
@@ -9,7 +10,7 @@ using namespace pls::internal::scheduling;
 #include
 
 static constexpr int CUTOFF = 16;
-static constexpr int INPUT_SIZE = 8192;
+static constexpr int INPUT_SIZE = 16384;
 typedef std::vector<std::complex<double>> complex_vector;
 
 void divide(complex_vector::iterator data, int n) {
@@ -90,8 +91,8 @@ complex_vector prepare_input(int input_size) {
   return data;
 }
 
-static constexpr int NUM_ITERATIONS = 1000;
-constexpr size_t NUM_THREADS = 5;
+static constexpr int NUM_ITERATIONS = 500;
+constexpr size_t NUM_THREADS = 2;
 
 constexpr size_t NUM_TASKS = 128;
 
@@ -99,6 +100,7 @@ constexpr size_t NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 512;
 
 int main() {
+  PROFILE_ENABLE;
   complex_vector initial_input = prepare_input(INPUT_SIZE);
   static_scheduler_memory<NUM_THREADS, NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE> static_scheduler_memory;
   scheduler scheduler{static_scheduler_memory, NUM_THREADS};
 
   auto start = std::chrono::steady_clock::now();
   for (int i = 0; i < NUM_ITERATIONS; i++) {
     scheduler.perform_work([&]() {
       PROFILE_MAIN_THREAD;
       complex_vector data = initial_input;
       return scheduler::par([&]() {
         fft(data.begin(), data.size());
         return parallel_result<int>{0};
       }, []() {
         return parallel_result<int>{0};
       }).then([](int, int) {
         return parallel_result<int>{0};
       });
     });
+    PROFILE_LOCK("DONE");
   }
   auto end = std::chrono::steady_clock::now();
   std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << std::endl;
+  PROFILE_SAVE("test_profile.prof");
 
   start = std::chrono::steady_clock::now();
   for (int i = 0; i < NUM_ITERATIONS; i++) {
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index ed6d874..9a14a57 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -1,6 +1,8 @@
-#include
-#include
-#include
+#include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/parallel_result.h"
+#include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/algorithms/for_each.h"
+using namespace pls::internal::scheduling;
 
 #include
@@ -15,8 +17,8 @@ class matrix {
     std::fill(&data[0][0], &data[0][0] + SIZE * SIZE, i);
   }
 
-  void multiply(const matrix &a, const matrix &b) {
-    pls::for_each_range(0, SIZE, [&](int i) {
+  parallel_result<int> multiply(const matrix &a, const matrix &b) {
+    return pls::algorithm::for_each_range(0, SIZE, [&](int i) {
       this->multiply_column(i, a, b);
     });
   }
@@ -44,6 +46,14 @@ void fill_with_data(matrix &a, matrix
   }
 }
 
+static constexpr int NUM_ITERATIONS = 1000;
+constexpr size_t NUM_THREADS = 3;
+
+constexpr size_t NUM_TASKS = 128;
+
+constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_CONT_SIZE = 512;
+
 int main() {
   PROFILE_ENABLE
   matrix a;
@@ -51,11 +61,29 @@ int main() {
   matrix result;
   fill_with_data(a, b);
 
-  pls::internal::helpers::run_mini_benchmark([&] {
-    result.multiply(a, b);
-  }, 8, 1000);
+  static_scheduler_memory<NUM_THREADS, NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE> static_scheduler_memory;
 
-  PROFILE_SAVE("test_profile.prof")
+  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+
+  auto start = std::chrono::steady_clock::now();
+  for (int i = 0; i < NUM_ITERATIONS; i++) {
+    scheduler.perform_work([&]() {
+      PROFILE_MAIN_THREAD;
+      return scheduler::par([&]() {
+        return result.multiply(a, b);
+      }, []() {
+        return parallel_result<int>{0};
+      }).then([](int, int) {
+        return parallel_result<int>{0};
+      });
+    });
+  }
+  auto end = std::chrono::steady_clock::now();
+  std::cout << "Framework: " << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
+            << std::endl;
 }
 
 //int main() {
diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp
index 75f5daa..2753b8c 100644
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -1,6 +1,7 @@
-#include
-#include
-#include
+#include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/parallel_result.h"
+#include "pls/internal/scheduling/scheduler_memory.h"
+using namespace pls::internal::scheduling;
 
 #include "node.h"
 
@@ -11,7 +12,7 @@ const int NORMAL_CHILDREN = 8;
 
 const int NUM_NODES = 71069;
 
-int count_child_nodes(uts::node &node) {
+parallel_result<int> count_child_nodes(uts::node &node) {
   int child_count = 1;
   std::vector<uts::node> children = node.spawn_child_nodes();
 
@@ -36,7 +37,7 @@ int count_child_nodes(uts::node &node) {
   return child_count;
 }
 
-int unbalanced_tree_search(int seed, int root_children, double q, int normal_children) {
+parallel_result<int> unbalanced_tree_search(int seed, int root_children, double q, int normal_children) {
   int result;
 
   auto lambda = [&] {
@@ -50,11 +51,33 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
   return result;
 }
 
+constexpr size_t NUM_THREADS = 5;
+
+constexpr size_t NUM_TASKS = 128;
+
+constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_CONT_SIZE = 512;
+
+volatile int result;
 int main() {
   PROFILE_ENABLE
-  pls::internal::helpers::run_mini_benchmark([&] {
-    unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-  }, 8, 2000);
+  static_scheduler_memory<NUM_THREADS, NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE> static_scheduler_memory;
+
+  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+
+  scheduler.perform_work([&]() {
+    return scheduler::par([&]() {
+      return unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+    }, []() {
+      return parallel_result<int>{0};
+    }).then([](int a, int) {
+      result = a;
+      return parallel_result<int>{0};
+    });
+  });
 
   PROFILE_SAVE("test_profile.prof")
 }
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index 4db3a7b..7c353ae 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -8,7 +8,7 @@
 using namespace pls::internal;
 
-constexpr size_t NUM_THREADS = 1;
+constexpr size_t NUM_THREADS = 4;
 
 constexpr size_t NUM_TASKS = 128;
 static constexpr int NUM_ITERATIONS = 100;
@@ -29,11 +29,8 @@ int fib_normal(int n) {
 }
 
 scheduling::parallel_result<int> fib(int n) {
-  if (n == 0) {
-    return 0;
-  }
-  if (n == 1) {
-    return 1;
+  if (n <= 10) {
+    return fib_normal(n);
   }
 
   return scheduling::scheduler::par([=]() {
@@ -47,6 +44,7 @@
 static volatile int result;
 int main() {
+  PROFILE_ENABLE;
   scheduling::static_scheduler_memory(end - start).count()
@@ -66,16 +64,20 @@
 int main() {
   for (int i = 0; i < NUM_ITERATIONS; i++) {
     scheduler.perform_work([]() {
+      PROFILE_MAIN_THREAD;
       return scheduling::scheduler::par([]() {
         return scheduling::parallel_result<int>(0);
       }, []() {
-        return fib(30);
+        return fib(35);
       }).then([](int, int b) {
         result = b;
+        PROFILE_LOCK("DONE");
         return scheduling::parallel_result<int>{0};
       });
     });
+    PROFILE_LOCK("DONE");
   }
+  PROFILE_SAVE("test_profile.prof");
 
   end = std::chrono::steady_clock::now();
   std::cout << "Framework: " << std::chrono::duration_cast(end - start).count()
             << std::endl;
diff --git a/lib/pls/include/pls/algorithms/for_each.h b/lib/pls/include/pls/algorithms/for_each.h
index 0cc11b1..54af7fa 100644
--- a/lib/pls/include/pls/algorithms/for_each.h
+++ b/lib/pls/include/pls/algorithms/for_each.h
@@ -2,6 +2,8 @@
 #ifndef PLS_PARALLEL_FOR_H
 #define PLS_PARALLEL_FOR_H
 
+#include "pls/internal/scheduling/parallel_result.h"
+
 namespace pls {
 namespace algorithm {
 
@@ -9,19 +11,26 @@ class fixed_strategy;
 class dynamic_strategy;
 
 template <typename Function, typename ExecutionStrategy>
-void for_each_range(unsigned long first,
-                    unsigned long last,
-                    const Function &function,
-                    ExecutionStrategy &execution_strategy);
+pls::internal::scheduling::parallel_result<int> for_each_range(unsigned long first,
+                                                               unsigned long last,
+                                                               const Function &function,
+                                                               ExecutionStrategy &execution_strategy);
 
 template <typename Function>
-void for_each_range(unsigned long first, unsigned long last, const Function &function);
+pls::internal::scheduling::parallel_result<int> for_each_range(unsigned long first,
+                                                               unsigned long last,
+                                                               const Function &function);
 
 template <typename RandomIt, typename Function, typename ExecutionStrategy>
-void for_each(RandomIt first, RandomIt last, const Function &function, ExecutionStrategy execution_strategy);
+pls::internal::scheduling::parallel_result<int> for_each(RandomIt first,
+                                                         RandomIt last,
+                                                         const Function &function,
+                                                         ExecutionStrategy execution_strategy);
 
 template <typename RandomIt, typename Function>
-void for_each(RandomIt first, RandomIt last, const Function &function);
+pls::internal::scheduling::parallel_result<int> for_each(RandomIt first,
+                                                         RandomIt last,
+                                                         const Function &function);
 
 }
 }
diff --git a/lib/pls/include/pls/algorithms/for_each_impl.h b/lib/pls/include/pls/algorithms/for_each_impl.h
index 058ae9f..2473f24 100644
--- a/lib/pls/include/pls/algorithms/for_each_impl.h
+++ b/lib/pls/include/pls/algorithms/for_each_impl.h
@@ -2,11 +2,8 @@
 #ifndef PLS_PARALLEL_FOR_IMPL_H
 #define PLS_PARALLEL_FOR_IMPL_H
 
-#include "pls/internal/scheduling/task.h"
 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/thread_state.h"
-
-#include "pls/internal/helpers/unique_id.h"
 #include "pls/internal/helpers/range.h"
 
 namespace pls {
@@ -14,7 +11,10 @@ namespace algorithm {
 namespace internal {
 
 template <typename RandomIt, typename Function>
-void for_each(const RandomIt first, const RandomIt last, const Function function, const long min_elements) {
+pls::internal::scheduling::parallel_result<int> for_each(const RandomIt first,
+                                                         const RandomIt last,
+                                                         const Function function,
+                                                         const long min_elements) {
   using namespace ::pls::internal::scheduling;
 
   const long num_elements = std::distance(first, last);
@@ -23,29 +23,25 @@ void for_each(const RandomIt first, const RandomIt last, const Function function
     for (auto current = first; current != last; current++) {
       function(*current);
     }
+
+    return parallel_result<int>{0};
   } else {
     // Cut in half recursively
    const long middle_index = num_elements / 2;
 
-    auto second_half_body =
-        [first, middle_index, last, &function, min_elements] {
-          internal::for_each(first + middle_index,
-                             last,
-                             function,
-                             min_elements);
-        };
-    using second_half_t = lambda_task_by_reference<decltype(second_half_body)>;
-    scheduler::spawn_child(std::move(second_half_body));
-
-    auto first_half_body =
-        [first, middle_index, last, &function, min_elements] {
-          internal::for_each(first,
-                             first + middle_index,
-                             function,
-                             min_elements);
-        };
-    using first_half_t = lambda_task_by_reference<decltype(first_half_body)>;
-    scheduler::spawn_child_and_wait(std::move(first_half_body));
+    return scheduler::par([first, middle_index, last, &function, min_elements] {
+      return internal::for_each(first,
+                                first + middle_index,
+                                function,
+                                min_elements);
+    }, [first, middle_index, last, &function, min_elements] {
+      return internal::for_each(first + middle_index,
+                                last,
+                                function,
+                                min_elements);
+    }).then([](int, int) {
+      return parallel_result<int>{0};
+    });
   }
 }
 
@@ -56,7 +52,7 @@ class dynamic_strategy {
   explicit dynamic_strategy(const unsigned int tasks_per_thread = 4) : tasks_per_thread_{tasks_per_thread} {};
 
   long calculate_min_elements(long num_elements) const {
-    const long num_threads = pls::internal::scheduling::thread_state::get()->scheduler_->num_threads();
+    const long num_threads = pls::internal::scheduling::thread_state::get().scheduler_->num_threads();
     return num_elements / (num_threads * tasks_per_thread_);
   }
 private:
@@ -75,29 +71,34 @@ class fixed_strategy {
 };
 
 template <typename RandomIt, typename Function, typename ExecutionStrategy>
-void for_each(RandomIt first, RandomIt last, const Function &function, ExecutionStrategy execution_strategy) {
+pls::internal::scheduling::parallel_result<int> for_each(RandomIt first,
+                                                         RandomIt last,
+                                                         const Function &function,
+                                                         ExecutionStrategy execution_strategy) {
   long num_elements = std::distance(first, last);
-  internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements));
+  return internal::for_each(first, last, function, execution_strategy.calculate_min_elements(num_elements));
 }
 
 template <typename RandomIt, typename Function>
-void for_each(RandomIt first, RandomIt last, const Function &function) {
-  for_each(first, last, function, dynamic_strategy{4});
+pls::internal::scheduling::parallel_result<int> for_each(RandomIt first, RandomIt last, const Function &function) {
+  return for_each(first, last, function, dynamic_strategy{4});
 }
 
 template <typename Function, typename ExecutionStrategy>
-void for_each_range(unsigned long first,
-                    unsigned long last,
-                    const Function &function,
-                    ExecutionStrategy execution_strategy) {
+pls::internal::scheduling::parallel_result<int> for_each_range(unsigned long first,
+                                                               unsigned long last,
+                                                               const Function &function,
+                                                               ExecutionStrategy execution_strategy) {
   auto range = pls::internal::helpers::range(first, last);
-  for_each(range.begin(), range.end(), function, execution_strategy);
+  return for_each(range.begin(), range.end(), function, execution_strategy);
 }
 
 template <typename Function>
-void for_each_range(unsigned long first, unsigned long last, const Function &function) {
+pls::internal::scheduling::parallel_result<int> for_each_range(unsigned long first,
+                                                               unsigned long last,
+                                                               const Function &function) {
   auto range = pls::internal::helpers::range(first, last);
-  for_each(range.begin(), range.end(), function);
+  return for_each(range.begin(), range.end(), function);
 }
 
 }
diff --git a/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h b/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h
index 6f77d39..e54d9d7 100644
--- a/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h
+++ b/lib/pls/include/pls/internal/data_structures/bounded_trading_deque.h
@@ -247,9 +247,9 @@ class bounded_trading_deque {
   deque_entry *entries_;
   size_t num_entries_;
 
-  std::atomic<stamped_integer> top_{{0, 0}};
+  alignas(base::system_details::CACHE_LINE_SIZE) std::atomic<stamped_integer> top_{{0, 0}};
+  alignas(base::system_details::CACHE_LINE_SIZE) std::atomic bot_{0};
-  std::atomic bot_{0};
 
   stamped_integer bot_internal_{0, 0};
 };
diff --git a/lib/pls/include/pls/internal/helpers/profiler.h b/lib/pls/include/pls/internal/helpers/profiler.h
index 2902344..e0fe3ad 100644
--- a/lib/pls/include/pls/internal/helpers/profiler.h
+++ b/lib/pls/include/pls/internal/helpers/profiler.h
@@ -6,9 +6,10 @@
 #include
 #include
 
-#define PROFILE_WORK_BLOCK(msg) EASY_BLOCK(msg, profiler::colors::LightGreen)
-#define PROFILE_FORK_JOIN_STEALING(msg) EASY_BLOCK(msg, profiler::colors::LightBlue)
-#define PROFILE_STEALING(msg) EASY_BLOCK(msg, profiler::colors::Blue)
+#define PROFILE_TASK(msg) EASY_BLOCK(msg, profiler::colors::LightBlue)
+#define PROFILE_CONTINUATION(msg) EASY_BLOCK(msg, profiler::colors::LightBlue)
+#define PROFILE_FAST_PATH(msg) EASY_BLOCK(msg, profiler::colors::Green)
+#define PROFILE_STEALING(msg) EASY_BLOCK(msg, profiler::colors::Orange)
 #define PROFILE_LOCK(msg) EASY_BLOCK(msg, profiler::colors::Red)
 
 #define PROFILE_END_BLOCK EASY_END_BLOCK
@@ -21,8 +22,9 @@
 
 #else //ENABLE_EASY_PROFILER
 
-#define PROFILE_WORK_BLOCK(msg)
-#define PROFILE_FORK_JOIN_STEALING(msg)
+#define PROFILE_TASK(msg)
+#define PROFILE_CONTINUATION(msg)
+#define PROFILE_FAST_PATH(msg)
 #define PROFILE_STEALING(msg)
 #define PROFILE_LOCK(msg)
diff --git a/lib/pls/include/pls/internal/scheduling/cont.h b/lib/pls/include/pls/internal/scheduling/cont.h
index dd05411..1b981e8 100644
--- a/lib/pls/include/pls/internal/scheduling/cont.h
+++ b/lib/pls/include/pls/internal/scheduling/cont.h
@@ -11,6 +11,8 @@
 #include "pls/internal/base/alignment.h"
 #include "pls/internal/base/error_handling.h"
 
+#include "pls/internal/helpers/profiler.h"
+
 #include "parallel_result.h"
 #include "memory_block.h"
 
@@ -119,6 +121,7 @@ class cont : public base_cont {
         task_{std::forward(task_2_args)..., this} {};
 
   void execute() override {
+    PROFILE_CONTINUATION("execute_cont");
     using result_type = decltype(function_((*left_result_).value(), (*right_result_).value()));
     result_runner<result_type>::execute(*this);
diff --git a/lib/pls/include/pls/internal/scheduling/memory_block.h b/lib/pls/include/pls/internal/scheduling/memory_block.h
index 42ad2c0..f4d42c1 100644
--- a/lib/pls/include/pls/internal/scheduling/memory_block.h
+++ b/lib/pls/include/pls/internal/scheduling/memory_block.h
@@ -27,8 +27,7 @@ class memory_block {
         memory_buffer_{memory_buffer},
         memory_buffer_size_{memory_buffer_size},
         memory_buffer_used_{false},
-        depth_{depth},
-        owner_{0} {};
+        depth_{depth} {};
 
   template <typename T, typename ...ARGS>
   T *place_in_buffer(ARGS &&...args) {
@@ -82,13 +81,6 @@ class memory_block {
     results_missing_.store(2);
   }
 
-  void set_owner(int owner) {
-    owner_ = owner;
-  }
-  int get_owner() {
-    return owner_;
-  }
-
 private:
  // Linked list property of memory blocks (a complete list represents a threads currently owned memory).
  // Each block knows its chain start to allow stealing a whole chain in O(1)
@@ -120,9 +112,6 @@
  // Swapping parts of a memory chain will not reorder it, as always parts of
  // the same size are exchanged.
   const int depth_;
-
-  // TODO: Remove, debug only
-  int owner_;
 };
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index f28b844..8b2e8d7 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -7,6 +7,8 @@
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/task.h"
 
+#include "pls/internal/helpers/profiler.h"
+
 namespace pls {
 namespace internal {
 namespace scheduling {
@@ -32,6 +34,7 @@ struct scheduler::starter {
 
   auto then(FCONT &&cont_function)
   -> decltype(cont_function(std::declval<return_type_1>(), std::declval<return_type_2>())) {
+    PROFILE_FAST_PATH("then");
    using continuation_type = cont<task<F2>, return_type_1, return_type_2, FCONT>;
    using result_type = decltype(cont_function(std::declval<return_type_1>(), std::declval<return_type_2>()));
@@ -49,7 +52,6 @@ struct scheduler::starter {
     const bool is_right_cont = my_state.right_spawn_;
 
     base_cont *parent_cont = my_state.parent_cont_;
-    current_memory_block->set_owner(my_state.get_id());
     continuation_type *current_cont = current_memory_block->place_in_buffer<continuation_type>(parent_cont,
                                                                                                current_memory_block,
                                                                                                is_right_cont,
@@ -63,7 +65,7 @@
     // Call first function on fast path
     my_state.right_spawn_ = false;
     return_type_1 result_1 = function_1_();
-    if (cont_manager.falling_through()) {
+    if (!result_1.fast_path()) {
       // Get our replacement from the task stack and store it for later use when we are actually blocked.
       auto traded_memory = my_state.get_task_manager().try_pop_local();
       current_cont->get_memory_block()->get_offered_chain().store(*traded_memory);
@@ -87,7 +89,7 @@
     } else {
       my_state.right_spawn_ = true;
       return_type_2 result_2 = function_2_();
-      if (cont_manager.falling_through()) {
+      if (!result_2.fast_path()) {
         // Main scheduling loop is responsible for entering the result to the slow path...
         current_cont->store_left_result(std::move(result_1));
         current_cont->get_memory_block()->get_results_missing().fetch_add(-1);
@@ -109,7 +111,7 @@
     my_state.right_spawn_ = is_right_cont;
 
     auto cont_result = cont_function(result_1.value(), result_2.value());
-    if (cont_manager.falling_through()) {
+    if (!cont_result.fast_path()) {
       // Unwind stack...
       return result_type{};
     }
@@ -154,8 +156,6 @@ class scheduler::init_function_impl : public init_function {
 
 template <typename Function>
 void scheduler::perform_work(Function work_section) {
-  PROFILE_WORK_BLOCK("scheduler::perform_work")
-
   // Prepare main root task
   init_function_impl<Function> starter_function{work_section};
   main_thread_starter_function_ = &starter_function;
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h
index 935c499..25c8063 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_memory.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_memory.h
@@ -18,19 +18,27 @@ class scheduler_memory {
   // By not having an initialization routine we can do our 'static and heap specialization'
   // without running into any ordering problems in the initialization sequence.
-  // We first worried about performance of this being virtual.
-  // However, we decided that only thread_state_for is used during the
-  // runtime and that only when stealing. As stealing is expensive anyways,
-  // this should not add too much overhead.
+ protected:
+  thread_state **thread_states_array_{nullptr};
+
+ public:
   virtual size_t max_threads() const = 0;
   virtual base::thread &thread_for(size_t id) = 0;
-  virtual thread_state &thread_state_for(size_t id) = 0;
+  thread_state &thread_state_for(size_t id) {
+    return *thread_states_array_[id];
+  }
 };
 
 template <size_t MAX_THREADS, size_t NUM_TASKS, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
 class static_scheduler_memory : public scheduler_memory {
  public:
+  static_scheduler_memory() : scheduler_memory{} {
+    for (size_t i = 0; i < MAX_THREADS; i++) {
+      thread_state_pointers_[i] = &thread_states_[i].get_thread_state();
+    }
+    thread_states_array_ = thread_state_pointers_.data();
+  }
+
   size_t max_threads() const override {
     return MAX_THREADS;
   }
@@ -38,16 +46,12 @@ class static_scheduler_memory : public scheduler_memory {
   base::thread &thread_for(size_t id) override {
     return threads_[id];
   }
-
-  thread_state &thread_state_for(size_t id) override {
-    return thread_states_[id].get_thread_state();
-  }
-
  private:
   using thread_state_type = thread_state_static<NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE>;
 
   alignas(base::system_details::CACHE_LINE_SIZE) std::array threads_;
   alignas(base::system_details::CACHE_LINE_SIZE) std::array thread_states_;
+  alignas(base::system_details::CACHE_LINE_SIZE) std::array thread_state_pointers_;
 };
 
 template <size_t NUM_TASKS, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
@@ -55,14 +59,17 @@ class heap_scheduler_memory : public scheduler_memory {
  public:
   explicit heap_scheduler_memory(size_t max_threads) : max_threads_{max_threads},
                                                        thread_vector_{},
-                                                       thread_state_vector_{} {
+                                                       thread_state_vector_{},
+                                                       thread_state_pointers_{} {
     thread_vector_.reserve(max_threads);
     thread_state_vector_.reserve(max_threads);
 
     for (size_t i = 0; i < max_threads; i++) {
       thread_vector_.emplace_back();
       thread_state_vector_.emplace_back();
+      thread_state_pointers_.emplace_back(&thread_state_vector_[i].get_thread_state());
     }
+    thread_states_array_ = thread_state_pointers_.data();
   }
 
   size_t max_threads() const override {
@@ -72,11 +79,6 @@
   base::thread &thread_for(size_t id) override {
     return thread_vector_[id];
   }
-
-  thread_state &thread_state_for(size_t id) override {
-    return thread_state_vector_[id].object().get_thread_state();
-  }
-
  private:
   using thread_state_type = thread_state_static<NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE>;
   // thread_state_type is aligned at the cache line and therefore overaligned (C++ 11 does not require
@@ -88,6 +90,7 @@
   size_t max_threads_;
   std::vector thread_vector_;
   std::vector thread_state_vector_;
+  std::vector<thread_state *> thread_state_pointers_;
 };
 
 }
diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h
index 8b10e90..372a18e 100644
--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -4,6 +4,8 @@
 #include "pls/internal/scheduling/cont.h"
 #include "pls/internal/scheduling/memory_block.h"
 
+#include "pls/internal/helpers/profiler.h"
+
 namespace pls {
 namespace internal {
 namespace scheduling {
@@ -47,6 +49,7 @@ class task : public base_task {
       : base_task{cont}, function_{std::forward(function)} {}
 
   void execute_internal() override {
+    PROFILE_TASK("execute_internal")
     auto result = function_();
     if (result.fast_path()) {
       cont_->store_right_result(std::move(result));
diff --git a/lib/pls/include/pls/internal/scheduling/task_manager.h b/lib/pls/include/pls/internal/scheduling/task_manager.h
index 961cf1b..5eeb73d 100644
--- a/lib/pls/include/pls/internal/scheduling/task_manager.h
+++ b/lib/pls/include/pls/internal/scheduling/task_manager.h
@@ -30,20 +30,17 @@ class task_manager {
  public:
   // Publishes a task on the stack, i.e. makes it visible for other threads to steal.
   void publish_task(base_task *task) {
-//    std::lock_guard<base::spin_lock> lock{lock_};
     task_deque_.push_bot(task->get_cont()->get_memory_block());
   }
 
   // Try to pop a local task from this task managers stack.
   data_structures::optional try_pop_local() {
-//    std::lock_guard<base::spin_lock> lock{lock_};
     return task_deque_.pop_bot().traded_;
   }
 
   // Try to steal a task from a remote task_manager instance. The stolen task must be stored locally.
   // Returns a pair containing the actual task and if the steal was successful.
   base_task *steal_remote_task(cont_manager &stealing_cont_manager) {
-//    std::lock_guard<base::spin_lock> lock{lock_};
     auto peek = task_deque_.peek_top();
 
     if (std::get<0>(peek)) {
@@ -73,7 +70,6 @@ class task_manager {
 
  private:
   data_structures::bounded_trading_deque &task_deque_;
-  base::spin_lock lock_{};
 };
 
 template
diff --git a/lib/pls/include/pls/internal/scheduling/thread_state_static.h b/lib/pls/include/pls/internal/scheduling/thread_state_static.h
index 39b1815..4ac295a 100644
--- a/lib/pls/include/pls/internal/scheduling/thread_state_static.h
+++ b/lib/pls/include/pls/internal/scheduling/thread_state_static.h
@@ -5,6 +5,8 @@
 #include "pls/internal/scheduling/task_manager.h"
 #include "pls/internal/scheduling/cont_manager.h"
 
+#include "pls/internal/base/system_details.h"
+
 #include "thread_state.h"
 
 namespace pls {
@@ -12,7 +14,7 @@ namespace internal {
 namespace scheduling {
 
 template <size_t NUM_TASKS, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
-struct thread_state_static {
+struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state_static {
  public:
   thread_state_static()
       : static_task_manager_{},
@@ -21,9 +23,9 @@ struct thread_state_static {
 
   thread_state &get_thread_state() { return thread_state_; }
  private:
-  static_task_manager static_task_manager_;
-  static_cont_manager static_cont_manager_;
-  thread_state thread_state_;
+  alignas(base::system_details::CACHE_LINE_SIZE) static_task_manager static_task_manager_;
+  alignas(base::system_details::CACHE_LINE_SIZE) static_cont_manager static_cont_manager_;
+  alignas(base::system_details::CACHE_LINE_SIZE) thread_state thread_state_;
 };
 
 }
diff --git a/lib/pls/src/internal/scheduling/scheduler.cpp b/lib/pls/src/internal/scheduling/scheduler.cpp
index 3a3c9c0..960a216 100644
--- a/lib/pls/src/internal/scheduling/scheduler.cpp
+++ b/lib/pls/src/internal/scheduling/scheduler.cpp
@@ -1,8 +1,9 @@
 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/thread_state.h"
-#include "pls/internal/scheduling/task.h"
+#include "pls/internal/base/thread.h"
 
 #include "pls/internal/base/error_handling.h"
+#include "pls/internal/helpers/profiler.h"
 
 namespace pls {
 namespace internal {
@@ -74,17 +75,15 @@ void scheduler::work_thread_work_section() {
 
     // Steal Routine (will be continuously executed when there are no more fall through's).
     // TODO: move into separate function
     const size_t offset = my_state.random_() % num_threads;
-    const size_t max_tries = num_threads - 1;
+    const size_t max_tries = num_threads;
     for (size_t i = 0; i < max_tries; i++) {
       size_t target = (offset + i) % num_threads;
-
-      // Skip our self for stealing
-      target = ((target == my_id) + target) % num_threads;
-
       auto &target_state = my_state.get_scheduler().thread_state_for(target);
 
       PLS_ASSERT(my_cont_manager.is_clean(), "Only steal with clean chain!");
+      PROFILE_STEALING("steal")
       auto *stolen_task = target_state.get_task_manager().steal_remote_task(my_cont_manager);
+      PROFILE_END_BLOCK;
       if (stolen_task != nullptr) {
         my_state.parent_cont_ = stolen_task->get_cont();
         my_state.right_spawn_ = true;
@@ -97,6 +96,9 @@ void scheduler::work_thread_work_section() {
       }
     }
+//    if (!my_cont_manager.falling_through()) {
+//      base::this_thread::sleep(5);
+//    }
   } while (!work_section_done_);
 
   PLS_ASSERT(my_cont_manager.is_clean(), "Only finish work section with clean chain!");
diff --git a/media/e34ea267_fft_execution_pattern.png b/media/e34ea267_fft_execution_pattern.png
new file mode 100644
index 0000000..108ab8d
Binary files /dev/null and b/media/e34ea267_fft_execution_pattern.png differ
diff --git a/media/e34ea267_thread_state_for.png b/media/e34ea267_thread_state_for.png
new file mode 100644
index 0000000..8431bfc
Binary files /dev/null and b/media/e34ea267_thread_state_for.png differ
-- 
libgit2 0.26.0