diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index 7cc7c1e..ea9e983 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -242,3 +242,42 @@ case of FFT).
We also want to note that all these measurements are not very
controlled/scientific, but simply ran ot our notebook for
fast iterations over different, potential issues with our scheduler.
+
+
+### Commit 116cf4af - VTune Amplifier and MRSW top level lock
+
+When looking at why our code works quite well on problems with
+mostly busy workers and not so well on code with spinning/waiting
+workers (like in the FFT), we take a closer look at the FFT and
+matrix multiplication in VTune.
+
+FFT:
+
+![FFT VTune analysis](media/116cf4af_fft_vtune.png)
+
+Matrix:
+
+![Matrix VTune analysis](media/116cf4af_matrix_vtune.png)
+
+The sections highlighted in red represent parts of the code spent
+on spinning in the work-stealing loop.
+We can see that as long as our workers are mainly busy/find work
+in the stealing loop the overhead spent on spinning is minimal.
+We can also see that in the FFT considerable amounts of time are
+spent spinning.
+
+A general observation is the high CPI rates for our spinning code.
+This makes sense, as our spinning loops currently rely on locks that share
+atomic variables between threads, thus leading to cache misses.
+
+### Commit 116cf4af - 2D Heat Diffusion
+
+As a last test of our current performance state, we implemented the
+2D heat diffusion benchmark using our framework (using fork-join based
+parallel_for, 512 heat array size):
+
+![2D heat diffusion average runtimes](media/116cf4af_heat_average.png)
+
+We observe solid performance from our implementation.
+(Again, not very scientific test environment, but good enough for
+our general direction)
diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index c633820..bb21de6 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -71,7 +71,7 @@ int main() {
//
// scheduler.perform_work([&] {
// PROFILE_MAIN_THREAD
-// for (int i = 0; i < 10; i++) {
+// for (int i = 0; i < 5000; i++) {
// PROFILE_WORK_BLOCK("Top Level")
// result.multiply(a, b);
// }
@@ -79,4 +79,4 @@ int main() {
//
// PROFILE_SAVE("test_profile.prof")
//}
-
+//
diff --git a/app/benchmark_unbalanced/main.cpp b/app/benchmark_unbalanced/main.cpp
index 27cdaf3..1860877 100644
--- a/app/benchmark_unbalanced/main.cpp
+++ b/app/benchmark_unbalanced/main.cpp
@@ -50,29 +50,29 @@ int unbalanced_tree_search(int seed, int root_children, double q, int normal_chi
return result;
}
+//
+//int main() {
+// PROFILE_ENABLE
+// pls::internal::helpers::run_mini_benchmark([&] {
+// unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+// }, 8, 4000);
+//
+// PROFILE_SAVE("test_profile.prof")
+//}
int main() {
PROFILE_ENABLE
- pls::internal::helpers::run_mini_benchmark([&] {
- unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
- }, 8, 4000);
+ pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
+ pls::scheduler scheduler{&my_scheduler_memory, 8};
+
+ scheduler.perform_work([&] {
+ PROFILE_MAIN_THREAD
+ for (int i = 0; i < 50; i++) {
+ PROFILE_WORK_BLOCK("Top Level")
+ int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
+ std::cout << result << std::endl;
+ }
+ });
PROFILE_SAVE("test_profile.prof")
}
-
-//int main() {
-// PROFILE_ENABLE
-// pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
-// pls::scheduler scheduler{&my_scheduler_memory, 8};
-//
-// scheduler.perform_work([&] {
-// PROFILE_MAIN_THREAD
-// for (int i = 0; i < 10; i++) {
-// PROFILE_WORK_BLOCK("Top Level")
-// int result = unbalanced_tree_search(SEED, ROOT_CHILDREN, Q, NORMAL_CHILDREN);
-// std::cout << result << std::endl;
-// }
-// });
-//
-// PROFILE_SAVE("test_profile.prof")
-//}
diff --git a/app/invoke_parallel/main.cpp b/app/invoke_parallel/main.cpp
index 4382168..e469bad 100644
--- a/app/invoke_parallel/main.cpp
+++ b/app/invoke_parallel/main.cpp
@@ -91,7 +91,7 @@ int main() {
PROFILE_MAIN_THREAD
// Call looks just the same, only requirement is
// the enclosure in the perform_work lambda.
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < 1000; i++) {
PROFILE_WORK_BLOCK("Top Level FFT")
complex_vector input = initial_input;
fft(input.begin(), input.size());
diff --git a/lib/pls/src/internal/scheduling/abstract_task.cpp b/lib/pls/src/internal/scheduling/abstract_task.cpp
index 85e2c36..0fb15d2 100644
--- a/lib/pls/src/internal/scheduling/abstract_task.cpp
+++ b/lib/pls/src/internal/scheduling/abstract_task.cpp
@@ -18,7 +18,7 @@ bool abstract_task::steal_work() {
const size_t my_id = my_state->id_;
const size_t offset = my_state->random_() % my_scheduler->num_threads();
- const size_t max_tries = my_scheduler->num_threads() - 1; // TODO: Tune this value
+ const size_t max_tries = my_scheduler->num_threads(); // TODO: Tune this value
for (size_t i = 0; i < max_tries; i++) {
size_t target = (offset + i) % my_scheduler->num_threads();
if (target == my_id) {
diff --git a/media/116cf4af_fft_vtune.png b/media/116cf4af_fft_vtune.png
new file mode 100644
index 0000000..01008cb
Binary files /dev/null and b/media/116cf4af_fft_vtune.png differ
diff --git a/media/116cf4af_heat_average.png b/media/116cf4af_heat_average.png
new file mode 100644
index 0000000..24f159b
Binary files /dev/null and b/media/116cf4af_heat_average.png differ
diff --git a/media/116cf4af_matrix_vtune.png b/media/116cf4af_matrix_vtune.png
new file mode 100644
index 0000000..8182121
Binary files /dev/null and b/media/116cf4af_matrix_vtune.png differ