diff --git a/PERFORMANCE.md b/PERFORMANCE.md
index dbb786c..7112439 100644
--- a/PERFORMANCE.md
+++ b/PERFORMANCE.md
@@ -354,4 +354,31 @@ fix the source rather then 'circumventing' it with these extra tasks.
performance, as contemption on the bus/cache is always bad)
-
+After some research we think that the issue is down to many threads
+referencing the same atomic reference counter. We think so because
+even cache aligning the shared refernce count does not fix the issue
+when using the direct function call. Also, forcing a new method call
+(going down in the call stack one function call) is not solving the
+issue (thus making sure that it is not related with some caching issue
+in the call itself).
+
+In conclusion there seems to be a hyperthreading issue with this
+shared reference count. We keep this in mind if we eventually get
+tasks with changing data memebers (as this problem could reappear there,
+as then the ref_count actualy is in the same memory region as our
+'user variables'). For now we leave the code like it is.
+
+
+FFT Average with new call method:
+
+
+
+The performance of our new call method looks shockingly similar
+to TBB with a slight, constant performance drop behind it.
+This makes sense, as the basic principle (lock-free, classic work
+stealing deque and the parallel call structure) are nearly the same.
+
+We will see if minor optimizations can even close this last gap.
+Overall the performance at this point is good enough to move on
+to implementing more functionality and to running tests on different
+queues/stealing tactics etc.
diff --git a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
index c96fb3f..ef634bd 100644
--- a/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
+++ b/lib/pls/include/pls/algorithms/invoke_parallel_impl.h
@@ -5,6 +5,7 @@
#include "pls/internal/scheduling/task.h"
#include "pls/internal/scheduling/lambda_task.h"
#include "pls/internal/scheduling/scheduler.h"
+#include "pls/internal/scheduling/thread_state.h"
namespace pls {
namespace algorithm {
@@ -17,10 +18,7 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2) {
auto sub_task_2 = lambda_task_by_reference(function2);
scheduler::spawn_child(sub_task_2);
- scheduler::spawn_child(sub_task_1);
- // TODO: Research the exact cause of this being faster
-// function1(); // Execute first function 'inline' without spawning a sub_task object
- scheduler::wait_for_all();
+ scheduler::spawn_child_and_wait(sub_task_1);
}
template
@@ -31,12 +29,9 @@ void invoke_parallel(const Function1 &function1, const Function2 &function2, con
auto sub_task_2 = lambda_task_by_reference(function2);
auto sub_task_3 = lambda_task_by_reference(function3);
- scheduler::spawn_child(sub_task_2);
scheduler::spawn_child(sub_task_3);
- scheduler::spawn_child(sub_task_1);
- // TODO: Research the exact cause of this being faster
-// function1(); // Execute first function 'inline' without spawning a sub_task object
- scheduler::wait_for_all();
+ scheduler::spawn_child(sub_task_2);
+ scheduler::spawn_child_and_wait(sub_task_1);
}
}
diff --git a/lib/pls/include/pls/algorithms/parallel_for_impl.h b/lib/pls/include/pls/algorithms/parallel_for_impl.h
index 786b875..b787b44 100644
--- a/lib/pls/include/pls/algorithms/parallel_for_impl.h
+++ b/lib/pls/include/pls/algorithms/parallel_for_impl.h
@@ -25,11 +25,13 @@ void parallel_for(RandomIt first, RandomIt last, const Function &function) {
// Cut in half recursively
long middle_index = num_elements / 2;
- auto body = [=] { parallel_for(first + middle_index, last, function); };
- lambda_task_by_reference second_half_task(body);
+ auto body2 = [=] { parallel_for(first + middle_index, last, function); };
+ lambda_task_by_reference second_half_task(body2);
scheduler::spawn_child(second_half_task);
- parallel_for(first, first + middle_index, function);
+ auto body1 = [=] { parallel_for(first, first + middle_index, function); };
+ lambda_task_by_reference first_half_task(body1);
+ scheduler::spawn_child(first_half_task);
scheduler::wait_for_all();
}
}
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler.h b/lib/pls/include/pls/internal/scheduling/scheduler.h
index c38ae64..54075fc 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler.h
@@ -84,6 +84,15 @@ class scheduler {
static void spawn_child(T &sub_task);
/**
+ * Helper to spawn a child on the currently running task and waiting for it (skipping over the task-deque).
+ *
+ * @tparam T type of the new task
+ * @param sub_task the new task to be spawned
+ */
+ template
+ static void spawn_child_and_wait(T &sub_task);
+
+ /**
* Helper to wait for all children of the currently executing task.
*/
static void wait_for_all();
diff --git a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
index 46265b7..98156dc 100644
--- a/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
+++ b/lib/pls/include/pls/internal/scheduling/scheduler_impl.h
@@ -34,6 +34,11 @@ void scheduler::spawn_child(T &sub_task) {
thread_state::get()->current_task_->spawn_child(sub_task);
}
+template
+void scheduler::spawn_child_and_wait(T &sub_task) {
+ thread_state::get()->current_task_->spawn_child_and_wait(sub_task);
+}
+
}
}
}
diff --git a/lib/pls/include/pls/internal/scheduling/task.h b/lib/pls/include/pls/internal/scheduling/task.h
index 30e04b0..cdeb9f2 100644
--- a/lib/pls/include/pls/internal/scheduling/task.h
+++ b/lib/pls/include/pls/internal/scheduling/task.h
@@ -35,6 +35,8 @@ class task {
template
void spawn_child(T &&sub_task);
+ template
+ void spawn_child_and_wait(T &&sub_task);
void wait_for_all();
private:
@@ -58,6 +60,19 @@ void task::spawn_child(T &&sub_task) {
thread_state::get()->deque_.push_tail(const_task);
}
+template
+void task::spawn_child_and_wait(T &&sub_task) {
+ PROFILE_FORK_JOIN_STEALING("spawn_child")
+ static_assert(std::is_base_of::type>::value, "Only pass task subclasses!");
+
+ // Assign forced values (for stack and parent management)
+ sub_task.parent_ = nullptr;
+ sub_task.deque_state_ = thread_state::get()->deque_.save_state();
+ sub_task.execute();
+
+ wait_for_all();
+}
+
}
}
}
diff --git a/lib/pls/src/internal/data_structures/locking_deque.cpp b/lib/pls/src/internal/data_structures/locking_deque.cpp
deleted file mode 100644
index 90971ce..0000000
--- a/lib/pls/src/internal/data_structures/locking_deque.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include
-
-#include "pls/internal/data_structures/locking_deque.h"
-
-namespace pls {
-namespace internal {
-namespace data_structures {
-
-locking_deque_item *locking_deque_internal::pop_head_internal() {
- std::lock_guard lock{lock_};
-
- if (head_ == nullptr) {
- return nullptr;
- }
-
- locking_deque_item *result = head_;
- head_ = head_->next_;
- if (head_ == nullptr) {
- tail_ = nullptr;
- } else {
- head_->prev_ = nullptr;
- }
-
- return result;
-}
-
-locking_deque_item *locking_deque_internal::pop_tail_internal() {
- std::lock_guard lock{lock_};
-
- if (tail_ == nullptr) {
- return nullptr;
- }
-
- locking_deque_item *result = tail_;
- tail_ = tail_->prev_;
- if (tail_ == nullptr) {
- head_ = nullptr;
- } else {
- tail_->next_ = nullptr;
- }
-
- return result;
-}
-
-void locking_deque_internal::push_tail_internal(locking_deque_item *new_item) {
- std::lock_guard lock{lock_};
-
- if (tail_ != nullptr) {
- tail_->next_ = new_item;
- } else {
- head_ = new_item;
- }
- new_item->prev_ = tail_;
- new_item->next_ = nullptr;
- tail_ = new_item;
-}
-
-}
-}
-}
diff --git a/media/5044f0a1_fft_average.png b/media/5044f0a1_fft_average.png
new file mode 100644
index 0000000..deddf73
Binary files /dev/null and b/media/5044f0a1_fft_average.png differ