Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
las3_pub
/
predictable_parallel_patterns
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
bd826491
authored
Apr 30, 2019
by
FritzFlorian
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
WIP: lock-free work stealing deque based on our stack.
parent
d16ad3eb
Pipeline
#1159
passed with stages
in 3 minutes 37 seconds
Changes
21
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
309 additions
and
106 deletions
+309
-106
app/benchmark_fft/main.cpp
+1
-1
app/invoke_parallel/main.cpp
+73
-21
app/playground/main.cpp
+1
-4
lib/pls/CMakeLists.txt
+1
-0
lib/pls/include/pls/internal/base/alignment.h
+23
-1
lib/pls/include/pls/internal/base/backoff.h
+2
-2
lib/pls/include/pls/internal/base/error_handling.h
+1
-0
lib/pls/include/pls/internal/base/system_details.h
+24
-2
lib/pls/include/pls/internal/base/ttas_spin_lock.h
+2
-3
lib/pls/include/pls/internal/data_structures/aligned_stack.h
+8
-6
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
+1
-1
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h
+0
-0
lib/pls/include/pls/internal/scheduling/fork_join_task.h
+22
-12
lib/pls/src/internal/base/alignment.cpp
+12
-3
lib/pls/src/internal/base/swmr_spin_lock.cpp
+4
-4
lib/pls/src/internal/base/ttas_spin_lock.cpp
+2
-2
lib/pls/src/internal/data_structures/aligned_stack.cpp
+6
-1
lib/pls/src/internal/data_structures/deque.cpp
+7
-7
lib/pls/src/internal/scheduling/fork_join_task.cpp
+30
-35
test/CMakeLists.txt
+1
-1
test/data_structures_test.cpp
+88
-0
No files found.
app/benchmark_fft/main.cpp
View file @
bd826491
...
...
@@ -6,7 +6,7 @@
#include <complex>
#include <vector>
static
constexpr
int
CUTOFF
=
1
0
;
static
constexpr
int
CUTOFF
=
1
6
;
static
constexpr
int
NUM_ITERATIONS
=
1000
;
static
constexpr
int
INPUT_SIZE
=
2064
;
typedef
std
::
vector
<
std
::
complex
<
double
>>
complex_vector
;
...
...
app/invoke_parallel/main.cpp
View file @
bd826491
...
...
@@ -2,48 +2,100 @@
#include <pls/internal/helpers/profiler.h>
#include <iostream>
#include <complex>
#include <vector>
static
pls
::
static_scheduler_memory
<
8
,
2
<<
14
>
my_scheduler_memory
;
static
constexpr
int
CUTOFF
=
16
;
static
constexpr
int
NUM_ITERATIONS
=
1000
;
static
constexpr
int
INPUT_SIZE
=
2064
;
typedef
std
::
vector
<
std
::
complex
<
double
>>
complex_vector
;
static
constexpr
int
CUTOFF
=
10
;
/**
 * Splits the first 2 * (n / 2) elements starting at `data` into two halves in place:
 * the elements found at even indices end up in positions [0, n / 2),
 * the elements found at odd indices end up in positions [n / 2, 2 * (n / 2)).
 *
 * @param data Iterator to the first element of the range to reorder.
 * @param n Number of elements to consider, starting at `data`.
 */
void divide(complex_vector::iterator data, int n) {
  const int half = n / 2;

  // The odd-indexed values are parked in a scratch buffer, as compacting the
  // even-indexed values to the front overwrites some of their slots.
  complex_vector odd_buffer(half);
  for (int k = 0; k < half; k++) {
    odd_buffer[k] = data[2 * k + 1];
    // Safe to compact in the same pass: only indices <= k have been written so far,
    // while every remaining read targets an index >= 2 * k.
    data[k] = data[2 * k];
  }

  // Append the parked odd-indexed values behind the even-indexed ones.
  for (int k = 0; k < half; k++) {
    data[half + k] = odd_buffer[k];
  }
}
void
combine
(
complex_vector
::
iterator
data
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
{
std
::
complex
<
double
>
even
=
data
[
i
];
std
::
complex
<
double
>
odd
=
data
[
i
+
n
/
2
];
// w is the "twiddle-factor".
// this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
// so it won't impact the performance comparison.
std
::
complex
<
double
>
w
=
exp
(
std
::
complex
<
double
>
(
0
,
-
2.
*
M_PI
*
i
/
n
));
long
fib_serial
(
long
n
)
{
if
(
n
==
0
)
{
return
0
;
data
[
i
]
=
even
+
w
*
odd
;
data
[
i
+
n
/
2
]
=
even
-
w
*
odd
;
}
if
(
n
==
1
)
{
return
1
;
}
void
fft
(
complex_vector
::
iterator
data
,
int
n
)
{
if
(
n
<
2
)
{
return
;
}
return
fib_serial
(
n
-
1
)
+
fib_serial
(
n
-
2
);
PROFILE_WORK_BLOCK
(
"Divide"
)
divide
(
data
,
n
);
PROFILE_END_BLOCK
PROFILE_WORK_BLOCK
(
"Invoke Parallel"
)
if
(
n
==
CUTOFF
)
{
PROFILE_WORK_BLOCK
(
"FFT Serial"
)
fft
(
data
,
n
/
2
);
fft
(
data
+
n
/
2
,
n
/
2
);
}
else
if
(
n
<=
CUTOFF
)
{
fft
(
data
,
n
/
2
);
fft
(
data
+
n
/
2
,
n
/
2
);
}
else
{
pls
::
invoke_parallel
(
[
&
]
{
fft
(
data
,
n
/
2
);
},
[
&
]
{
fft
(
data
+
n
/
2
,
n
/
2
);
}
);
}
PROFILE_END_BLOCK
PROFILE_WORK_BLOCK
(
"Combine"
)
combine
(
data
,
n
);
PROFILE_END_BLOCK
}
long
fib
(
long
n
)
{
if
(
n
<=
CUTOFF
)
{
return
fib_serial
(
n
);
complex_vector
prepare_input
(
int
input_size
)
{
std
::
vector
<
double
>
known_frequencies
{
2
,
11
,
52
,
88
,
256
};
complex_vector
data
(
input_size
);
// Set our input data to match a time series of the known_frequencies.
// When applying fft to this time-series we should find these frequencies.
for
(
int
i
=
0
;
i
<
input_size
;
i
++
)
{
data
[
i
]
=
std
::
complex
<
double
>
(
0.0
,
0.0
);
for
(
auto
frequencie
:
known_frequencies
)
{
data
[
i
]
+=
sin
(
2
*
M_PI
*
frequencie
*
i
/
input_size
);
}
}
// Actual 'invoke_parallel' logic/code
int
left
,
right
;
pls
::
invoke_parallel
(
[
&
]
{
left
=
fib
(
n
-
1
);
},
[
&
]
{
right
=
fib
(
n
-
2
);
}
);
return
left
+
right
;
return
data
;
}
int
main
()
{
PROFILE_ENABLE
pls
::
malloc_scheduler_memory
my_scheduler_memory
{
8
,
2u
<<
14
};
pls
::
scheduler
scheduler
{
&
my_scheduler_memory
,
8
};
long
result
;
complex_vector
initial_input
=
prepare_input
(
INPUT_SIZE
)
;
scheduler
.
perform_work
([
&
]
{
PROFILE_MAIN_THREAD
// Call looks just the same, only requirement is
// the enclosure in the perform_work lambda.
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
result
=
fib
(
30
);
std
::
cout
<<
"Fib(30)="
<<
result
<<
std
::
endl
;
PROFILE_WORK_BLOCK
(
"Top Level FFT"
)
complex_vector
input
=
initial_input
;
fft
(
input
.
begin
(),
input
.
size
());
}
});
...
...
app/playground/main.cpp
View file @
bd826491
...
...
@@ -11,8 +11,5 @@
#include <pls/internal/helpers/unique_id.h>
int
main
()
{
std
::
cout
<<
pls
::
internal
::
scheduling
::
root_task
<
void
(
*
)
>::
create_id
().
type_
.
hash_code
()
<<
std
::
endl
;
std
::
cout
<<
pls
::
internal
::
helpers
::
unique_id
::
create
<
pls
::
internal
::
scheduling
::
root_task
<
void
(
*
)
>>
().
type_
.
hash_code
()
<<
std
::
endl
;
}
lib/pls/CMakeLists.txt
View file @
bd826491
...
...
@@ -20,6 +20,7 @@ add_library(pls STATIC
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/aligned_stack_impl.h
include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
include/pls/internal/data_structures/work_stealing_deque.h
include/pls/internal/helpers/prohibit_new.h
include/pls/internal/helpers/profiler.h
...
...
lib/pls/include/pls/internal/base/alignment.h
View file @
bd826491
...
...
@@ -19,10 +19,32 @@ struct aligned_wrapper {
};
void
*
allocate_aligned
(
size_t
size
);
std
::
uintptr_t
next_alignment
(
std
::
uintptr_t
size
);
system_details
::
pointer_t
next_alignment
(
system_details
::
pointer_t
size
);
system_details
::
pointer_t
previous_alignment
(
system_details
::
pointer_t
size
);
char
*
next_alignment
(
char
*
pointer
);
}
/**
 * Packs a pointer and a small ABA counter into one pointer-sized integer.
 *
 * The counter lives in the bits selected by system_details::CACHE_LINE_ADDRESS_UNUSED_BITS,
 * the address in the bits selected by system_details::CACHE_LINE_ADDRESS_USED_BITS.
 * NOTE(review): this presumes the passed pointer has all of its counter bits cleared
 * (i.e. is aligned accordingly) - confirm with the callers.
 */
template<typename T>
struct aligned_aba_pointer {
  // Combined representation: address bits plus counter bits.
  const system_details::pointer_t pointer_;

  /**
   * @param pointer Address to store.
   * @param aba Counter value to mix in. Only the bits that fit into the counter
   *            area are kept, so a growing counter wraps around instead of
   *            spilling into (and corrupting) the address bits.
   */
  explicit aligned_aba_pointer(T *pointer, unsigned int aba = 0) :
      pointer_{reinterpret_cast<system_details::pointer_t>(pointer)
                   + (aba & system_details::CACHE_LINE_ADDRESS_UNUSED_BITS)} {}

  /** @return The stored address with the counter bits masked out. */
  T *pointer() const {
    return reinterpret_cast<T *>(pointer_ & system_details::CACHE_LINE_ADDRESS_USED_BITS);
  }

  /** @return The stored counter value (address bits masked out). */
  unsigned int aba() const {
    return static_cast<unsigned int>(pointer_ & system_details::CACHE_LINE_ADDRESS_UNUSED_BITS);
  }

  /** @return A new value holding the same address combined with the given counter. */
  aligned_aba_pointer set_aba(unsigned int aba) const {
    return aligned_aba_pointer(pointer(), aba);
  }
};
}
}
}
...
...
lib/pls/include/pls/internal/base/backoff.h
View file @
bd826491
...
...
@@ -14,8 +14,8 @@ namespace internal {
namespace
base
{
class
backoff
{
static
constexpr
unsigned
long
INITIAL_SPIN_ITERS
=
2u
<<
2u
;
static
constexpr
unsigned
long
MAX_SPIN_ITERS
=
2u
<<
6u
;
const
unsigned
long
INITIAL_SPIN_ITERS
=
2u
<<
2u
;
const
unsigned
long
MAX_SPIN_ITERS
=
2u
<<
6u
;
unsigned
long
current_
=
INITIAL_SPIN_ITERS
;
std
::
minstd_rand
random_
;
...
...
lib/pls/include/pls/internal/base/error_handling.h
View file @
bd826491
...
...
@@ -11,5 +11,6 @@
* (or its inclusion adds too much overhead).
*/
// Prints msg to std::cout and terminates the process with exit code 1.
// The statements are wrapped in braces so the macro expands to ONE statement;
// otherwise `if (x) PLS_ERROR("...")` would run exit(1) unconditionally.
#define PLS_ERROR(msg) { std::cout << msg << std::endl; exit(1); }
// Invokes PLS_ERROR(msg) if cond evaluates to false.
// cond is parenthesized so that compound expressions (e.g. `a || b`, `x + 1`)
// are negated as a whole instead of only their first operand.
#define PLS_ASSERT(cond, msg) if (!(cond)) { PLS_ERROR(msg) }
#endif //PLS_ERROR_HANDLING_H
lib/pls/include/pls/internal/base/system_details.h
View file @
bd826491
...
...
@@ -18,10 +18,32 @@ namespace base {
* Currently sane default values for x86.
*/
namespace
system_details
{
/**
* Pointer Types needed for ABA protection mixed into addresses.
* pointer_t should be an integer type capable of holding ANY pointer value.
*/
using
pointer_t
=
std
::
uintptr_t
;
constexpr
pointer_t
ZERO_POINTER
=
0
;
constexpr
pointer_t
MAX_POINTER
=
~
ZERO_POINTER
;
/**
* Biggest type that supports atomic CAS operations.
* Usually it is sane to assume a pointer can be swapped in a single CAS operation.
*/
using
cas_integer
=
pointer_t
;
constexpr
cas_integer
MIN_CAS_INTEGER
=
0
;
constexpr
cas_integer
MAX_CAS_INTEGER
=
~
MIN_CAS_INTEGER
;
constexpr
cas_integer
FIRST_HALF_CAS_INTEGER
=
MAX_CAS_INTEGER
<<
((
sizeof
(
cas_integer
)
/
2
)
*
8
);
constexpr
cas_integer
SECOND_HALF_CAS_INTEGER
=
~
FIRST_HALF_CAS_INTEGER
;
/**
* Most processors have 64 byte cache lines
* Most processors have 64 byte cache lines
(last 6 bit of the address are zero at line beginnings).
*/
constexpr
std
::
uintptr_t
CACHE_LINE_SIZE
=
64
;
constexpr
unsigned
int
CACHE_LINE_ADDRESS_BITS
=
6
;
constexpr
pointer_t
CACHE_LINE_SIZE
=
2u
<<
(
CACHE_LINE_ADDRESS_BITS
-
1
);
constexpr
pointer_t
CACHE_LINE_ADDRESS_USED_BITS
=
MAX_POINTER
<<
CACHE_LINE_ADDRESS_BITS
;
constexpr
pointer_t
CACHE_LINE_ADDRESS_UNUSED_BITS
=
~
CACHE_LINE_ADDRESS_USED_BITS
;
/**
* Choose one of the following ways to store thread specific data.
...
...
lib/pls/include/pls/internal/base/ttas_spin_lock.h
View file @
bd826491
...
...
@@ -19,11 +19,10 @@ namespace base {
*/
class
ttas_spin_lock
{
std
::
atomic
<
int
>
flag_
;
backoff
backoff_
;
public
:
ttas_spin_lock
()
:
flag_
{
0
}
,
backoff_
{}
{};
ttas_spin_lock
(
const
ttas_spin_lock
&
/*other*/
)
:
flag_
{
0
}
,
backoff_
{}
{}
ttas_spin_lock
()
:
flag_
{
0
}
{};
ttas_spin_lock
(
const
ttas_spin_lock
&
/*other*/
)
:
flag_
{
0
}
{}
void
lock
();
bool
try_lock
(
unsigned
int
num_tries
=
1
);
...
...
lib/pls/include/pls/internal/data_structures/aligned_stack.h
View file @
bd826491
...
...
@@ -12,6 +12,8 @@ namespace pls {
namespace
internal
{
namespace
data_structures
{
using
base
::
system_details
::
pointer_t
;
/**
* Generic stack-like data structure that allows to allocate arbitrary objects in a given memory region.
* The objects will be stored aligned in the stack, making the storage cache friendly and very fast
...
...
@@ -26,15 +28,16 @@ namespace data_structures {
*/
class
aligned_stack
{
// Keep bounds of our memory block
char
*
memory_start_
;
char
*
memory_end_
;
pointer_t
memory_start_
;
pointer_t
memory_end_
;
// Current head will always be aligned to cache lines
char
*
head_
;
pointer_t
head_
;
public
:
typedef
char
*
state
;
typedef
pointer_t
state
;
aligned_stack
()
:
memory_start_
{
nullptr
},
memory_end_
{
nullptr
},
head_
{
nullptr
}
{};
aligned_stack
()
:
memory_start_
{
0
},
memory_end_
{
0
},
head_
{
0
}
{};
aligned_stack
(
pointer_t
memory_region
,
std
::
size_t
size
);
aligned_stack
(
char
*
memory_region
,
std
::
size_t
size
);
template
<
typename
T
>
...
...
@@ -48,7 +51,6 @@ class aligned_stack {
void
reset_state
(
state
new_state
)
{
head_
=
new_state
;
}
};
}
}
}
...
...
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
View file @
bd826491
...
...
@@ -9,7 +9,7 @@ namespace data_structures {
template
<
typename
T
>
T
*
aligned_stack
::
push
(
const
T
&
object
)
{
// Copy-Construct
return
new
(
(
void
*
)
push
<
T
>
())
T
(
object
);
return
new
(
push
<
T
>
())
T
(
object
);
}
template
<
typename
T
>
...
...
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h
0 → 100644
View file @
bd826491
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/fork_join_task.h
View file @
bd826491
...
...
@@ -5,7 +5,7 @@
#include "pls/internal/helpers/profiler.h"
#include "pls/internal/data_structures/aligned_stack.h"
#include "pls/internal/data_structures/deque.h"
#include "pls/internal/data_structures/
work_stealing_
deque.h"
#include "abstract_task.h"
#include "thread_state.h"
...
...
@@ -15,7 +15,7 @@ namespace internal {
namespace
scheduling
{
class
fork_join_task
;
class
fork_join_sub_task
:
public
data_structures
::
deque_item
{
class
fork_join_sub_task
{
friend
class
fork_join_task
;
// Coordinate finishing of sub_tasks
...
...
@@ -25,8 +25,11 @@ class fork_join_sub_task : public data_structures::deque_item {
// Access to TBB scheduling environment
fork_join_task
*
tbb_task_
;
bool
executed
=
false
;
int
executed_at
=
-
1
;
// Stack Management (reset stack pointer after wait_for_all() calls)
data_structures
::
aligned_stack
::
state
stack
_state_
;
data_structures
::
work_stealing_deque
<
fork_join_sub_task
>::
state
deque
_state_
;
protected
:
explicit
fork_join_sub_task
();
fork_join_sub_task
(
const
fork_join_sub_task
&
other
);
...
...
@@ -37,11 +40,10 @@ class fork_join_sub_task : public data_structures::deque_item {
public
:
// Only use them when actually executing this sub_task (only public for simpler API design)
template
<
typename
T
>
void
spawn_child
(
const
T
&
sub_task
);
void
spawn_child
(
T
&
sub_task
);
void
wait_for_all
();
private
:
void
spawn_child_internal
(
fork_join_sub_task
*
sub_task
);
void
execute
();
};
...
...
@@ -50,7 +52,7 @@ class fork_join_lambda_by_reference : public fork_join_sub_task {
const
Function
*
function_
;
public
:
explicit
fork_join_lambda_by_reference
(
const
Function
*
function
)
:
function_
{
function
}
{};
explicit
fork_join_lambda_by_reference
(
const
Function
*
function
)
:
f
ork_join_sub_task
{},
f
unction_
{
function
}
{};
protected
:
void
execute_internal
()
override
{
...
...
@@ -63,7 +65,7 @@ class fork_join_lambda_by_value : public fork_join_sub_task {
const
Function
function_
;
public
:
explicit
fork_join_lambda_by_value
(
const
Function
&
function
)
:
function_
{
function
}
{};
explicit
fork_join_lambda_by_value
(
const
Function
&
function
)
:
f
ork_join_sub_task
{},
f
unction_
{
function
}
{};
protected
:
void
execute_internal
()
override
{
...
...
@@ -76,10 +78,9 @@ class fork_join_task : public abstract_task {
fork_join_sub_task
*
root_task_
;
fork_join_sub_task
*
currently_executing_
;
data_structures
::
aligned_stack
*
my_stack_
;
// Double-Ended Queue management
data_structures
::
deque
<
fork_join_sub_task
>
deque_
;
data_structures
::
work_stealing_
deque
<
fork_join_sub_task
>
deque_
;
// Steal Management
fork_join_sub_task
*
last_stolen_
;
...
...
@@ -97,12 +98,21 @@ class fork_join_task : public abstract_task {
};
template
<
typename
T
>
void
fork_join_sub_task
::
spawn_child
(
const
T
&
task
)
{
void
fork_join_sub_task
::
spawn_child
(
T
&
task
)
{
PROFILE_FORK_JOIN_STEALING
(
"spawn_child"
)
static_assert
(
std
::
is_base_of
<
fork_join_sub_task
,
T
>::
value
,
"Only pass fork_join_sub_task subclasses!"
);
T
*
new_task
=
tbb_task_
->
my_stack_
->
push
(
task
);
spawn_child_internal
(
new_task
);
// Keep our refcount up to date
ref_count_
++
;
// Assign forced values
task
.
parent_
=
this
;
task
.
tbb_task_
=
tbb_task_
;
task
.
deque_state_
=
tbb_task_
->
deque_
.
save_state
();
// Push on our deque
const
T
const_task
=
task
;
tbb_task_
->
deque_
.
push_tail
(
const_task
);
}
}
...
...
lib/pls/src/internal/base/alignment.cpp
View file @
bd826491
...
...
@@ -10,8 +10,8 @@ void *allocate_aligned(size_t size) {
return
aligned_alloc
(
system_details
::
CACHE_LINE_SIZE
,
size
);
}
s
td
::
uintptr_t
next_alignment
(
std
::
uintpt
r_t
size
)
{
s
td
::
uintpt
r_t
miss_alignment
=
size
%
base
::
system_details
::
CACHE_LINE_SIZE
;
s
ystem_details
::
pointer_t
next_alignment
(
system_details
::
pointe
r_t
size
)
{
s
ystem_details
::
pointe
r_t
miss_alignment
=
size
%
base
::
system_details
::
CACHE_LINE_SIZE
;
if
(
miss_alignment
==
0
)
{
return
size
;
}
else
{
...
...
@@ -19,8 +19,17 @@ std::uintptr_t next_alignment(std::uintptr_t size) {
}
}
/**
 * Rounds the given value down to a multiple of base::system_details::CACHE_LINE_SIZE.
 *
 * @param size Value to round down.
 * @return size itself if it already is a multiple of the cache line size,
 *         otherwise size reduced by its remainder modulo the cache line size.
 */
system_details::pointer_t previous_alignment(system_details::pointer_t size) {
  const system_details::pointer_t overshoot = size % base::system_details::CACHE_LINE_SIZE;
  return overshoot == 0 ? size : size - overshoot;
}
char
*
next_alignment
(
char
*
pointer
)
{
return
reinterpret_cast
<
char
*>
(
next_alignment
(
reinterpret_cast
<
s
td
::
uintpt
r_t
>
(
pointer
)));
return
reinterpret_cast
<
char
*>
(
next_alignment
(
reinterpret_cast
<
s
ystem_details
::
pointe
r_t
>
(
pointer
)));
}
}
...
...
lib/pls/src/internal/base/swmr_spin_lock.cpp
View file @
bd826491
...
...
@@ -23,22 +23,22 @@ bool swmr_spin_lock::reader_try_lock() {
void
swmr_spin_lock
::
reader_unlock
()
{
PROFILE_LOCK
(
"Release Read Lock"
)
readers_
.
fetch_add
(
-
1
,
std
::
memory_order_release
)
;
readers_
--
;
}
void
swmr_spin_lock
::
writer_lock
()
{
PROFILE_LOCK
(
"Acquire Write Lock"
)
// Tell the readers that we would like to write
write_request_
.
store
(
1
,
std
::
memory_order_acquire
)
;
write_request_
=
1
;
// Wait for all of them to exit the critical section
while
(
readers_
.
load
(
std
::
memory_order_acquire
)
>
0
)
while
(
readers_
>
0
)
system_details
::
relax_cpu
();
// Spin, not expensive as relaxed load
}
void
swmr_spin_lock
::
writer_unlock
()
{
PROFILE_LOCK
(
"Release Write Lock"
)
write_request_
.
store
(
0
,
std
::
memory_order_release
)
;
write_request_
=
0
;
}
}
...
...
lib/pls/src/internal/base/ttas_spin_lock.cpp
View file @
bd826491
...
...
@@ -9,7 +9,7 @@ namespace base {
void
ttas_spin_lock
::
lock
()
{
PROFILE_LOCK
(
"Acquire Lock"
)
int
expected
=
0
;
backoff
_
.
reset
()
;
backoff
backoff_
;
while
(
true
)
{
while
(
flag_
.
load
(
std
::
memory_order_relaxed
)
==
1
)
...
...
@@ -26,7 +26,7 @@ void ttas_spin_lock::lock() {
bool
ttas_spin_lock
::
try_lock
(
unsigned
int
num_tries
)
{
PROFILE_LOCK
(
"Try Acquire Lock"
)
int
expected
=
0
;
backoff
_
.
reset
()
;
backoff
backoff_
;
while
(
true
)
{
while
(
flag_
.
load
()
==
1
)
{
...
...
lib/pls/src/internal/data_structures/aligned_stack.cpp
View file @
bd826491
...
...
@@ -5,11 +5,16 @@ namespace pls {
namespace
internal
{
namespace
data_structures
{
aligned_stack
::
aligned_stack
(
char
*
memory_region
,
const
std
::
size_t
size
)
:
aligned_stack
::
aligned_stack
(
pointer_t
memory_region
,
const
std
::
size_t
size
)
:
memory_start_
{
memory_region
},
memory_end_
{
memory_region
+
size
},
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
aligned_stack
::
aligned_stack
(
char
*
memory_region
,
const
std
::
size_t
size
)
:
memory_start_
{(
pointer_t
)
memory_region
},
memory_end_
{(
pointer_t
)
memory_region
+
size
},
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
}
}
}
lib/pls/src/internal/data_structures/deque.cpp
View file @
bd826491
...
...
@@ -14,11 +14,11 @@ deque_item *deque_internal::pop_head_internal() {
}
deque_item
*
result
=
head_
;
head_
=
head_
->
prev
_
;
head_
=
head_
->
next
_
;
if
(
head_
==
nullptr
)
{
tail_
=
nullptr
;
}
else
{
head_
->
next
_
=
nullptr
;
head_
->
prev
_
=
nullptr
;
}
return
result
;
...
...
@@ -32,11 +32,11 @@ deque_item *deque_internal::pop_tail_internal() {
}
deque_item
*
result
=
tail_
;
tail_
=
tail_
->
next
_
;
tail_
=
tail_
->
prev
_
;
if
(
tail_
==
nullptr
)
{
head_
=
nullptr
;
}
else
{
tail_
->
prev
_
=
nullptr
;
tail_
->
next
_
=
nullptr
;
}
return
result
;
...
...
@@ -46,12 +46,12 @@ void deque_internal::push_tail_internal(deque_item *new_item) {
std
::
lock_guard
<
base
::
spin_lock
>
lock
{
lock_
};
if
(
tail_
!=
nullptr
)
{
tail_
->
prev
_
=
new_item
;
tail_
->
next
_
=
new_item
;
}
else
{
head_
=
new_item
;
}
new_item
->
next
_
=
tail_
;
new_item
->
prev
_
=
nullptr
;
new_item
->
prev
_
=
tail_
;
new_item
->
next
_
=
nullptr
;
tail_
=
new_item
;
}
...
...
lib/pls/src/internal/scheduling/fork_join_task.cpp
View file @
bd826491
...
...
@@ -8,22 +8,26 @@ namespace internal {
namespace
scheduling
{
fork_join_sub_task
::
fork_join_sub_task
()
:
data_structures
::
deque_item
{},
ref_count_
{
0
},
parent_
{
nullptr
},
tbb_task_
{
nullptr
},
stack_state_
{
nullptr
}
{}
deque_state_
{
0
}
{}
fork_join_sub_task
::
fork_join_sub_task
(
const
fork_join_sub_task
&
other
)
:
data_structures
::
deque_item
(
other
),
ref_count_
{
0
},
parent_
{
nullptr
},
tbb_task_
{
nullptr
},
stack_state_
{
nullptr
}
{}
parent_
{
other
.
parent_
},
tbb_task_
{
other
.
tbb_task_
},
deque_state_
{
other
.
deque_state_
}
{}
void
fork_join_sub_task
::
execute
()
{
PROFILE_WORK_BLOCK
(
"execute sub_task"
)
tbb_task_
->
currently_executing_
=
this
;
if
(
executed
)
{
int
my_id
=
base
::
this_thread
::
state
<
thread_state
>
()
->
id_
;
PLS_ERROR
(
"Double Execution!"
)
}
executed
=
true
;
executed_at
=
base
::
this_thread
::
state
<
thread_state
>
()
->
id_
;
execute_internal
();
tbb_task_
->
currently_executing_
=
nullptr
;
PROFILE_END_BLOCK
...
...
@@ -34,18 +38,6 @@ void fork_join_sub_task::execute() {
}
}
void
fork_join_sub_task
::
spawn_child_internal
(
fork_join_sub_task
*
sub_task
)
{
// Keep our refcount up to date
ref_count_
++
;
// Assign forced values
sub_task
->
parent_
=
this
;
sub_task
->
tbb_task_
=
tbb_task_
;
sub_task
->
stack_state_
=
tbb_task_
->
my_stack_
->
save_state
();
tbb_task_
->
deque_
.
push_tail
(
sub_task
);
}
void
fork_join_sub_task
::
wait_for_all
()
{
while
(
ref_count_
>
0
)
{
PROFILE_STEALING
(
"get local sub task"
)
...
...
@@ -54,19 +46,17 @@ void fork_join_sub_task::wait_for_all() {
if
(
local_task
!=
nullptr
)
{
local_task
->
execute
();
}
else
{
while
(
ref_count_
>
0
)
{
// Try to steal work.
// External steal will be executed implicitly if success
PROFILE_STEALING
(
"steal work"
)
bool
internal_steal_success
=
tbb_task_
->
steal_work
();
PROFILE_END_BLOCK
if
(
internal_steal_success
)
{
tbb_task_
->
last_stolen_
->
execute
();
}
// Try to steal work.
// External steal will be executed implicitly if success
PROFILE_STEALING
(
"steal work"
)
bool
internal_steal_success
=
tbb_task_
->
steal_work
();
PROFILE_END_BLOCK
if
(
internal_steal_success
)
{
tbb_task_
->
last_stolen_
->
execute
();
}
}
}
tbb_task_
->
my_stack_
->
reset_state
(
stack
_state_
);
tbb_task_
->
deque_
.
release_memory_until
(
deque
_state_
);
}
fork_join_sub_task
*
fork_join_task
::
get_local_sub_task
()
{
...
...
@@ -74,7 +64,9 @@ fork_join_sub_task *fork_join_task::get_local_sub_task() {
}
fork_join_sub_task
*
fork_join_task
::
get_stolen_sub_task
()
{
return
deque_
.
pop_head
();
auto
tmp
=
deque_
.
save_state
();
auto
result
=
deque_
.
pop_head
();
return
result
;
}
bool
fork_join_task
::
internal_stealing
(
abstract_task
*
other_task
)
{
...
...
@@ -87,7 +79,7 @@ bool fork_join_task::internal_stealing(abstract_task *other_task) {
}
else
{
// Make sub-task belong to our fork_join_task instance
stolen_sub_task
->
tbb_task_
=
this
;
stolen_sub_task
->
stack_state_
=
my_stack_
->
save_state
();
stolen_sub_task
->
deque_state_
=
deque_
.
save_state
();
// We will execute this next without explicitly moving it onto our stack storage
last_stolen_
=
stolen_sub_task
;
...
...
@@ -114,9 +106,12 @@ void fork_join_task::execute() {
PROFILE_WORK_BLOCK
(
"execute fork_join_task"
);
// Bind this instance to our OS thread
my_stack_
=
base
::
this_thread
::
state
<
thread_state
>
()
->
task_stack_
;
// TODO: See if we did this right
// my_stack_ = base::this_thread::state<thread_state>()->task_stack_;
deque_
.
reset_base_pointer
();
root_task_
->
tbb_task_
=
this
;
root_task_
->
stack_state_
=
my_stack_
->
save_state
();
root_task_
->
deque_state_
=
deque_
.
save_state
();
// Execute it on our OS thread until its finished
root_task_
->
execute
();
...
...
@@ -124,12 +119,12 @@ void fork_join_task::execute() {
// Accessor: returns the sub-task pointer currently stored in currently_executing_.
fork_join_sub_task
*
fork_join_task
::
currently_executing
()
const
{
return
currently_executing_
;
}
fork_join_task
::
fork_join_task
(
fork_join_sub_task
*
root_task
,
const
abstract_task
::
id
&
id
)
:
fork_join_task
::
fork_join_task
(
fork_join_sub_task
*
root_task
,
const
abstract_task
::
id
&
id
)
:
abstract_task
{
0
,
id
},
root_task_
{
root_task
},
currently_executing_
{
nullptr
},
my_stack_
{
nullptr
},
deque_
{},
deque_
{
base
::
this_thread
::
state
<
thread_state
>
()
->
task_stack_
},
last_stolen_
{
nullptr
}
{}
}
...
...
test/CMakeLists.txt
View file @
bd826491
add_executable
(
tests
main.cpp
base_tests.cpp scheduling_tests.cpp
data_structures_test.cpp
)
data_structures_test.cpp
)
target_link_libraries
(
tests catch2 pls
)
test/data_structures_test.cpp
View file @
bd826491
...
...
@@ -4,6 +4,7 @@
#include <pls/internal/data_structures/aligned_stack.h>
#include <pls/internal/data_structures/deque.h>
#include <pls/internal/data_structures/work_stealing_deque.h>
#include <vector>
#include <mutex>
...
...
@@ -130,3 +131,90 @@ TEST_CASE("deque stores objects correctly", "[internal/data_structures/deque.h]"
REQUIRE
(
deque
.
pop_tail
()
==
&
three
);
}
}
// Single-threaded checks of work_stealing_deque: items are pushed at the tail and
// popped from either end. The deque is constructed on top of an aligned_stack
// that wraps a local buffer.
// The tag names the header under test (work_stealing_deque.h) so that tag based
// test selection picks this case up together with the right component.
TEST_CASE("work stealing deque stores objects correctly", "[internal/data_structures/work_stealing_deque.h]") {
  constexpr long data_size = 2 << 14;
  char data[data_size];
  aligned_stack stack{data, data_size};
  work_stealing_deque<int> deque{&stack};

  int one = 1, two = 2, three = 3, four = 4;

  SECTION("add and remove items form the tail") {
    deque.push_tail(one);
    deque.push_tail(two);
    deque.push_tail(three);

    // Tail pops return the most recently pushed item first.
    REQUIRE(*deque.pop_tail() == three);
    REQUIRE(*deque.pop_tail() == two);
    REQUIRE(*deque.pop_tail() == one);
  }

  SECTION("handles getting empty by popping the tail correctly") {
    deque.push_tail(one);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_tail() == two);
  }

  SECTION("remove items form the head") {
    deque.push_tail(one);
    deque.push_tail(two);
    deque.push_tail(three);

    // Head pops return the items in the order they were pushed.
    REQUIRE(*deque.pop_head() == one);
    REQUIRE(*deque.pop_head() == two);
    REQUIRE(*deque.pop_head() == three);
  }

  SECTION("handles getting empty by popping the head correctly") {
    deque.push_tail(one);
    REQUIRE(*deque.pop_head() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_head() == two);
  }

  SECTION("handles getting empty by popping the head and tail correctly") {
    deque.push_tail(one);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_head() == two);

    deque.push_tail(three);
    REQUIRE(*deque.pop_tail() == three);
  }

  SECTION("handles jumps bigger 1 correctly") {
    deque.push_tail(one);
    deque.push_tail(two);
    REQUIRE(*deque.pop_tail() == two);

    deque.push_tail(three);
    deque.push_tail(four);
    // The head pops are expected to skip the slot of the already popped 'two'.
    REQUIRE(*deque.pop_head() == one);
    REQUIRE(*deque.pop_head() == three);
    REQUIRE(*deque.pop_head() == four);
  }

  SECTION("handles stack reset 1 correctly when emptied by tail") {
    deque.push_tail(one);
    deque.push_tail(two);
    auto tmp_result = deque.pop_tail();
    REQUIRE(*tmp_result == two);

    deque.release_memory_until(tmp_result);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(three);
    deque.push_tail(four);
    REQUIRE(*deque.pop_head() == three);
    REQUIRE(*deque.pop_tail() == four);
  }

  SECTION("synces correctly") {
    // TODO: no checks implemented yet (section body is empty in this WIP commit).
  }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment