las3_pub / predictable_parallel_patterns
Commit bd826491, authored 6 years ago by FritzFlorian
WIP: lock-free work stealing deque based on our stack.

Parent: d16ad3eb
Pipeline #1159 passed with stages in 3 minutes 37 seconds
Showing 21 changed files with 301 additions and 98 deletions
app/benchmark_fft/main.cpp                                            +1   -1
app/invoke_parallel/main.cpp                                         +72  -20
app/playground/main.cpp                                               +1   -4
lib/pls/CMakeLists.txt                                                +1   -0
lib/pls/include/pls/internal/base/alignment.h                        +23   -1
lib/pls/include/pls/internal/base/backoff.h                           +2   -2
lib/pls/include/pls/internal/base/error_handling.h                    +1   -0
lib/pls/include/pls/internal/base/system_details.h                   +24   -2
lib/pls/include/pls/internal/base/ttas_spin_lock.h                    +2   -3
lib/pls/include/pls/internal/data_structures/aligned_stack.h          +8   -6
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h     +1   -1
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h    +0   -0
lib/pls/include/pls/internal/scheduling/fork_join_task.h             +22  -12
lib/pls/src/internal/base/alignment.cpp                              +12   -3
lib/pls/src/internal/base/swmr_spin_lock.cpp                          +4   -4
lib/pls/src/internal/base/ttas_spin_lock.cpp                          +2   -2
lib/pls/src/internal/data_structures/aligned_stack.cpp                +6   -1
lib/pls/src/internal/data_structures/deque.cpp                        +7   -7
lib/pls/src/internal/scheduling/fork_join_task.cpp                   +23  -28
test/CMakeLists.txt                                                   +1   -1
test/data_structures_test.cpp                                        +88   -0
app/benchmark_fft/main.cpp

@@ -6,7 +6,7 @@
 #include <complex>
 #include <vector>
 
-static constexpr int CUTOFF = 10;
+static constexpr int CUTOFF = 16;
 static constexpr int NUM_ITERATIONS = 1000;
 static constexpr int INPUT_SIZE = 2064;
 typedef std::vector<std::complex<double>> complex_vector;
app/invoke_parallel/main.cpp

@@ -2,48 +2,100 @@
 #include <pls/internal/helpers/profiler.h>
 #include <iostream>
+#include <complex>
+#include <vector>
 
-static pls::static_scheduler_memory<8, 2 << 14> my_scheduler_memory;
+static constexpr int CUTOFF = 16;
+static constexpr int NUM_ITERATIONS = 1000;
+static constexpr int INPUT_SIZE = 2064;
+typedef std::vector<std::complex<double>> complex_vector;
 
-static constexpr int CUTOFF = 10;
-
-long fib_serial(long n) {
-  if (n == 0) {
-    return 0;
-  }
-  if (n == 1) {
-    return 1;
-  }
-
-  return fib_serial(n - 1) + fib_serial(n - 2);
-}
+void divide(complex_vector::iterator data, int n) {
+  complex_vector tmp_odd_elements(n / 2);
+  for (int i = 0; i < n / 2; i++) {
+    tmp_odd_elements[i] = data[i * 2 + 1];
+  }
+  for (int i = 0; i < n / 2; i++) {
+    data[i] = data[i * 2];
+  }
+  for (int i = 0; i < n / 2; i++) {
+    data[i + n / 2] = tmp_odd_elements[i];
+  }
+}
+
+void combine(complex_vector::iterator data, int n) {
+  for (int i = 0; i < n / 2; i++) {
+    std::complex<double> even = data[i];
+    std::complex<double> odd = data[i + n / 2];
+
+    // w is the "twiddle-factor".
+    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
+    // so it won't impact the performance comparison.
+    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
+
+    data[i] = even + w * odd;
+    data[i + n / 2] = even - w * odd;
+  }
+}
 
-long fib(long n) {
-  if (n <= CUTOFF) {
-    return fib_serial(n);
-  }
-
-  // Actual 'invoke_parallel' logic/code
-  int left, right;
-  pls::invoke_parallel(
-      [&] { left = fib(n - 1); },
-      [&] { right = fib(n - 2); }
-  );
-  return left + right;
-}
+void fft(complex_vector::iterator data, int n) {
+  if (n < 2) {
+    return;
+  }
+
+  PROFILE_WORK_BLOCK("Divide")
+  divide(data, n);
+  PROFILE_END_BLOCK
+  PROFILE_WORK_BLOCK("Invoke Parallel")
+  if (n == CUTOFF) {
+    PROFILE_WORK_BLOCK("FFT Serial")
+    fft(data, n / 2);
+    fft(data + n / 2, n / 2);
+  } else if (n <= CUTOFF) {
+    fft(data, n / 2);
+    fft(data + n / 2, n / 2);
+  } else {
+    pls::invoke_parallel(
+        [&] { fft(data, n / 2); },
+        [&] { fft(data + n / 2, n / 2); }
+    );
+  }
+  PROFILE_END_BLOCK
+  PROFILE_WORK_BLOCK("Combine")
+  combine(data, n);
+  PROFILE_END_BLOCK
+}
+
+complex_vector prepare_input(int input_size) {
+  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+  complex_vector data(input_size);
+
+  // Set our input data to match a time series of the known_frequencies.
+  // When applying fft to this time-series we should find these frequencies.
+  for (int i = 0; i < input_size; i++) {
+    data[i] = std::complex<double>(0.0, 0.0);
+    for (auto frequencie : known_frequencies) {
+      data[i] += sin(2 * M_PI * frequencie * i / input_size);
+    }
+  }
+
+  return data;
+}
 
 int main() {
   PROFILE_ENABLE
+  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 14};
   pls::scheduler scheduler{&my_scheduler_memory, 8};
 
-  long result;
+  complex_vector initial_input = prepare_input(INPUT_SIZE);
   scheduler.perform_work([&] {
     PROFILE_MAIN_THREAD
     // Call looks just the same, only requirement is
     // the enclosure in the perform_work lambda.
     for (int i = 0; i < 10; i++) {
-      result = fib(30);
-      std::cout << "Fib(30)=" << result << std::endl;
+      PROFILE_WORK_BLOCK("Top Level FFT")
+      complex_vector input = initial_input;
+      fft(input.begin(), input.size());
     }
   });
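
The divide/combine pair in the new app/invoke_parallel/main.cpp is one level of the radix-2 Cooley-Tukey recursion; spelling the butterfly out makes the "twiddle-factor" comment concrete. With E_k and O_k the length-n/2 DFTs of the even- and odd-indexed samples,

    X_k       = E_k + e^{-2\pi i k / n} O_k
    X_{k+n/2} = E_k - e^{-2\pi i k / n} O_k        (k = 0, ..., n/2 - 1)

which is exactly data[i] = even + w * odd and data[i + n/2] = even - w * odd, with w = exp(std::complex<double>(0, -2. * M_PI * i / n)) playing the role of e^{-2\pi i k / n}.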
app/playground/main.cpp

@@ -11,8 +11,5 @@
 #include <pls/internal/helpers/unique_id.h>
 
 int main() {
-  std::cout << pls::internal::scheduling::root_task<void (*)>::create_id().type_.hash_code() << std::endl;
+  std::cout << pls::internal::helpers::unique_id::create<pls::internal::scheduling::root_task<void (*)>>().type_.hash_code() << std::endl;
 }
lib/pls/CMakeLists.txt

@@ -20,6 +20,7 @@ add_library(pls STATIC
         include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
         include/pls/internal/data_structures/aligned_stack_impl.h
         include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
+        include/pls/internal/data_structures/work_stealing_deque.h
 
         include/pls/internal/helpers/prohibit_new.h
         include/pls/internal/helpers/profiler.h
lib/pls/include/pls/internal/base/alignment.h

@@ -19,10 +19,32 @@ struct aligned_wrapper {
 };
 
 void* allocate_aligned(size_t size);
 
-std::uintptr_t next_alignment(std::uintptr_t size);
+system_details::pointer_t next_alignment(system_details::pointer_t size);
+system_details::pointer_t previous_alignment(system_details::pointer_t size);
 char* next_alignment(char* pointer);
 }
 
+template<typename T>
+struct aligned_aba_pointer {
+  const system_details::pointer_t pointer_;
+
+  explicit aligned_aba_pointer(T* pointer, unsigned int aba = 0) : pointer_{
+      reinterpret_cast<system_details::pointer_t>(pointer) + aba} {}
+
+  T* pointer() const {
+    return reinterpret_cast<T*>(pointer_ & system_details::CACHE_LINE_ADDRESS_USED_BITS);
+  }
+
+  unsigned int aba() const {
+    return pointer_ & system_details::CACHE_LINE_ADDRESS_UNUSED_BITS;
+  }
+
+  aligned_aba_pointer set_aba(unsigned int aba) const {
+    return aligned_aba_pointer(pointer(), aba);
+  }
+};
 }
 }
 }
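
Because every allocation is cache-line aligned, the low CACHE_LINE_ADDRESS_BITS of an object's address are always zero, and aligned_aba_pointer stores an ABA tag there. A minimal round-trip sketch (the node type and tag values are illustrative, not part of the commit):

    #include "pls/internal/base/alignment.h"

    using namespace pls::internal::base;

    struct node { int value; };  // hypothetical payload

    void example() {
      node* n = static_cast<node*>(alignment::allocate_aligned(sizeof(node)));

      aligned_aba_pointer<node> tagged{n, 3};  // the tag must fit in the 6 unused low bits
      // tagged.pointer() == n and tagged.aba() == 3
      aligned_aba_pointer<node> bumped = tagged.set_aba(tagged.aba() + 1);
      // bumping the tag before a CAS lets the CAS detect A-B-A reuse of the same address
      (void) bumped;
    }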
lib/pls/include/pls/internal/base/backoff.h

@@ -14,8 +14,8 @@ namespace internal {
 namespace base {
 
 class backoff {
-  static constexpr unsigned long INITIAL_SPIN_ITERS = 2u << 2u;
-  static constexpr unsigned long MAX_SPIN_ITERS = 2u << 6u;
+  const unsigned long INITIAL_SPIN_ITERS = 2u << 2u;
+  const unsigned long MAX_SPIN_ITERS = 2u << 6u;
 
   unsigned long current_ = INITIAL_SPIN_ITERS;
   std::minstd_rand random_;
lib/pls/include/pls/internal/base/error_handling.h

@@ -11,5 +11,6 @@
  * (or its inclusion adds too much overhead).
  */
 #define PLS_ERROR(msg) std::cout << msg << std::endl; exit(1);
+#define PLS_ASSERT(cond, msg) if (!cond) { PLS_ERROR(msg) }
 
 #endif //PLS_ERROR_HANDLING_H
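
The new PLS_ASSERT simply routes through PLS_ERROR. A usage sketch (condition and message are illustrative, not from the commit); note that the expansion applies ! directly to cond, so compound conditions should be passed pre-parenthesized:

    PLS_ASSERT(sub_task != nullptr, "tried to execute a null sub_task")
    // The macro expands to `if (!cond) { ... }`, so pass `(a && b)` rather than `a && b`:
    // `!a && b` would negate only `a`.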
lib/pls/include/pls/internal/base/system_details.h

@@ -18,10 +18,32 @@ namespace base {
  * Currently sane default values for x86.
  */
 namespace system_details {
+/**
+ * Pointer Types needed for ABA protection mixed into addresses.
+ * pointer_t should be an integer type capable of holding ANY pointer value.
+ */
+using pointer_t = std::uintptr_t;
+constexpr pointer_t ZERO_POINTER = 0;
+constexpr pointer_t MAX_POINTER = ~ZERO_POINTER;
+
+/**
+ * Biggest type that supports atomic CAS operations.
+ * Usually it is sane to assume a pointer can be swapped in a single CAS operation.
+ */
+using cas_integer = pointer_t;
+constexpr cas_integer MIN_CAS_INTEGER = 0;
+constexpr cas_integer MAX_CAS_INTEGER = ~MIN_CAS_INTEGER;
+constexpr cas_integer FIRST_HALF_CAS_INTEGER = MAX_CAS_INTEGER << ((sizeof(cas_integer) / 2) * 8);
+constexpr cas_integer SECOND_HALF_CAS_INTEGER = ~FIRST_HALF_CAS_INTEGER;
+
 /**
- * Most processors have 64 byte cache lines
+ * Most processors have 64 byte cache lines (last 6 bit of the address are zero at line beginnings).
  */
-constexpr std::uintptr_t CACHE_LINE_SIZE = 64;
+constexpr unsigned int CACHE_LINE_ADDRESS_BITS = 6;
+constexpr pointer_t CACHE_LINE_SIZE = 2u << (CACHE_LINE_ADDRESS_BITS - 1);
+constexpr pointer_t CACHE_LINE_ADDRESS_USED_BITS = MAX_POINTER << CACHE_LINE_ADDRESS_BITS;
+constexpr pointer_t CACHE_LINE_ADDRESS_UNUSED_BITS = ~CACHE_LINE_ADDRESS_USED_BITS;
 
 /**
  * Choose one of the following ways to store thread specific data.
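
A quick sanity check of the new constants (a standalone sketch, not part of the commit): the shift form reproduces the old value of 64, and on a 64-bit target the two half-masks split the CAS word into disjoint 32-bit halves:

    #include <cstdint>

    using pointer_t = std::uintptr_t;
    using cas_integer = pointer_t;

    constexpr unsigned int CACHE_LINE_ADDRESS_BITS = 6;
    constexpr pointer_t CACHE_LINE_SIZE = 2u << (CACHE_LINE_ADDRESS_BITS - 1);
    static_assert(CACHE_LINE_SIZE == 64, "2u << 5 equals the old constant 64");

    constexpr cas_integer MAX_CAS_INTEGER = ~static_cast<cas_integer>(0);
    constexpr cas_integer FIRST_HALF = MAX_CAS_INTEGER << ((sizeof(cas_integer) / 2) * 8);
    constexpr cas_integer SECOND_HALF = ~FIRST_HALF;
    static_assert((FIRST_HALF & SECOND_HALF) == 0, "the halves are disjoint");
    static_assert((FIRST_HALF | SECOND_HALF) == MAX_CAS_INTEGER, "together they cover every bit");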
lib/pls/include/pls/internal/base/ttas_spin_lock.h

@@ -19,11 +19,10 @@ namespace base {
  */
 class ttas_spin_lock {
   std::atomic<int> flag_;
-  backoff backoff_;
 
  public:
-  ttas_spin_lock() : flag_{0}, backoff_{} {};
-  ttas_spin_lock(const ttas_spin_lock& /*other*/) : flag_{0}, backoff_{} {}
+  ttas_spin_lock() : flag_{0} {};
+  ttas_spin_lock(const ttas_spin_lock& /*other*/) : flag_{0} {}
 
   void lock();
   bool try_lock(unsigned int num_tries = 1);
lib/pls/include/pls/internal/data_structures/aligned_stack.h

@@ -12,6 +12,8 @@ namespace pls {
 namespace internal {
 namespace data_structures {
+using base::system_details::pointer_t;
+
 /**
  * Generic stack-like data structure that allows to allocate arbitrary objects in a given memory region.
  * The objects will be stored aligned in the stack, making the storage cache friendly and very fast

@@ -26,15 +28,16 @@ namespace data_structures {
  */
 class aligned_stack {
   // Keep bounds of our memory block
-  char* memory_start_;
-  char* memory_end_;
+  pointer_t memory_start_;
+  pointer_t memory_end_;
 
   // Current head will always be aligned to cache lines
-  char* head_;
+  pointer_t head_;
 
  public:
-  typedef char* state;
+  typedef pointer_t state;
 
-  aligned_stack() : memory_start_{nullptr}, memory_end_{nullptr}, head_{nullptr} {};
+  aligned_stack() : memory_start_{0}, memory_end_{0}, head_{0} {};
+  aligned_stack(pointer_t memory_region, std::size_t size);
   aligned_stack(char* memory_region, std::size_t size);
 
   template<typename T>

@@ -48,7 +51,6 @@ class aligned_stack {
   void reset_state(state new_state) { head_ = new_state; }
 };
 
 }
 }
 }
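
The stack's bounds are now kept as raw pointer_t integers, but the allocation API is unchanged. A usage sketch based on the calls visible in this commit (push, save_state, reset_state; the job payload is hypothetical):

    #include "pls/internal/data_structures/aligned_stack.h"

    using pls::internal::data_structures::aligned_stack;

    struct job { int id; };  // hypothetical payload type

    void example() {
      static char memory[2 << 14];
      aligned_stack stack{memory, sizeof(memory)};

      aligned_stack::state checkpoint = stack.save_state();
      job* a = stack.push(job{1});    // copy-constructs into cache-line aligned storage
      job* b = stack.push(job{2});    // lands on the next aligned slot after a
      (void) a; (void) b;
      stack.reset_state(checkpoint);  // rolls the head back, reclaiming both objects
    }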
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h

@@ -9,7 +9,7 @@ namespace data_structures {
 template<typename T>
 T* aligned_stack::push(const T& object) {
   // Copy-Construct
-  return new ((void*) push<T>()) T(object);
+  return new (push<T>()) T(object);
 }
 
 template<typename T>
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h
new file (0 → 100644); contents not expanded in this view.
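
The committed header itself is collapsed in this view; from its call sites in fork_join_task.h/.cpp and test/data_structures_test.cpp, the public surface looks roughly like the sketch below. All signatures and the member layout are reconstructions, not the committed code:

    // Reconstructed interface sketch only - the real header is not shown in this diff.
    template<typename Item>
    class work_stealing_deque {
      aligned_stack* stack_;  // backing storage, inferred from the constructor call
     public:
      using state = base::system_details::pointer_t;  // integral checkpoint; ctors initialize deque_state_{0}

      explicit work_stealing_deque(aligned_stack* stack);  // tests: work_stealing_deque<int> deque{&stack};

      void reset_base_pointer();           // rebind to the current stack head (fork_join_task::execute)
      state save_state();                  // checkpoint, stored in deque_state_
      void release_memory_until(state s);  // roll back to a checkpoint (also called with an Item* in the tests)

      template<typename T>
      T* push_tail(const T& item);         // owner end: spawn_child pushes a copy of the sub-task
      Item* pop_tail();                    // owner end, LIFO (get_local_sub_task)
      Item* pop_head();                    // thief end, FIFO (get_stolen_sub_task)
    };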
lib/pls/include/pls/internal/scheduling/fork_join_task.h

@@ -5,7 +5,7 @@
 #include "pls/internal/helpers/profiler.h"
 #include "pls/internal/data_structures/aligned_stack.h"
-#include "pls/internal/data_structures/deque.h"
+#include "pls/internal/data_structures/work_stealing_deque.h"
 
 #include "abstract_task.h"
 #include "thread_state.h"

@@ -15,7 +15,7 @@ namespace internal {
 namespace scheduling {
 
 class fork_join_task;
-class fork_join_sub_task : public data_structures::deque_item {
+class fork_join_sub_task {
   friend class fork_join_task;
 
   // Coordinate finishing of sub_tasks

@@ -25,8 +25,11 @@ class fork_join_sub_task : public data_structures::deque_item {
   // Access to TBB scheduling environment
   fork_join_task* tbb_task_;
 
+  bool executed = false;
+  int executed_at = -1;
+
   // Stack Management (reset stack pointer after wait_for_all() calls)
-  data_structures::aligned_stack::state stack_state_;
+  data_structures::work_stealing_deque<fork_join_sub_task>::state deque_state_;
 
  protected:
   explicit fork_join_sub_task();
   fork_join_sub_task(const fork_join_sub_task& other);

@@ -37,11 +40,10 @@ class fork_join_sub_task : public data_structures::deque_item {
  public:
   // Only use them when actually executing this sub_task (only public for simpler API design)
   template<typename T>
-  void spawn_child(const T& sub_task);
+  void spawn_child(T& sub_task);
   void wait_for_all();
 
  private:
-  void spawn_child_internal(fork_join_sub_task* sub_task);
   void execute();
 };

@@ -50,7 +52,7 @@ class fork_join_lambda_by_reference : public fork_join_sub_task {
   const Function* function_;
 
  public:
-  explicit fork_join_lambda_by_reference(const Function* function) : function_{function} {};
+  explicit fork_join_lambda_by_reference(const Function* function) : fork_join_sub_task{}, function_{function} {};
 
  protected:
   void execute_internal() override {

@@ -63,7 +65,7 @@ class fork_join_lambda_by_value : public fork_join_sub_task {
   const Function function_;
 
  public:
-  explicit fork_join_lambda_by_value(const Function& function) : function_{function} {};
+  explicit fork_join_lambda_by_value(const Function& function) : fork_join_sub_task{}, function_{function} {};
 
  protected:
   void execute_internal() override {

@@ -76,10 +78,9 @@ class fork_join_task : public abstract_task {
   fork_join_sub_task* root_task_;
   fork_join_sub_task* currently_executing_;
-  data_structures::aligned_stack* my_stack_;
 
   // Double-Ended Queue management
-  data_structures::deque<fork_join_sub_task> deque_;
+  data_structures::work_stealing_deque<fork_join_sub_task> deque_;
 
   // Steal Management
   fork_join_sub_task* last_stolen_;

@@ -97,12 +98,21 @@ class fork_join_task : public abstract_task {
 };
 
 template<typename T>
-void fork_join_sub_task::spawn_child(const T& task) {
+void fork_join_sub_task::spawn_child(T& task) {
   PROFILE_FORK_JOIN_STEALING("spawn_child")
   static_assert(std::is_base_of<fork_join_sub_task, T>::value, "Only pass fork_join_sub_task subclasses!");
 
-  T* new_task = tbb_task_->my_stack_->push(task);
-  spawn_child_internal(new_task);
+  // Keep our refcount up to date
+  ref_count_++;
+
+  // Assign forced values
+  task.parent_ = this;
+  task.tbb_task_ = tbb_task_;
+  task.deque_state_ = tbb_task_->deque_.save_state();
+
+  // Push on our deque
+  const T const_task = task;
+  tbb_task_->deque_.push_tail(const_task);
 }
 }
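
With spawn_child now taking a mutable reference and inlining the bookkeeping that spawn_child_internal used to do (refcount, parent/tbb_task_ wiring, deque checkpoint, push_tail), a typical spawn from user code looks roughly like the following sketch; it assumes the API visible in this header and is not code from the commit:

    // Inside work executed by a running fork_join_sub_task `self`.
    auto work = [&] { /* child workload */ };
    fork_join_lambda_by_reference<decltype(work)> child{&work};

    self->spawn_child(child);  // copies `child` onto the owning task's work stealing deque
    self->wait_for_all();      // pops local children (pop_tail) or steals until ref_count_ hits 0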
lib/pls/src/internal/base/alignment.cpp

@@ -10,8 +10,8 @@ void *allocate_aligned(size_t size) {
   return aligned_alloc(system_details::CACHE_LINE_SIZE, size);
 }
 
-std::uintptr_t next_alignment(std::uintptr_t size) {
-  std::uintptr_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE;
+system_details::pointer_t next_alignment(system_details::pointer_t size) {
+  system_details::pointer_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE;
   if (miss_alignment == 0) {
     return size;
   } else {

@@ -19,8 +19,17 @@ std::uintptr_t next_alignment(std::uintptr_t size) {
   }
 }
 
+system_details::pointer_t previous_alignment(system_details::pointer_t size) {
+  system_details::pointer_t miss_alignment = size % base::system_details::CACHE_LINE_SIZE;
+  if (miss_alignment == 0) {
+    return size;
+  } else {
+    return size - miss_alignment;
+  }
+}
+
 char* next_alignment(char* pointer) {
-  return reinterpret_cast<char*>(next_alignment(reinterpret_cast<std::uintptr_t>(pointer)));
+  return reinterpret_cast<char*>(next_alignment(reinterpret_cast<system_details::pointer_t>(pointer)));
 }
 
 }
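
next_alignment rounds a size or address up to the next 64-byte cache line; the new previous_alignment rounds down. A worked example (the round-up branch is elided by the diff, so its exact form is assumed):

    // With CACHE_LINE_SIZE == 64 and 100 % 64 == 36:
    //   next_alignment(100)     -> 128  (100 + (64 - 36))
    //   previous_alignment(100) -> 64   (100 - 36)
    //   next_alignment(128)     -> 128  (miss_alignment == 0, already aligned)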
lib/pls/src/internal/base/swmr_spin_lock.cpp

@@ -23,22 +23,22 @@ bool swmr_spin_lock::reader_try_lock() {
 void swmr_spin_lock::reader_unlock() {
   PROFILE_LOCK("Release Read Lock")
-  readers_.fetch_add(-1, std::memory_order_release);
+  readers_--;
 }
 
 void swmr_spin_lock::writer_lock() {
   PROFILE_LOCK("Acquire Write Lock")
 
   // Tell the readers that we would like to write
-  write_request_.store(1, std::memory_order_acquire);
+  write_request_ = 1;
 
   // Wait for all of them to exit the critical section
-  while (readers_.load(std::memory_order_acquire) > 0)
+  while (readers_ > 0)
     system_details::relax_cpu();  // Spin, not expensive as relaxed load
 }
 
 void swmr_spin_lock::writer_unlock() {
   PROFILE_LOCK("Release Write Lock")
-  write_request_.store(0, std::memory_order_release);
+  write_request_ = 0;
 }
 }
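
The switch back to plain operators here is more than cosmetic: the removed writer_lock line requested std::memory_order_acquire on a store, which is not a valid ordering for std::atomic::store, while the defaulted operators use memory_order_seq_cst. A standalone illustration (not code from the commit):

    #include <atomic>

    std::atomic<int> write_request{0};

    void illustrate() {
      write_request = 1;  // same as write_request.store(1, std::memory_order_seq_cst)
      // write_request.store(1, std::memory_order_acquire);  // undefined: acquire is not a store ordering
    }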
lib/pls/src/internal/base/ttas_spin_lock.cpp

@@ -9,7 +9,7 @@ namespace base {
 void ttas_spin_lock::lock() {
   PROFILE_LOCK("Acquire Lock")
   int expected = 0;
-  backoff_.reset();
+  backoff backoff_;
 
   while (true) {
     while (flag_.load(std::memory_order_relaxed) == 1)

@@ -26,7 +26,7 @@ void ttas_spin_lock::lock() {
 bool ttas_spin_lock::try_lock(unsigned int num_tries) {
   PROFILE_LOCK("Try Acquire Lock")
   int expected = 0;
-  backoff_.reset();
+  backoff backoff_;
 
   while (true) {
     while (flag_.load() == 1) {
lib/pls/src/internal/data_structures/aligned_stack.cpp

@@ -5,11 +5,16 @@ namespace pls {
 namespace internal {
 namespace data_structures {
-aligned_stack::aligned_stack(char* memory_region, const std::size_t size) :
+aligned_stack::aligned_stack(pointer_t memory_region, const std::size_t size) :
     memory_start_{memory_region},
     memory_end_{memory_region + size},
     head_{base::alignment::next_alignment(memory_start_)} {}
 
+aligned_stack::aligned_stack(char* memory_region, const std::size_t size) :
+    memory_start_{(pointer_t) memory_region},
+    memory_end_{(pointer_t) memory_region + size},
+    head_{base::alignment::next_alignment(memory_start_)} {}
 }
 }
 }
lib/pls/src/internal/data_structures/deque.cpp

@@ -14,11 +14,11 @@ deque_item *deque_internal::pop_head_internal() {
   }
 
   deque_item* result = head_;
-  head_ = head_->prev_;
+  head_ = head_->next_;
   if (head_ == nullptr) {
     tail_ = nullptr;
   } else {
-    head_->next_ = nullptr;
+    head_->prev_ = nullptr;
   }
 
   return result;

@@ -32,11 +32,11 @@ deque_item *deque_internal::pop_tail_internal() {
   }
 
   deque_item* result = tail_;
-  tail_ = tail_->next_;
+  tail_ = tail_->prev_;
   if (tail_ == nullptr) {
     head_ = nullptr;
   } else {
-    tail_->prev_ = nullptr;
+    tail_->next_ = nullptr;
   }
 
   return result;

@@ -46,12 +46,12 @@ void deque_internal::push_tail_internal(deque_item *new_item) {
   std::lock_guard<base::spin_lock> lock{lock_};
 
   if (tail_ != nullptr) {
-    tail_->prev_ = new_item;
+    tail_->next_ = new_item;
   } else {
     head_ = new_item;
   }
-  new_item->next_ = tail_;
-  new_item->prev_ = nullptr;
+  new_item->prev_ = tail_;
+  new_item->next_ = nullptr;
 
   tail_ = new_item;
 }
lib/pls/src/internal/scheduling/fork_join_task.cpp

@@ -8,22 +8,26 @@ namespace internal {
 namespace scheduling {
 
 fork_join_sub_task::fork_join_sub_task() :
-    data_structures::deque_item{},
     ref_count_{0},
     parent_{nullptr},
     tbb_task_{nullptr},
-    stack_state_{nullptr} {}
+    deque_state_{0} {}
 
 fork_join_sub_task::fork_join_sub_task(const fork_join_sub_task& other) :
-    data_structures::deque_item(other),
     ref_count_{0},
-    parent_{nullptr},
-    tbb_task_{nullptr},
-    stack_state_{nullptr} {}
+    parent_{other.parent_},
+    tbb_task_{other.tbb_task_},
+    deque_state_{other.deque_state_} {}
 
 void fork_join_sub_task::execute() {
   PROFILE_WORK_BLOCK("execute sub_task")
   tbb_task_->currently_executing_ = this;
+  if (executed) {
+    int my_id = base::this_thread::state<thread_state>()->id_;
+    PLS_ERROR("Double Execution!")
+  }
+  executed = true;
+  executed_at = base::this_thread::state<thread_state>()->id_;
+
   execute_internal();
   tbb_task_->currently_executing_ = nullptr;
   PROFILE_END_BLOCK

@@ -34,18 +38,6 @@ void fork_join_sub_task::execute() {
   }
 }
 
-void fork_join_sub_task::spawn_child_internal(fork_join_sub_task* sub_task) {
-  // Keep our refcount up to date
-  ref_count_++;
-
-  // Assign forced values
-  sub_task->parent_ = this;
-  sub_task->tbb_task_ = tbb_task_;
-  sub_task->stack_state_ = tbb_task_->my_stack_->save_state();
-
-  tbb_task_->deque_.push_tail(sub_task);
-}
-
 void fork_join_sub_task::wait_for_all() {
   while (ref_count_ > 0) {
     PROFILE_STEALING("get local sub task")

@@ -54,7 +46,6 @@ void fork_join_sub_task::wait_for_all() {
     if (local_task != nullptr) {
       local_task->execute();
     } else {
-      while (ref_count_ > 0) {
       // Try to steal work.
       // External steal will be executed implicitly if success
       PROFILE_STEALING("steal work")

@@ -65,8 +56,7 @@ void fork_join_sub_task::wait_for_all() {
       }
     }
   }
-  tbb_task_->my_stack_->reset_state(stack_state_);
+  tbb_task_->deque_.release_memory_until(deque_state_);
 }
 
 fork_join_sub_task* fork_join_task::get_local_sub_task() {

@@ -74,7 +64,9 @@ fork_join_sub_task *fork_join_task::get_local_sub_task() {
 }
 
 fork_join_sub_task* fork_join_task::get_stolen_sub_task() {
-  return deque_.pop_head();
+  auto tmp = deque_.save_state();
+  auto result = deque_.pop_head();
+  return result;
 }
 
 bool fork_join_task::internal_stealing(abstract_task* other_task) {

@@ -87,7 +79,7 @@ bool fork_join_task::internal_stealing(abstract_task *other_task) {
   } else {
     // Make sub-task belong to our fork_join_task instance
     stolen_sub_task->tbb_task_ = this;
-    stolen_sub_task->stack_state_ = my_stack_->save_state();
+    stolen_sub_task->deque_state_ = deque_.save_state();
 
     // We will execute this next without explicitly moving it onto our stack storage
     last_stolen_ = stolen_sub_task;

@@ -114,9 +106,12 @@ void fork_join_task::execute() {
   PROFILE_WORK_BLOCK("execute fork_join_task");
 
   // Bind this instance to our OS thread
-  my_stack_ = base::this_thread::state<thread_state>()->task_stack_;
+  // TODO: See if we did this right
+  // my_stack_ = base::this_thread::state<thread_state>()->task_stack_;
+  deque_.reset_base_pointer();
 
   root_task_->tbb_task_ = this;
-  root_task_->stack_state_ = my_stack_->save_state();
+  root_task_->deque_state_ = deque_.save_state();
 
   // Execute it on our OS thread until its finished
   root_task_->execute();

@@ -124,12 +119,12 @@ void fork_join_task::execute() {
 fork_join_sub_task* fork_join_task::currently_executing() const { return currently_executing_; }
 
 fork_join_task::fork_join_task(fork_join_sub_task* root_task, const abstract_task::id& id) :
     abstract_task{0, id},
     root_task_{root_task},
     currently_executing_{nullptr},
-    my_stack_{nullptr},
-    deque_{},
+    deque_{base::this_thread::state<thread_state>()->task_stack_},
     last_stolen_{nullptr} {}
 }
test/CMakeLists.txt

 add_executable(tests
         main.cpp
         base_tests.cpp scheduling_tests.cpp
-        data_structures_test.cpp)
+        data_structures_test.cpp)
 
 target_link_libraries(tests catch2 pls)
test/data_structures_test.cpp

@@ -4,6 +4,7 @@
 #include <pls/internal/data_structures/aligned_stack.h>
 #include <pls/internal/data_structures/deque.h>
+#include <pls/internal/data_structures/work_stealing_deque.h>
 
 #include <vector>
 #include <mutex>

@@ -130,3 +131,90 @@ TEST_CASE("deque stores objects correctly", "[internal/data_structures/deque.h]"
     REQUIRE(deque.pop_tail() == &three);
   }
 }
+
+TEST_CASE("work stealing deque stores objects correctly", "[internal/data_structures/aligned_stack.h]") {
+  constexpr long data_size = 2 << 14;
+  char data[data_size];
+  aligned_stack stack{data, data_size};
+
+  work_stealing_deque<int> deque{&stack};
+
+  int one = 1, two = 2, three = 3, four = 4;
+
+  SECTION("add and remove items form the tail") {
+    deque.push_tail(one);
+    deque.push_tail(two);
+    deque.push_tail(three);
+
+    REQUIRE(*deque.pop_tail() == three);
+    REQUIRE(*deque.pop_tail() == two);
+    REQUIRE(*deque.pop_tail() == one);
+  }
+
+  SECTION("handles getting empty by popping the tail correctly") {
+    deque.push_tail(one);
+    REQUIRE(*deque.pop_tail() == one);
+
+    deque.push_tail(two);
+    REQUIRE(*deque.pop_tail() == two);
+  }
+
+  SECTION("remove items form the head") {
+    deque.push_tail(one);
+    deque.push_tail(two);
+    deque.push_tail(three);
+
+    REQUIRE(*deque.pop_head() == one);
+    REQUIRE(*deque.pop_head() == two);
+    REQUIRE(*deque.pop_head() == three);
+  }
+
+  SECTION("handles getting empty by popping the head correctly") {
+    deque.push_tail(one);
+    REQUIRE(*deque.pop_head() == one);
+
+    deque.push_tail(two);
+    REQUIRE(*deque.pop_head() == two);
+  }
+
+  SECTION("handles getting empty by popping the head and tail correctly") {
+    deque.push_tail(one);
+    REQUIRE(*deque.pop_tail() == one);
+
+    deque.push_tail(two);
+    REQUIRE(*deque.pop_head() == two);
+
+    deque.push_tail(three);
+    REQUIRE(*deque.pop_tail() == three);
+  }
+
+  SECTION("handles jumps bigger 1 correctly") {
+    deque.push_tail(one);
+    deque.push_tail(two);
+    REQUIRE(*deque.pop_tail() == two);
+
+    deque.push_tail(three);
+    deque.push_tail(four);
+    REQUIRE(*deque.pop_head() == one);
+    REQUIRE(*deque.pop_head() == three);
+    REQUIRE(*deque.pop_head() == four);
+  }
+
+  SECTION("handles stack reset 1 correctly when emptied by tail") {
+    deque.push_tail(one);
+    deque.push_tail(two);
+    auto tmp_result = deque.pop_tail();
+    REQUIRE(*tmp_result == two);
+
+    deque.release_memory_until(tmp_result);
+    REQUIRE(*deque.pop_tail() == one);
+
+    deque.push_tail(three);
+    deque.push_tail(four);
+    REQUIRE(*deque.pop_head() == three);
+    REQUIRE(*deque.pop_tail() == four);
+  }
+
+  SECTION("synces correctly") {
+  }
+}
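
The new Catch2 test case can be run in isolation by tag once the tests target is built, e.g. ./tests "[internal/data_structures/aligned_stack.h]"; note the tag still names aligned_stack.h even though the case exercises the work stealing deque, so a deque-specific tag filter will miss it.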