Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
las3_pub
/
predictable_parallel_patterns
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
bd826491
authored
Apr 30, 2019
by
FritzFlorian
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
WIP: lock-free work stealing deque based on our stack.
parent
d16ad3eb
Pipeline
#1159
passed with stages
in 3 minutes 37 seconds
Changes
21
Pipelines
1
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
301 additions
and
98 deletions
+301
-98
app/benchmark_fft/main.cpp
+1
-1
app/invoke_parallel/main.cpp
+72
-20
app/playground/main.cpp
+1
-4
lib/pls/CMakeLists.txt
+1
-0
lib/pls/include/pls/internal/base/alignment.h
+23
-1
lib/pls/include/pls/internal/base/backoff.h
+2
-2
lib/pls/include/pls/internal/base/error_handling.h
+1
-0
lib/pls/include/pls/internal/base/system_details.h
+24
-2
lib/pls/include/pls/internal/base/ttas_spin_lock.h
+2
-3
lib/pls/include/pls/internal/data_structures/aligned_stack.h
+8
-6
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
+1
-1
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h
+0
-0
lib/pls/include/pls/internal/scheduling/fork_join_task.h
+22
-12
lib/pls/src/internal/base/alignment.cpp
+12
-3
lib/pls/src/internal/base/swmr_spin_lock.cpp
+4
-4
lib/pls/src/internal/base/ttas_spin_lock.cpp
+2
-2
lib/pls/src/internal/data_structures/aligned_stack.cpp
+6
-1
lib/pls/src/internal/data_structures/deque.cpp
+7
-7
lib/pls/src/internal/scheduling/fork_join_task.cpp
+23
-28
test/CMakeLists.txt
+1
-1
test/data_structures_test.cpp
+88
-0
No files found.
app/benchmark_fft/main.cpp
View file @
bd826491
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
#include <complex>
#include <complex>
#include <vector>
#include <vector>
static
constexpr
int
CUTOFF
=
1
0
;
static
constexpr
int
CUTOFF
=
1
6
;
static
constexpr
int
NUM_ITERATIONS
=
1000
;
static
constexpr
int
NUM_ITERATIONS
=
1000
;
static
constexpr
int
INPUT_SIZE
=
2064
;
static
constexpr
int
INPUT_SIZE
=
2064
;
typedef
std
::
vector
<
std
::
complex
<
double
>>
complex_vector
;
typedef
std
::
vector
<
std
::
complex
<
double
>>
complex_vector
;
...
...
app/invoke_parallel/main.cpp
View file @
bd826491
...
@@ -2,48 +2,100 @@
...
@@ -2,48 +2,100 @@
#include <pls/internal/helpers/profiler.h>
#include <pls/internal/helpers/profiler.h>
#include <iostream>
#include <iostream>
#include <complex>
#include <vector>
static
pls
::
static_scheduler_memory
<
8
,
2
<<
14
>
my_scheduler_memory
;
static
constexpr
int
CUTOFF
=
16
;
static
constexpr
int
NUM_ITERATIONS
=
1000
;
static
constexpr
int
INPUT_SIZE
=
2064
;
typedef
std
::
vector
<
std
::
complex
<
double
>>
complex_vector
;
static
constexpr
int
CUTOFF
=
10
;
void
divide
(
complex_vector
::
iterator
data
,
int
n
)
{
complex_vector
tmp_odd_elements
(
n
/
2
);
long
fib_serial
(
long
n
)
{
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
{
if
(
n
==
0
)
{
tmp_odd_elements
[
i
]
=
data
[
i
*
2
+
1
];
return
0
;
}
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
{
data
[
i
]
=
data
[
i
*
2
];
}
}
if
(
n
==
1
)
{
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
{
return
1
;
data
[
i
+
n
/
2
]
=
tmp_odd_elements
[
i
]
;
}
}
}
void
combine
(
complex_vector
::
iterator
data
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
/
2
;
i
++
)
{
std
::
complex
<
double
>
even
=
data
[
i
];
std
::
complex
<
double
>
odd
=
data
[
i
+
n
/
2
];
return
fib_serial
(
n
-
1
)
+
fib_serial
(
n
-
2
);
// w is the "twiddle-factor".
// this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
// so it won't impact the performance comparison.
std
::
complex
<
double
>
w
=
exp
(
std
::
complex
<
double
>
(
0
,
-
2.
*
M_PI
*
i
/
n
));
data
[
i
]
=
even
+
w
*
odd
;
data
[
i
+
n
/
2
]
=
even
-
w
*
odd
;
}
}
}
long
fib
(
long
n
)
{
void
fft
(
complex_vector
::
iterator
data
,
int
n
)
{
if
(
n
<
=
CUTOFF
)
{
if
(
n
<
2
)
{
return
fib_serial
(
n
)
;
return
;
}
}
// Actual 'invoke_parallel' logic/code
PROFILE_WORK_BLOCK
(
"Divide"
)
int
left
,
right
;
divide
(
data
,
n
);
PROFILE_END_BLOCK
PROFILE_WORK_BLOCK
(
"Invoke Parallel"
)
if
(
n
==
CUTOFF
)
{
PROFILE_WORK_BLOCK
(
"FFT Serial"
)
fft
(
data
,
n
/
2
);
fft
(
data
+
n
/
2
,
n
/
2
);
}
else
if
(
n
<=
CUTOFF
)
{
fft
(
data
,
n
/
2
);
fft
(
data
+
n
/
2
,
n
/
2
);
}
else
{
pls
::
invoke_parallel
(
pls
::
invoke_parallel
(
[
&
]
{
left
=
fib
(
n
-
1
);
},
[
&
]
{
fft
(
data
,
n
/
2
);
},
[
&
]
{
right
=
fib
(
n
-
2
);
}
[
&
]
{
fft
(
data
+
n
/
2
,
n
/
2
);
}
);
);
return
left
+
right
;
}
PROFILE_END_BLOCK
PROFILE_WORK_BLOCK
(
"Combine"
)
combine
(
data
,
n
);
PROFILE_END_BLOCK
}
complex_vector
prepare_input
(
int
input_size
)
{
std
::
vector
<
double
>
known_frequencies
{
2
,
11
,
52
,
88
,
256
};
complex_vector
data
(
input_size
);
// Set our input data to match a time series of the known_frequencies.
// When applying fft to this time-series we should find these frequencies.
for
(
int
i
=
0
;
i
<
input_size
;
i
++
)
{
data
[
i
]
=
std
::
complex
<
double
>
(
0.0
,
0.0
);
for
(
auto
frequencie
:
known_frequencies
)
{
data
[
i
]
+=
sin
(
2
*
M_PI
*
frequencie
*
i
/
input_size
);
}
}
return
data
;
}
}
int
main
()
{
int
main
()
{
PROFILE_ENABLE
PROFILE_ENABLE
pls
::
malloc_scheduler_memory
my_scheduler_memory
{
8
,
2u
<<
14
};
pls
::
scheduler
scheduler
{
&
my_scheduler_memory
,
8
};
pls
::
scheduler
scheduler
{
&
my_scheduler_memory
,
8
};
long
result
;
complex_vector
initial_input
=
prepare_input
(
INPUT_SIZE
)
;
scheduler
.
perform_work
([
&
]
{
scheduler
.
perform_work
([
&
]
{
PROFILE_MAIN_THREAD
PROFILE_MAIN_THREAD
// Call looks just the same, only requirement is
// Call looks just the same, only requirement is
// the enclosure in the perform_work lambda.
// the enclosure in the perform_work lambda.
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
result
=
fib
(
30
);
PROFILE_WORK_BLOCK
(
"Top Level FFT"
)
std
::
cout
<<
"Fib(30)="
<<
result
<<
std
::
endl
;
complex_vector
input
=
initial_input
;
fft
(
input
.
begin
(),
input
.
size
());
}
}
});
});
...
...
app/playground/main.cpp
View file @
bd826491
...
@@ -11,8 +11,5 @@
...
@@ -11,8 +11,5 @@
#include <pls/internal/helpers/unique_id.h>
#include <pls/internal/helpers/unique_id.h>
int
main
()
{
int
main
()
{
std
::
cout
<<
pls
::
internal
::
scheduling
::
root_task
<
void
(
*
)
>::
create_id
().
type_
.
hash_code
()
<<
std
::
endl
;
std
::
cout
<<
pls
::
internal
::
helpers
::
unique_id
::
create
<
pls
::
internal
::
scheduling
::
root_task
<
void
(
*
)
>>
().
type_
.
hash_code
()
<<
std
::
endl
;
}
}
lib/pls/CMakeLists.txt
View file @
bd826491
...
@@ -20,6 +20,7 @@ add_library(pls STATIC
...
@@ -20,6 +20,7 @@ add_library(pls STATIC
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/aligned_stack.h src/internal/data_structures/aligned_stack.cpp
include/pls/internal/data_structures/aligned_stack_impl.h
include/pls/internal/data_structures/aligned_stack_impl.h
include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
include/pls/internal/data_structures/deque.h src/internal/data_structures/deque.cpp
include/pls/internal/data_structures/work_stealing_deque.h
include/pls/internal/helpers/prohibit_new.h
include/pls/internal/helpers/prohibit_new.h
include/pls/internal/helpers/profiler.h
include/pls/internal/helpers/profiler.h
...
...
lib/pls/include/pls/internal/base/alignment.h
View file @
bd826491
...
@@ -19,10 +19,32 @@ struct aligned_wrapper {
...
@@ -19,10 +19,32 @@ struct aligned_wrapper {
};
};
void
*
allocate_aligned
(
size_t
size
);
void
*
allocate_aligned
(
size_t
size
);
std
::
uintptr_t
next_alignment
(
std
::
uintptr_t
size
);
system_details
::
pointer_t
next_alignment
(
system_details
::
pointer_t
size
);
system_details
::
pointer_t
previous_alignment
(
system_details
::
pointer_t
size
);
char
*
next_alignment
(
char
*
pointer
);
char
*
next_alignment
(
char
*
pointer
);
}
}
template
<
typename
T
>
struct
aligned_aba_pointer
{
const
system_details
::
pointer_t
pointer_
;
explicit
aligned_aba_pointer
(
T
*
pointer
,
unsigned
int
aba
=
0
)
:
pointer_
{
reinterpret_cast
<
system_details
::
pointer_t
>
(
pointer
)
+
aba
}
{}
T
*
pointer
()
const
{
return
reinterpret_cast
<
T
*>
(
pointer_
&
system_details
::
CACHE_LINE_ADDRESS_USED_BITS
);
}
unsigned
int
aba
()
const
{
return
pointer_
&
system_details
::
CACHE_LINE_ADDRESS_UNUSED_BITS
;
}
aligned_aba_pointer
set_aba
(
unsigned
int
aba
)
const
{
return
aligned_aba_pointer
(
pointer
(),
aba
);
}
};
}
}
}
}
}
}
...
...
lib/pls/include/pls/internal/base/backoff.h
View file @
bd826491
...
@@ -14,8 +14,8 @@ namespace internal {
...
@@ -14,8 +14,8 @@ namespace internal {
namespace
base
{
namespace
base
{
class
backoff
{
class
backoff
{
static
constexpr
unsigned
long
INITIAL_SPIN_ITERS
=
2u
<<
2u
;
const
unsigned
long
INITIAL_SPIN_ITERS
=
2u
<<
2u
;
static
constexpr
unsigned
long
MAX_SPIN_ITERS
=
2u
<<
6u
;
const
unsigned
long
MAX_SPIN_ITERS
=
2u
<<
6u
;
unsigned
long
current_
=
INITIAL_SPIN_ITERS
;
unsigned
long
current_
=
INITIAL_SPIN_ITERS
;
std
::
minstd_rand
random_
;
std
::
minstd_rand
random_
;
...
...
lib/pls/include/pls/internal/base/error_handling.h
View file @
bd826491
...
@@ -11,5 +11,6 @@
...
@@ -11,5 +11,6 @@
* (or its inclusion adds too much overhead).
* (or its inclusion adds too much overhead).
*/
*/
#define PLS_ERROR(msg) std::cout << msg << std::endl; exit(1);
#define PLS_ERROR(msg) std::cout << msg << std::endl; exit(1);
#define PLS_ASSERT(cond, msg) if (!cond) { PLS_ERROR(msg) }
#endif //PLS_ERROR_HANDLING_H
#endif //PLS_ERROR_HANDLING_H
lib/pls/include/pls/internal/base/system_details.h
View file @
bd826491
...
@@ -18,10 +18,32 @@ namespace base {
...
@@ -18,10 +18,32 @@ namespace base {
* Currently sane default values for x86.
* Currently sane default values for x86.
*/
*/
namespace
system_details
{
namespace
system_details
{
/**
* Pointer Types needed for ABA protection mixed into addresses.
* pointer_t should be an integer type capable of holding ANY pointer value.
*/
using
pointer_t
=
std
::
uintptr_t
;
constexpr
pointer_t
ZERO_POINTER
=
0
;
constexpr
pointer_t
MAX_POINTER
=
~
ZERO_POINTER
;
/**
* Biggest type that supports atomic CAS operations.
* Usually it is sane to assume a pointer can be swapped in a single CAS operation.
*/
using
cas_integer
=
pointer_t
;
constexpr
cas_integer
MIN_CAS_INTEGER
=
0
;
constexpr
cas_integer
MAX_CAS_INTEGER
=
~
MIN_CAS_INTEGER
;
constexpr
cas_integer
FIRST_HALF_CAS_INTEGER
=
MAX_CAS_INTEGER
<<
((
sizeof
(
cas_integer
)
/
2
)
*
8
);
constexpr
cas_integer
SECOND_HALF_CAS_INTEGER
=
~
FIRST_HALF_CAS_INTEGER
;
/**
/**
* Most processors have 64 byte cache lines
* Most processors have 64 byte cache lines
(last 6 bit of the address are zero at line beginnings).
*/
*/
constexpr
std
::
uintptr_t
CACHE_LINE_SIZE
=
64
;
constexpr
unsigned
int
CACHE_LINE_ADDRESS_BITS
=
6
;
constexpr
pointer_t
CACHE_LINE_SIZE
=
2u
<<
(
CACHE_LINE_ADDRESS_BITS
-
1
);
constexpr
pointer_t
CACHE_LINE_ADDRESS_USED_BITS
=
MAX_POINTER
<<
CACHE_LINE_ADDRESS_BITS
;
constexpr
pointer_t
CACHE_LINE_ADDRESS_UNUSED_BITS
=
~
CACHE_LINE_ADDRESS_USED_BITS
;
/**
/**
* Choose one of the following ways to store thread specific data.
* Choose one of the following ways to store thread specific data.
...
...
lib/pls/include/pls/internal/base/ttas_spin_lock.h
View file @
bd826491
...
@@ -19,11 +19,10 @@ namespace base {
...
@@ -19,11 +19,10 @@ namespace base {
*/
*/
class
ttas_spin_lock
{
class
ttas_spin_lock
{
std
::
atomic
<
int
>
flag_
;
std
::
atomic
<
int
>
flag_
;
backoff
backoff_
;
public
:
public
:
ttas_spin_lock
()
:
flag_
{
0
}
,
backoff_
{}
{};
ttas_spin_lock
()
:
flag_
{
0
}
{};
ttas_spin_lock
(
const
ttas_spin_lock
&
/*other*/
)
:
flag_
{
0
}
,
backoff_
{}
{}
ttas_spin_lock
(
const
ttas_spin_lock
&
/*other*/
)
:
flag_
{
0
}
{}
void
lock
();
void
lock
();
bool
try_lock
(
unsigned
int
num_tries
=
1
);
bool
try_lock
(
unsigned
int
num_tries
=
1
);
...
...
lib/pls/include/pls/internal/data_structures/aligned_stack.h
View file @
bd826491
...
@@ -12,6 +12,8 @@ namespace pls {
...
@@ -12,6 +12,8 @@ namespace pls {
namespace
internal
{
namespace
internal
{
namespace
data_structures
{
namespace
data_structures
{
using
base
::
system_details
::
pointer_t
;
/**
/**
* Generic stack-like data structure that allows to allocate arbitrary objects in a given memory region.
* Generic stack-like data structure that allows to allocate arbitrary objects in a given memory region.
* The objects will be stored aligned in the stack, making the storage cache friendly and very fast
* The objects will be stored aligned in the stack, making the storage cache friendly and very fast
...
@@ -26,15 +28,16 @@ namespace data_structures {
...
@@ -26,15 +28,16 @@ namespace data_structures {
*/
*/
class
aligned_stack
{
class
aligned_stack
{
// Keep bounds of our memory block
// Keep bounds of our memory block
char
*
memory_start_
;
pointer_t
memory_start_
;
char
*
memory_end_
;
pointer_t
memory_end_
;
// Current head will always be aligned to cache lines
// Current head will always be aligned to cache lines
char
*
head_
;
pointer_t
head_
;
public
:
public
:
typedef
char
*
state
;
typedef
pointer_t
state
;
aligned_stack
()
:
memory_start_
{
nullptr
},
memory_end_
{
nullptr
},
head_
{
nullptr
}
{};
aligned_stack
()
:
memory_start_
{
0
},
memory_end_
{
0
},
head_
{
0
}
{};
aligned_stack
(
pointer_t
memory_region
,
std
::
size_t
size
);
aligned_stack
(
char
*
memory_region
,
std
::
size_t
size
);
aligned_stack
(
char
*
memory_region
,
std
::
size_t
size
);
template
<
typename
T
>
template
<
typename
T
>
...
@@ -48,7 +51,6 @@ class aligned_stack {
...
@@ -48,7 +51,6 @@ class aligned_stack {
void
reset_state
(
state
new_state
)
{
head_
=
new_state
;
}
void
reset_state
(
state
new_state
)
{
head_
=
new_state
;
}
};
};
}
}
}
}
}
}
...
...
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
View file @
bd826491
...
@@ -9,7 +9,7 @@ namespace data_structures {
...
@@ -9,7 +9,7 @@ namespace data_structures {
template
<
typename
T
>
template
<
typename
T
>
T
*
aligned_stack
::
push
(
const
T
&
object
)
{
T
*
aligned_stack
::
push
(
const
T
&
object
)
{
// Copy-Construct
// Copy-Construct
return
new
(
(
void
*
)
push
<
T
>
())
T
(
object
);
return
new
(
push
<
T
>
())
T
(
object
);
}
}
template
<
typename
T
>
template
<
typename
T
>
...
...
lib/pls/include/pls/internal/data_structures/work_stealing_deque.h
0 → 100644
View file @
bd826491
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/fork_join_task.h
View file @
bd826491
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
#include "pls/internal/helpers/profiler.h"
#include "pls/internal/helpers/profiler.h"
#include "pls/internal/data_structures/aligned_stack.h"
#include "pls/internal/data_structures/aligned_stack.h"
#include "pls/internal/data_structures/deque.h"
#include "pls/internal/data_structures/
work_stealing_
deque.h"
#include "abstract_task.h"
#include "abstract_task.h"
#include "thread_state.h"
#include "thread_state.h"
...
@@ -15,7 +15,7 @@ namespace internal {
...
@@ -15,7 +15,7 @@ namespace internal {
namespace
scheduling
{
namespace
scheduling
{
class
fork_join_task
;
class
fork_join_task
;
class
fork_join_sub_task
:
public
data_structures
::
deque_item
{
class
fork_join_sub_task
{
friend
class
fork_join_task
;
friend
class
fork_join_task
;
// Coordinate finishing of sub_tasks
// Coordinate finishing of sub_tasks
...
@@ -25,8 +25,11 @@ class fork_join_sub_task : public data_structures::deque_item {
...
@@ -25,8 +25,11 @@ class fork_join_sub_task : public data_structures::deque_item {
// Access to TBB scheduling environment
// Access to TBB scheduling environment
fork_join_task
*
tbb_task_
;
fork_join_task
*
tbb_task_
;
bool
executed
=
false
;
int
executed_at
=
-
1
;
// Stack Management (reset stack pointer after wait_for_all() calls)
// Stack Management (reset stack pointer after wait_for_all() calls)
data_structures
::
aligned_stack
::
state
stack
_state_
;
data_structures
::
work_stealing_deque
<
fork_join_sub_task
>::
state
deque
_state_
;
protected
:
protected
:
explicit
fork_join_sub_task
();
explicit
fork_join_sub_task
();
fork_join_sub_task
(
const
fork_join_sub_task
&
other
);
fork_join_sub_task
(
const
fork_join_sub_task
&
other
);
...
@@ -37,11 +40,10 @@ class fork_join_sub_task : public data_structures::deque_item {
...
@@ -37,11 +40,10 @@ class fork_join_sub_task : public data_structures::deque_item {
public
:
public
:
// Only use them when actually executing this sub_task (only public for simpler API design)
// Only use them when actually executing this sub_task (only public for simpler API design)
template
<
typename
T
>
template
<
typename
T
>
void
spawn_child
(
const
T
&
sub_task
);
void
spawn_child
(
T
&
sub_task
);
void
wait_for_all
();
void
wait_for_all
();
private
:
private
:
void
spawn_child_internal
(
fork_join_sub_task
*
sub_task
);
void
execute
();
void
execute
();
};
};
...
@@ -50,7 +52,7 @@ class fork_join_lambda_by_reference : public fork_join_sub_task {
...
@@ -50,7 +52,7 @@ class fork_join_lambda_by_reference : public fork_join_sub_task {
const
Function
*
function_
;
const
Function
*
function_
;
public
:
public
:
explicit
fork_join_lambda_by_reference
(
const
Function
*
function
)
:
function_
{
function
}
{};
explicit
fork_join_lambda_by_reference
(
const
Function
*
function
)
:
f
ork_join_sub_task
{},
f
unction_
{
function
}
{};
protected
:
protected
:
void
execute_internal
()
override
{
void
execute_internal
()
override
{
...
@@ -63,7 +65,7 @@ class fork_join_lambda_by_value : public fork_join_sub_task {
...
@@ -63,7 +65,7 @@ class fork_join_lambda_by_value : public fork_join_sub_task {
const
Function
function_
;
const
Function
function_
;
public
:
public
:
explicit
fork_join_lambda_by_value
(
const
Function
&
function
)
:
function_
{
function
}
{};
explicit
fork_join_lambda_by_value
(
const
Function
&
function
)
:
f
ork_join_sub_task
{},
f
unction_
{
function
}
{};
protected
:
protected
:
void
execute_internal
()
override
{
void
execute_internal
()
override
{
...
@@ -76,10 +78,9 @@ class fork_join_task : public abstract_task {
...
@@ -76,10 +78,9 @@ class fork_join_task : public abstract_task {
fork_join_sub_task
*
root_task_
;
fork_join_sub_task
*
root_task_
;
fork_join_sub_task
*
currently_executing_
;
fork_join_sub_task
*
currently_executing_
;
data_structures
::
aligned_stack
*
my_stack_
;
// Double-Ended Queue management
// Double-Ended Queue management
data_structures
::
deque
<
fork_join_sub_task
>
deque_
;
data_structures
::
work_stealing_
deque
<
fork_join_sub_task
>
deque_
;
// Steal Management
// Steal Management
fork_join_sub_task
*
last_stolen_
;
fork_join_sub_task
*
last_stolen_
;
...
@@ -97,12 +98,21 @@ class fork_join_task : public abstract_task {
...
@@ -97,12 +98,21 @@ class fork_join_task : public abstract_task {
};
};
template
<
typename
T
>
template
<
typename
T
>
void
fork_join_sub_task
::
spawn_child
(
const
T
&
task
)
{
void
fork_join_sub_task
::
spawn_child
(
T
&
task
)
{
PROFILE_FORK_JOIN_STEALING
(
"spawn_child"
)
PROFILE_FORK_JOIN_STEALING
(
"spawn_child"
)
static_assert
(
std
::
is_base_of
<
fork_join_sub_task
,
T
>::
value
,
"Only pass fork_join_sub_task subclasses!"
);
static_assert
(
std
::
is_base_of
<
fork_join_sub_task
,
T
>::
value
,
"Only pass fork_join_sub_task subclasses!"
);
T
*
new_task
=
tbb_task_
->
my_stack_
->
push
(
task
);
// Keep our refcount up to date
spawn_child_internal
(
new_task
);
ref_count_
++
;
// Assign forced values
task
.
parent_
=
this
;
task
.
tbb_task_
=
tbb_task_
;
task
.
deque_state_
=
tbb_task_
->
deque_
.
save_state
();
// Push on our deque
const
T
const_task
=
task
;
tbb_task_
->
deque_
.
push_tail
(
const_task
);
}
}
}
}
...
...
lib/pls/src/internal/base/alignment.cpp
View file @
bd826491
...
@@ -10,8 +10,8 @@ void *allocate_aligned(size_t size) {
...
@@ -10,8 +10,8 @@ void *allocate_aligned(size_t size) {
return
aligned_alloc
(
system_details
::
CACHE_LINE_SIZE
,
size
);
return
aligned_alloc
(
system_details
::
CACHE_LINE_SIZE
,
size
);
}
}
s
td
::
uintptr_t
next_alignment
(
std
::
uintpt
r_t
size
)
{
s
ystem_details
::
pointer_t
next_alignment
(
system_details
::
pointe
r_t
size
)
{
s
td
::
uintpt
r_t
miss_alignment
=
size
%
base
::
system_details
::
CACHE_LINE_SIZE
;
s
ystem_details
::
pointe
r_t
miss_alignment
=
size
%
base
::
system_details
::
CACHE_LINE_SIZE
;
if
(
miss_alignment
==
0
)
{
if
(
miss_alignment
==
0
)
{
return
size
;
return
size
;
}
else
{
}
else
{
...
@@ -19,8 +19,17 @@ std::uintptr_t next_alignment(std::uintptr_t size) {
...
@@ -19,8 +19,17 @@ std::uintptr_t next_alignment(std::uintptr_t size) {
}
}
}
}
system_details
::
pointer_t
previous_alignment
(
system_details
::
pointer_t
size
)
{
system_details
::
pointer_t
miss_alignment
=
size
%
base
::
system_details
::
CACHE_LINE_SIZE
;
if
(
miss_alignment
==
0
)
{
return
size
;
}
else
{
return
size
-
miss_alignment
;
}
}
char
*
next_alignment
(
char
*
pointer
)
{
char
*
next_alignment
(
char
*
pointer
)
{
return
reinterpret_cast
<
char
*>
(
next_alignment
(
reinterpret_cast
<
s
td
::
uintpt
r_t
>
(
pointer
)));
return
reinterpret_cast
<
char
*>
(
next_alignment
(
reinterpret_cast
<
s
ystem_details
::
pointe
r_t
>
(
pointer
)));
}
}
}
}
...
...
lib/pls/src/internal/base/swmr_spin_lock.cpp
View file @
bd826491
...
@@ -23,22 +23,22 @@ bool swmr_spin_lock::reader_try_lock() {
...
@@ -23,22 +23,22 @@ bool swmr_spin_lock::reader_try_lock() {
void
swmr_spin_lock
::
reader_unlock
()
{
void
swmr_spin_lock
::
reader_unlock
()
{
PROFILE_LOCK
(
"Release Read Lock"
)
PROFILE_LOCK
(
"Release Read Lock"
)
readers_
.
fetch_add
(
-
1
,
std
::
memory_order_release
)
;
readers_
--
;
}
}
void
swmr_spin_lock
::
writer_lock
()
{
void
swmr_spin_lock
::
writer_lock
()
{
PROFILE_LOCK
(
"Acquire Write Lock"
)
PROFILE_LOCK
(
"Acquire Write Lock"
)
// Tell the readers that we would like to write
// Tell the readers that we would like to write
write_request_
.
store
(
1
,
std
::
memory_order_acquire
)
;
write_request_
=
1
;
// Wait for all of them to exit the critical section
// Wait for all of them to exit the critical section
while
(
readers_
.
load
(
std
::
memory_order_acquire
)
>
0
)
while
(
readers_
>
0
)
system_details
::
relax_cpu
();
// Spin, not expensive as relaxed load
system_details
::
relax_cpu
();
// Spin, not expensive as relaxed load
}
}
void
swmr_spin_lock
::
writer_unlock
()
{
void
swmr_spin_lock
::
writer_unlock
()
{
PROFILE_LOCK
(
"Release Write Lock"
)
PROFILE_LOCK
(
"Release Write Lock"
)
write_request_
.
store
(
0
,
std
::
memory_order_release
)
;
write_request_
=
0
;
}
}
}
}
...
...
lib/pls/src/internal/base/ttas_spin_lock.cpp
View file @
bd826491
...
@@ -9,7 +9,7 @@ namespace base {
...
@@ -9,7 +9,7 @@ namespace base {
void
ttas_spin_lock
::
lock
()
{
void
ttas_spin_lock
::
lock
()
{
PROFILE_LOCK
(
"Acquire Lock"
)
PROFILE_LOCK
(
"Acquire Lock"
)
int
expected
=
0
;
int
expected
=
0
;
backoff
_
.
reset
()
;
backoff
backoff_
;
while
(
true
)
{
while
(
true
)
{
while
(
flag_
.
load
(
std
::
memory_order_relaxed
)
==
1
)
while
(
flag_
.
load
(
std
::
memory_order_relaxed
)
==
1
)
...
@@ -26,7 +26,7 @@ void ttas_spin_lock::lock() {
...
@@ -26,7 +26,7 @@ void ttas_spin_lock::lock() {
bool
ttas_spin_lock
::
try_lock
(
unsigned
int
num_tries
)
{
bool
ttas_spin_lock
::
try_lock
(
unsigned
int
num_tries
)
{
PROFILE_LOCK
(
"Try Acquire Lock"
)
PROFILE_LOCK
(
"Try Acquire Lock"
)
int
expected
=
0
;
int
expected
=
0
;
backoff
_
.
reset
()
;
backoff
backoff_
;
while
(
true
)
{
while
(
true
)
{
while
(
flag_
.
load
()
==
1
)
{
while
(
flag_
.
load
()
==
1
)
{
...
...
lib/pls/src/internal/data_structures/aligned_stack.cpp
View file @
bd826491
...
@@ -5,11 +5,16 @@ namespace pls {
...
@@ -5,11 +5,16 @@ namespace pls {
namespace
internal
{
namespace
internal
{
namespace
data_structures
{
namespace
data_structures
{
aligned_stack
::
aligned_stack
(
char
*
memory_region
,
const
std
::
size_t
size
)
:
aligned_stack
::
aligned_stack
(
pointer_t
memory_region
,
const
std
::
size_t
size
)
:
memory_start_
{
memory_region
},
memory_start_
{
memory_region
},
memory_end_
{
memory_region
+
size
},
memory_end_
{
memory_region
+
size
},
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
aligned_stack
::
aligned_stack
(
char
*
memory_region
,
const
std
::
size_t
size
)
:
memory_start_
{(
pointer_t
)
memory_region
},
memory_end_
{(
pointer_t
)
memory_region
+
size
},
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
}
}
}
}
}
}
lib/pls/src/internal/data_structures/deque.cpp
View file @
bd826491
...
@@ -14,11 +14,11 @@ deque_item *deque_internal::pop_head_internal() {
...
@@ -14,11 +14,11 @@ deque_item *deque_internal::pop_head_internal() {
}
}
deque_item
*
result
=
head_
;
deque_item
*
result
=
head_
;
head_
=
head_
->
prev
_
;
head_
=
head_
->
next
_
;
if
(
head_
==
nullptr
)
{
if
(
head_
==
nullptr
)
{
tail_
=
nullptr
;
tail_
=
nullptr
;
}
else
{
}
else
{
head_
->
next
_
=
nullptr
;
head_
->
prev
_
=
nullptr
;
}
}
return
result
;
return
result
;
...
@@ -32,11 +32,11 @@ deque_item *deque_internal::pop_tail_internal() {
...
@@ -32,11 +32,11 @@ deque_item *deque_internal::pop_tail_internal() {
}
}
deque_item
*
result
=
tail_
;
deque_item
*
result
=
tail_
;
tail_
=
tail_
->
next
_
;
tail_
=
tail_
->
prev
_
;
if
(
tail_
==
nullptr
)
{
if
(
tail_
==
nullptr
)
{
head_
=
nullptr
;
head_
=
nullptr
;
}
else
{
}
else
{
tail_
->
prev
_
=
nullptr
;
tail_
->
next
_
=
nullptr
;
}
}
return
result
;
return
result
;
...
@@ -46,12 +46,12 @@ void deque_internal::push_tail_internal(deque_item *new_item) {
...
@@ -46,12 +46,12 @@ void deque_internal::push_tail_internal(deque_item *new_item) {
std
::
lock_guard
<
base
::
spin_lock
>
lock
{
lock_
};
std
::
lock_guard
<
base
::
spin_lock
>
lock
{
lock_
};
if
(
tail_
!=
nullptr
)
{
if
(
tail_
!=
nullptr
)
{
tail_
->
prev
_
=
new_item
;
tail_
->
next
_
=
new_item
;
}
else
{
}
else
{
head_
=
new_item
;
head_
=
new_item
;
}
}
new_item
->
next
_
=
tail_
;
new_item
->
prev
_
=
tail_
;
new_item
->
prev
_
=
nullptr
;
new_item
->
next
_
=
nullptr
;
tail_
=
new_item
;
tail_
=
new_item
;
}
}
...
...
lib/pls/src/internal/scheduling/fork_join_task.cpp
View file @
bd826491
...
@@ -8,22 +8,26 @@ namespace internal {
...
@@ -8,22 +8,26 @@ namespace internal {
namespace
scheduling
{
namespace
scheduling
{
fork_join_sub_task
::
fork_join_sub_task
()
:
fork_join_sub_task
::
fork_join_sub_task
()
:
data_structures
::
deque_item
{},
ref_count_
{
0
},
ref_count_
{
0
},
parent_
{
nullptr
},
parent_
{
nullptr
},
tbb_task_
{
nullptr
},
tbb_task_
{
nullptr
},
stack_state_
{
nullptr
}
{}
deque_state_
{
0
}
{}
fork_join_sub_task
::
fork_join_sub_task
(
const
fork_join_sub_task
&
other
)
:
fork_join_sub_task
::
fork_join_sub_task
(
const
fork_join_sub_task
&
other
)
:
data_structures
::
deque_item
(
other
),
ref_count_
{
0
},
ref_count_
{
0
},
parent_
{
nullptr
},
parent_
{
other
.
parent_
},
tbb_task_
{
nullptr
},
tbb_task_
{
other
.
tbb_task_
},
stack_state_
{
nullptr
}
{}
deque_state_
{
other
.
deque_state_
}
{}
void
fork_join_sub_task
::
execute
()
{
void
fork_join_sub_task
::
execute
()
{
PROFILE_WORK_BLOCK
(
"execute sub_task"
)
PROFILE_WORK_BLOCK
(
"execute sub_task"
)
tbb_task_
->
currently_executing_
=
this
;
tbb_task_
->
currently_executing_
=
this
;
if
(
executed
)
{
int
my_id
=
base
::
this_thread
::
state
<
thread_state
>
()
->
id_
;
PLS_ERROR
(
"Double Execution!"
)
}
executed
=
true
;
executed_at
=
base
::
this_thread
::
state
<
thread_state
>
()
->
id_
;
execute_internal
();
execute_internal
();
tbb_task_
->
currently_executing_
=
nullptr
;
tbb_task_
->
currently_executing_
=
nullptr
;
PROFILE_END_BLOCK
PROFILE_END_BLOCK
...
@@ -34,18 +38,6 @@ void fork_join_sub_task::execute() {
...
@@ -34,18 +38,6 @@ void fork_join_sub_task::execute() {
}
}
}
}
void
fork_join_sub_task
::
spawn_child_internal
(
fork_join_sub_task
*
sub_task
)
{
// Keep our refcount up to date
ref_count_
++
;
// Assign forced values
sub_task
->
parent_
=
this
;
sub_task
->
tbb_task_
=
tbb_task_
;
sub_task
->
stack_state_
=
tbb_task_
->
my_stack_
->
save_state
();
tbb_task_
->
deque_
.
push_tail
(
sub_task
);
}
void
fork_join_sub_task
::
wait_for_all
()
{
void
fork_join_sub_task
::
wait_for_all
()
{
while
(
ref_count_
>
0
)
{
while
(
ref_count_
>
0
)
{
PROFILE_STEALING
(
"get local sub task"
)
PROFILE_STEALING
(
"get local sub task"
)
...
@@ -54,7 +46,6 @@ void fork_join_sub_task::wait_for_all() {
...
@@ -54,7 +46,6 @@ void fork_join_sub_task::wait_for_all() {
if
(
local_task
!=
nullptr
)
{
if
(
local_task
!=
nullptr
)
{
local_task
->
execute
();
local_task
->
execute
();
}
else
{
}
else
{
while
(
ref_count_
>
0
)
{
// Try to steal work.
// Try to steal work.
// External steal will be executed implicitly if success
// External steal will be executed implicitly if success
PROFILE_STEALING
(
"steal work"
)
PROFILE_STEALING
(
"steal work"
)
...
@@ -65,8 +56,7 @@ void fork_join_sub_task::wait_for_all() {
...
@@ -65,8 +56,7 @@ void fork_join_sub_task::wait_for_all() {
}
}
}
}
}
}
}
tbb_task_
->
deque_
.
release_memory_until
(
deque_state_
);
tbb_task_
->
my_stack_
->
reset_state
(
stack_state_
);
}
}
fork_join_sub_task
*
fork_join_task
::
get_local_sub_task
()
{
fork_join_sub_task
*
fork_join_task
::
get_local_sub_task
()
{
...
@@ -74,7 +64,9 @@ fork_join_sub_task *fork_join_task::get_local_sub_task() {
...
@@ -74,7 +64,9 @@ fork_join_sub_task *fork_join_task::get_local_sub_task() {
}
}
fork_join_sub_task
*
fork_join_task
::
get_stolen_sub_task
()
{
fork_join_sub_task
*
fork_join_task
::
get_stolen_sub_task
()
{
return
deque_
.
pop_head
();
auto
tmp
=
deque_
.
save_state
();
auto
result
=
deque_
.
pop_head
();
return
result
;
}
}
bool
fork_join_task
::
internal_stealing
(
abstract_task
*
other_task
)
{
bool
fork_join_task
::
internal_stealing
(
abstract_task
*
other_task
)
{
...
@@ -87,7 +79,7 @@ bool fork_join_task::internal_stealing(abstract_task *other_task) {
...
@@ -87,7 +79,7 @@ bool fork_join_task::internal_stealing(abstract_task *other_task) {
}
else
{
}
else
{
// Make sub-task belong to our fork_join_task instance
// Make sub-task belong to our fork_join_task instance
stolen_sub_task
->
tbb_task_
=
this
;
stolen_sub_task
->
tbb_task_
=
this
;
stolen_sub_task
->
stack_state_
=
my_stack_
->
save_state
();
stolen_sub_task
->
deque_state_
=
deque_
.
save_state
();
// We will execute this next without explicitly moving it onto our stack storage
// We will execute this next without explicitly moving it onto our stack storage
last_stolen_
=
stolen_sub_task
;
last_stolen_
=
stolen_sub_task
;
...
@@ -114,9 +106,12 @@ void fork_join_task::execute() {
...
@@ -114,9 +106,12 @@ void fork_join_task::execute() {
PROFILE_WORK_BLOCK
(
"execute fork_join_task"
);
PROFILE_WORK_BLOCK
(
"execute fork_join_task"
);
// Bind this instance to our OS thread
// Bind this instance to our OS thread
my_stack_
=
base
::
this_thread
::
state
<
thread_state
>
()
->
task_stack_
;
// TODO: See if we did this right
// my_stack_ = base::this_thread::state<thread_state>()->task_stack_;
deque_
.
reset_base_pointer
();
root_task_
->
tbb_task_
=
this
;
root_task_
->
tbb_task_
=
this
;
root_task_
->
stack_state_
=
my_stack_
->
save_state
();
root_task_
->
deque_state_
=
deque_
.
save_state
();
// Execute it on our OS thread until its finished
// Execute it on our OS thread until its finished
root_task_
->
execute
();
root_task_
->
execute
();
...
@@ -124,12 +119,12 @@ void fork_join_task::execute() {
...
@@ -124,12 +119,12 @@ void fork_join_task::execute() {
fork_join_sub_task
*
fork_join_task
::
currently_executing
()
const
{
return
currently_executing_
;
}
fork_join_sub_task
*
fork_join_task
::
currently_executing
()
const
{
return
currently_executing_
;
}
// Construct a fork_join_task that will run root_task when scheduled.
// The work stealing deque is backed by the calling worker thread's
// task stack, fetched from the thread-local thread_state.
// NOTE(review): this reads thread-local state at construction time, so
// the constructor must run on a worker thread that already has its
// thread_state set up — TODO confirm against the scheduler's call site.
fork_join_task::fork_join_task(fork_join_sub_task *root_task,
                               const abstract_task::id &id) :
    abstract_task{0, id},            // depth 0: this is a root-level task
    root_task_{root_task},
    currently_executing_{nullptr},   // set while a sub-task runs in execute()
    deque_{base::this_thread::state<thread_state>()->task_stack_},
    last_stolen_{nullptr} {}
}
}
...
...
test/CMakeLists.txt
View file @
bd826491
# Unit test driver: one executable aggregating all test translation units,
# linked against the Catch2 framework and the pls library under test.
add_executable(tests
        main.cpp
        base_tests.cpp scheduling_tests.cpp
        data_structures_test.cpp)
target_link_libraries(tests catch2 pls)
test/data_structures_test.cpp
View file @
bd826491
...
@@ -4,6 +4,7 @@
...
@@ -4,6 +4,7 @@
#include <pls/internal/data_structures/aligned_stack.h>
#include <pls/internal/data_structures/aligned_stack.h>
#include <pls/internal/data_structures/deque.h>
#include <pls/internal/data_structures/deque.h>
#include <pls/internal/data_structures/work_stealing_deque.h>
#include <vector>
#include <vector>
#include <mutex>
#include <mutex>
...
@@ -130,3 +131,90 @@ TEST_CASE("deque stores objects correctly", "[internal/data_structures/deque.h]"
...
@@ -130,3 +131,90 @@ TEST_CASE("deque stores objects correctly", "[internal/data_structures/deque.h]"
REQUIRE
(
deque
.
pop_tail
()
==
&
three
);
REQUIRE
(
deque
.
pop_tail
()
==
&
three
);
}
}
}
}
// Unit tests for work_stealing_deque: items are pushed at the tail by the
// owner and may be popped from either end (tail = owner side, head = thief
// side). The deque allocates its nodes from an externally provided
// aligned_stack.
//
// Fix: the Catch2 tag previously read [internal/data_structures/aligned_stack.h]
// (copy-pasted from the aligned_stack test case); this case exercises
// work_stealing_deque, so the tag now names its header.
TEST_CASE("work stealing deque stores objects correctly",
          "[internal/data_structures/work_stealing_deque.h]") {
  // Backing storage for the deque's nodes.
  constexpr long data_size = 2 << 14;
  char data[data_size];
  aligned_stack stack{data, data_size};

  work_stealing_deque<int> deque{&stack};

  int one = 1, two = 2, three = 3, four = 4;

  SECTION("add and remove items form the tail") {
    deque.push_tail(one);
    deque.push_tail(two);
    deque.push_tail(three);

    // Tail pops come back in LIFO order.
    REQUIRE(*deque.pop_tail() == three);
    REQUIRE(*deque.pop_tail() == two);
    REQUIRE(*deque.pop_tail() == one);
  }

  SECTION("handles getting empty by popping the tail correctly") {
    // Drain to empty via the tail, then make sure it is usable again.
    deque.push_tail(one);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_tail() == two);
  }

  SECTION("remove items form the head") {
    deque.push_tail(one);
    deque.push_tail(two);
    deque.push_tail(three);

    // Head pops come back in FIFO order.
    REQUIRE(*deque.pop_head() == one);
    REQUIRE(*deque.pop_head() == two);
    REQUIRE(*deque.pop_head() == three);
  }

  SECTION("handles getting empty by popping the head correctly") {
    // Drain to empty via the head, then make sure it is usable again.
    deque.push_tail(one);
    REQUIRE(*deque.pop_head() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_head() == two);
  }

  SECTION("handles getting empty by popping the head and tail correctly") {
    // Alternate which end empties the deque.
    deque.push_tail(one);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(two);
    REQUIRE(*deque.pop_head() == two);

    deque.push_tail(three);
    REQUIRE(*deque.pop_tail() == three);
  }

  SECTION("handles jumps bigger 1 correctly") {
    // Interleave pushes with a tail pop so head traversal has to skip
    // over a removed element.
    deque.push_tail(one);
    deque.push_tail(two);
    REQUIRE(*deque.pop_tail() == two);

    deque.push_tail(three);
    deque.push_tail(four);
    REQUIRE(*deque.pop_head() == one);
    REQUIRE(*deque.pop_head() == three);
    REQUIRE(*deque.pop_head() == four);
  }

  SECTION("handles stack reset 1 correctly when emptied by tail") {
    deque.push_tail(one);
    deque.push_tail(two);
    auto tmp_result = deque.pop_tail();
    REQUIRE(*tmp_result == two);

    // Returning memory to the backing stack must not corrupt the
    // remaining elements.
    deque.release_memory_until(tmp_result);
    REQUIRE(*deque.pop_tail() == one);

    deque.push_tail(three);
    deque.push_tail(four);
    REQUIRE(*deque.pop_head() == three);
    REQUIRE(*deque.pop_tail() == four);
  }

  SECTION("synces correctly") {
    // TODO(review): concurrency/synchronization test still to be written (WIP).
  }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment