las3_pub / predictable_parallel_patterns
Commit 83c6e622 authored 5 years ago by FritzFlorian
Draft of new context switching tasks.
parent 5e0ce1f5
Pipeline #1384 failed with stages in 31 seconds
Changes: 17 · Pipelines: 1
Showing 17 changed files with 186 additions and 863 deletions (+186 −863)
app/benchmark_fib/main.cpp                                         +13  −35
lib/context_switcher/include/context_switcher/context_switcher.h   +1   −5
lib/context_switcher/include/context_switcher/lambda_capture.h     +1   −0
lib/context_switcher/src/context_switcher.cpp                     +10   −0
lib/pls/CMakeLists.txt                                             +8   −2
lib/pls/include/pls/internal/scheduling/cont.h                     +0 −163
lib/pls/include/pls/internal/scheduling/cont_manager.h             +0 −203
lib/pls/include/pls/internal/scheduling/memory_block.h             +0 −121
lib/pls/include/pls/internal/scheduling/parallel_result.h          +0  −37
lib/pls/include/pls/internal/scheduling/scheduler.h                +9  −13
lib/pls/include/pls/internal/scheduling/scheduler_impl.h           +9 −129
lib/pls/include/pls/internal/scheduling/scheduler_memory.h         +4   −6
lib/pls/include/pls/internal/scheduling/task.h                    +59  −38
lib/pls/include/pls/internal/scheduling/task_manager.h            +36  −42
lib/pls/include/pls/internal/scheduling/thread_state.h            +19  −29
lib/pls/include/pls/internal/scheduling/thread_state_static.h      +3   −7
lib/pls/src/internal/scheduling/scheduler.cpp                     +14  −33
app/benchmark_fib/main.cpp  —  View file @ 83c6e622

 #include "pls/internal/scheduling/scheduler.h"
 #include "pls/internal/scheduling/parallel_result.h"
 #include "pls/internal/scheduling/scheduler_memory.h"
 #include "pls/internal/helpers/profiler.h"

 using namespace pls::internal::scheduling;
...
@@ -14,24 +12,20 @@ using namespace pls::internal::scheduling;
 using namespace comparison_benchmarks::base;

-parallel_result<int> pls_fib(int n) {
+int pls_fib(int n) {
   if (n <= 1) {
-    return parallel_result<int>{1};
+    return 1;
   }

-  return scheduler::par([=]() {
-    return pls_fib(n - 1);
-  }, [=]() {
-    return pls_fib(n - 2);
-  }).then([=](int a, int b) {
-    return parallel_result<int>{a + b};
-  });
+  int a = pls_fib(n - 1);
+  int b = pls_fib(n - 2);
+  return a + b;
 }

-constexpr int MAX_NUM_THREADS = 8;
+constexpr int MAX_NUM_THREADS = 1;
 constexpr int MAX_NUM_TASKS = 64;
-constexpr int MAX_NUM_CONTS = 64;
-constexpr int MAX_CONT_SIZE = 256;
+constexpr int MAX_STACK_SIZE = 128;

 int main(int argc, char **argv) {
   int num_threads;
...
@@ -39,43 +33,27 @@ int main(int argc, char **argv) {
   benchmark_runner::read_args(argc, argv, num_threads, directory);

   string test_name = to_string(num_threads) + ".csv";
-  string full_directory = directory + "/PLS_v2/";
+  string full_directory = directory + "/PLS_v3/";
   benchmark_runner runner{full_directory, test_name};

-  static_scheduler_memory<MAX_NUM_THREADS, MAX_NUM_TASKS, MAX_NUM_CONTS, MAX_CONT_SIZE> static_scheduler_memory;
+  static_scheduler_memory<MAX_NUM_THREADS, MAX_NUM_TASKS, MAX_STACK_SIZE> static_scheduler_memory;

-  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};
+  scheduler scheduler{static_scheduler_memory, (unsigned) num_threads};

   volatile int res;
   for (int i = 0; i < fib::NUM_WARMUP_ITERATIONS; i++) {
     scheduler.perform_work([&]() {
-      return scheduler::par([&]() {
-        return pls_fib(fib::INPUT_N);
-      }, []() {
-        return parallel_result<int>{0};
-      }).then([&](int result, int) {
-        res = result;
-        return parallel_result<int>{0};
-      });
+      res = pls_fib(fib::INPUT_N);
     });
   }

   for (int i = 0; i < fib::NUM_ITERATIONS; i++) {
     scheduler.perform_work([&]() {
       runner.start_iteration();
-      return scheduler::par([&]() {
-        return pls_fib(fib::INPUT_N);
-      }, []() {
-        return parallel_result<int>{0};
-      }).then([&](int result, int) {
-        res = result;
+      res = pls_fib(fib::INPUT_N);
       runner.end_iteration();
-        return parallel_result<int>{0};
-      });
     });
   }
   runner.commit_results(true);
...
lib/context_switcher/include/context_switcher/context_switcher.h  —  View file @ 83c6e622
...
@@ -24,11 +24,7 @@ continuation enter_context(assembly_bindings::stack_pointer_t stack_memory, size
   return continuation{assembly_bindings::__cs_enter_context(stack_base, captured_lambda, callback, stack_limit)};
 }

-continuation switch_context(continuation &&cont) {
-  assembly_bindings::continuation_t cont_pointer = cont.consume();
-
-  return continuation{assembly_bindings::__cs_switch_context(cont_pointer)};
-}
+continuation switch_context(continuation &&cont);

 }
...
lib/context_switcher/include/context_switcher/lambda_capture.h  —  View file @ 83c6e622
...
@@ -19,6 +19,7 @@ namespace context_switcher {

 template<typename F>
 struct lambda_capture {
+  // TODO: Check if we need an extra template here to perform the move
   explicit lambda_capture(F &&lambda) : lambda_{std::forward<F>(lambda)} {}

   assembly_bindings::continuation_t operator()(assembly_bindings::continuation_t continuation_pointer) {
...
lib/context_switcher/src/context_switcher.cpp  —  View file @ 83c6e622

+#include "context_switcher/context_switcher.h"
+
+namespace context_switcher {
+
+continuation switch_context(continuation &&cont) {
+  assembly_bindings::continuation_t cont_pointer = cont.consume();
+  return continuation{assembly_bindings::__cs_switch_context(cont_pointer)};
+}
+
+}
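For orientation, a minimal usage sketch of the two calls above (not part of this commit): it assumes the context_switcher library is linked, that a plain char array is acceptable as stack memory for enter_context, and it mirrors the lambda signature that task::run_as_task uses later in this diff.

#include <cstdio>
#include <utility>

#include "context_switcher/context_switcher.h"

int main() {
  // Dedicated stack for the coroutine-style call (size picked arbitrarily for the sketch).
  static char stack_memory[4096];

  // enter_context switches onto the given stack and runs the lambda, handing it the
  // caller's continuation; returning that continuation switches straight back.
  context_switcher::continuation done = context_switcher::enter_context(
      stack_memory, sizeof(stack_memory),
      [](context_switcher::continuation cont) {
        std::printf("running on the separate stack\n");
        return std::move(cont);
      });
  (void) done;

  return 0;
}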
lib/pls/CMakeLists.txt  —  View file @ 83c6e622
...
@@ -58,7 +58,12 @@ add_library(pls STATIC
         include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
-        include/pls/internal/scheduling/cont_manager.h
-        include/pls/internal/scheduling/cont.h
-        include/pls/internal/data_structures/bounded_ws_deque.h include/pls/internal/data_structures/optional.h include/pls/internal/scheduling/memory_block.h include/pls/internal/scheduling/thread_state_static.h src/internal/base/error_handling.cpp include/pls/internal/data_structures/bounded_trading_deque.h ../context_switcher/src/context_switcher.cpp
+        include/pls/internal/data_structures/bounded_ws_deque.h
+        include/pls/internal/data_structures/optional.h
+        include/pls/internal/scheduling/memory_block.h
+        include/pls/internal/scheduling/thread_state_static.h
+        src/internal/base/error_handling.cpp
+        include/pls/internal/data_structures/bounded_trading_deque.h
         )

 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
...
@@ -72,6 +77,7 @@ target_include_directories(pls
 # Add cmake dependencies here if needed
 target_link_libraries(pls
         Threads::Threads      # pthread support
+        context_switcher      # coroutine support
         )

 if (EASY_PROFILER)
   target_link_libraries(pls easy_profiler)
...
@@ -79,7 +85,7 @@ endif ()
 # Rules for istalling the library on a system
 # ...binaries
-INSTALL(TARGETS pls
+INSTALL(TARGETS pls context_switcher
         EXPORT pls-targets
         LIBRARY
         DESTINATION lib/pls
...
lib/pls/include/pls/internal/scheduling/cont.h  —  deleted 100644 → 0  —  View file @ 5e0ce1f5

#ifndef PLS_INTERNAL_SCHEDULING_CONT_H_
#define PLS_INTERNAL_SCHEDULING_CONT_H_

#include <type_traits>
#include <atomic>
#include <utility>

#include "pls/internal/data_structures/stamped_integer.h"
#include "pls/internal/data_structures/delayed_initialization.h"
#include "pls/internal/base/alignment.h"
#include "pls/internal/base/error_handling.h"
#include "pls/internal/helpers/profiler.h"

#include "parallel_result.h"
#include "memory_block.h"

namespace pls {
namespace internal {
namespace scheduling {

class base_cont {
 protected:
  // We plan to only init the members for a continuation on the slow path.
  // If we can execute everything inline we simply skip it saving runtime overhead.
  template<typename T>
  using delayed_init = data_structures::delayed_initialization<T>;

 public:
  explicit base_cont(base_cont *parent, memory_block *memory_block, bool is_right_child)
      : parent_{parent},
        memory_block_{memory_block},
        is_right_child_{is_right_child} {
    PLS_ASSERT(parent_ == nullptr || parent_->memory_block_->get_depth() == memory_block_->get_depth() - 1,
               "Must only build cont chains with matching depth!")
  };

  /**
   * Execute the continuation itself.
   * Make sure to only call when all required results are in.
   * Will store the result in it's parent, but not mess with any counters.
   */
  virtual void execute() = 0;

  /**
   * Execute the right hand side task associated with the continuation.
   * Will store the result in it's parent, but not mess with any counters.
   */
  virtual void execute_task() = 0;

  virtual base_task *get_task() = 0;

  virtual void *get_right_result_pointer() = 0;
  virtual void *get_left_result_pointer() = 0;

  template<typename T>
  void store_right_result(T &&result) {
    using BASE_T = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
    reinterpret_cast<delayed_init<BASE_T> *>(get_right_result_pointer())->initialize(std::forward<T>(result));
  }

  template<typename T>
  void store_left_result(T &&result) {
    using BASE_T = typename std::remove_cv<typename std::remove_reference<T>::type>::type;
    reinterpret_cast<delayed_init<BASE_T> *>(get_left_result_pointer())->initialize(std::forward<T>(result));
  }

  base_cont *get_parent() { return parent_; }
  memory_block *get_memory_block() { return memory_block_; }
  bool is_right_child() const { return is_right_child_; }

 protected:
  base_cont *parent_;
  memory_block *memory_block_;
  bool is_right_child_;
};

template<typename T2, typename R1, typename R2, typename F>
class cont : public base_cont {
 private:
  template<typename RES_TYPE>
  struct result_runner {
    // Strip off unwanted modifiers...
    using BASE_RES_TYPE = typename std::remove_cv<typename std::remove_reference<RES_TYPE>::type>::type;

    static void execute(cont &cont) {
      parallel_result<BASE_RES_TYPE> result{cont.function_((*cont.left_result_).value(), (*cont.right_result_).value())};
      if (result.fast_path() && cont.parent_ != nullptr) {
        if (cont.is_right_child()) {
          cont.parent_->store_right_result(std::move(result));
        } else {
          cont.parent_->store_left_result(std::move(result));
        }
      }
    }
  };

  template<typename INNER_TYPE>
  struct result_runner<parallel_result<INNER_TYPE>> {
    static void execute(cont &cont) {
      auto result = cont.function_((*cont.left_result_).value(), (*cont.right_result_).value());
      if (result.fast_path() && cont.parent_) {
        if (cont.is_right_child()) {
          cont.parent_->store_right_result(std::move(result));
        } else {
          cont.parent_->store_left_result(std::move(result));
        }
      }
    }
  };

 public:
  template<typename FARG, typename ...T2ARGS>
  explicit cont(base_cont *parent,
                memory_block *memory_block,
                bool is_right_child,
                FARG &&function,
                T2ARGS...task_2_args)
      : base_cont(parent, memory_block, is_right_child),
        function_{std::forward<FARG>(function)},
        task_{std::forward<T2ARGS>(task_2_args)..., this} {};

  void execute() override {
    PROFILE_CONTINUATION("execute_cont");
    using result_type = decltype(function_((*left_result_).value(), (*right_result_).value()));
    result_runner<result_type>::execute(*this);

    this->~cont();
    auto *memory_block = this->get_memory_block();
    memory_block->free_buffer();
    memory_block->reset_state();
  }

  void execute_task() override {
    task_.execute();
  }

  base_task *get_task() override {
    return &task_;
  }

  void *get_left_result_pointer() override {
    return &left_result_;
  }

  void *get_right_result_pointer() override {
    return &right_result_;
  }

 private:
  // Initial data members. These slow down the fast path, try to init them lazy when possible.
  F function_;
  T2 task_;

  // Some fields/actual values stay uninitialized (save time on the fast path if we don not need them).
  // More fields untouched on the fast path is good, but for ease of an implementation we only keep some for now.
  delayed_init<R1> left_result_;
  delayed_init<R2> right_result_;
};

}
}
}

#endif //PLS_INTERNAL_SCHEDULING_CONT_H_
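The deleted cont class above dispatches on the continuation function's return type via partial specialization of result_runner: a plain value gets wrapped into a parallel_result, an already wrapped result is forwarded unchanged. A standalone sketch of that dispatch technique (C++14, with a toy wrapped<T> standing in for parallel_result<T>):

#include <cstdio>

template<typename T>
struct wrapped { T value; };  // toy stand-in for parallel_result<T>

// Generic case: the callable returns a plain value, so wrap it ourselves.
template<typename RES_TYPE>
struct result_runner {
  template<typename F>
  static wrapped<RES_TYPE> execute(F &&function) { return wrapped<RES_TYPE>{function()}; }
};

// Partial specialization: the callable already returns wrapped<T>, pass it through.
template<typename INNER_TYPE>
struct result_runner<wrapped<INNER_TYPE>> {
  template<typename F>
  static wrapped<INNER_TYPE> execute(F &&function) { return function(); }
};

template<typename F>
auto run(F &&function) {
  using result_type = decltype(function());
  return result_runner<result_type>::execute(function);
}

int main() {
  auto a = run([]() { return 1; });                // plain int, gets wrapped
  auto b = run([]() { return wrapped<int>{2}; });  // already wrapped, forwarded unchanged
  std::printf("%d %d\n", a.value, b.value);
  return 0;
}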
lib/pls/include/pls/internal/scheduling/cont_manager.h  —  deleted 100644 → 0  —  View file @ 5e0ce1f5

#ifndef PLS_CONT_MANAGER_H_
#define PLS_CONT_MANAGER_H_

#include <memory>
#include <utility>
#include <array>

#include "pls/internal/data_structures/aligned_stack.h"
#include "pls/internal/scheduling/cont.h"
#include "pls/internal/scheduling/thread_state.h"

namespace pls {
namespace internal {
namespace scheduling {

class cont_manager {
 public:
  // Helper to pass the compile time constants to the constructor.
  template<size_t NUM_CONTS, size_t MAX_CONT_SIZE>
  struct template_args {};

  template<size_t NUM_CONTS, size_t MAX_CONT_SIZE>
  explicit cont_manager(data_structures::aligned_stack &cont_storage, template_args<NUM_CONTS, MAX_CONT_SIZE>)
      : max_cont_size_{MAX_CONT_SIZE}, num_conts_{NUM_CONTS} {

    // First node is currently active and our local start
    active_node_ = init_memory_block<MAX_CONT_SIZE>(cont_storage, nullptr, 0);

    // Build up chain after it
    memory_block *current_node = active_node_;
    for (size_t i = 1; i < NUM_CONTS; i++) {
      memory_block *next_node = init_memory_block<MAX_CONT_SIZE>(cont_storage, current_node, i);
      current_node->set_next(next_node);
      current_node = next_node;
    }
  };

  // Aquire and release memory blocks...
  memory_block *get_next_memory_block() {
    auto result = active_node_;
    active_node_ = active_node_->get_next();
    return result;
  }

  void return_memory_block() {
    active_node_ = active_node_->get_prev();
  }

  void move_active_node(int depth) {
    if (depth < 0) {
      for (long i = 0; i < (depth * -1); i++) {
        active_node_ = active_node_->get_prev();
      }
    } else {
      for (long i = 0; i < depth; i++) {
        active_node_ = active_node_->get_next();
      }
    }
  }

  void move_active_node_to_start() {
    move_active_node(-1 * active_node_->get_depth());
  }

  memory_block *get_active_node() {
    return active_node_;
  }

  bool is_clean() {
    if (get_active_node()->get_depth() == 0) {
      memory_block *current_node = active_node_;
      for (size_t i = 1; i < num_conts_; i++) {
        if (current_node->get_prev() != nullptr && current_node->get_prev()->get_next() != current_node) {
          return false;
        }
        if (current_node->is_buffer_used()) {
          return false;
        }
        current_node = current_node->get_next();
      }
    } else {
      return false;
    }

    return true;
  }

  // Manage the fall through behaviour/slow path behaviour
  bool falling_through() const {
    return fall_through_;
  }

  void fall_through_and_notify_cont(base_cont *notified_cont, bool is_child_right) {
    fall_through_ = true;
    fall_through_cont_ = notified_cont;
    fall_through_child_right = is_child_right;
  }

  void aquire_memory_chain(memory_block *target_chain) {
    PLS_ASSERT(active_node_->get_depth() == target_chain->get_depth() + 1,
               "Can only steal aquire chain parts with correct depth.");

    active_node_->set_prev(target_chain);
    target_chain->set_next(active_node_);
  }

  void execute_fall_through_code() {
    PLS_ASSERT(falling_through(), "Must be falling through to execute the associated code.")

    auto &my_state = thread_state::get();

    // Copy fall through status and reset it (for potentially nested execution paths).
    auto *notified_cont = fall_through_cont_;
    fall_through_cont_ = nullptr;
    fall_through_ = false;

    // Keep the target chain before we execute, as this potentially frees the memory
    auto *target_memory_block = notified_cont->get_memory_block();
    auto *target_chain = target_memory_block->get_offered_chain().load();

    // Notify the next continuation of finishing a child...
    if (target_memory_block->get_results_missing().fetch_add(-1) == 1) {
      // ... we finished the continuation.
      // We are now in charge continuing to execute the above continuation chain.

      PLS_ASSERT(active_node_->get_prev()->get_depth() == target_memory_block->get_depth(),
                 "We must hold the system invariant to be in the correct depth.")
      if (active_node_->get_prev() != target_memory_block) {
        // We do not own the thing we will execute.
        // Own it by swapping the chain belonging to it in.
        aquire_memory_chain(target_memory_block);
      }
      my_state.parent_cont_ = notified_cont->get_parent();
      my_state.right_spawn_ = notified_cont->is_right_child();
      active_node_ = target_memory_block;
      notified_cont->execute();
      if (!falling_through() && notified_cont->get_parent() != nullptr) {
        fall_through_and_notify_cont(notified_cont->get_parent(), notified_cont->is_right_child());
      }
      return;
    } else {
      // ... we did not finish the last continuation.
      // We are no longer in charge of executing the above continuation chain.

      PLS_ASSERT(active_node_->get_prev()->get_depth() == target_memory_block->get_depth(),
                 "We must hold the system invariant to be in the correct depth.")
      if (active_node_->get_prev() == target_memory_block) {
        // We own the thing we are not allowed to execute.
        // Get rid of the ownership by using the offered chain.
        aquire_memory_chain(target_chain);
      }
      move_active_node_to_start();

      // We are done here...nothing more to execute
      return;
    }
  }

 private:
  template<size_t MAX_CONT_SIZE>
  static memory_block *init_memory_block(data_structures::aligned_stack &cont_storage,
                                         memory_block *prev,
                                         unsigned long depth) {
    // Represents one cont_node and its corresponding memory buffer (as one continuous block of memory).
    constexpr size_t buffer_size = MAX_CONT_SIZE - base::alignment::next_alignment(sizeof(memory_block));
    char *memory_block_ptr = cont_storage.push_bytes<memory_block>();
    char *memory_block_buffer_ptr = cont_storage.push_bytes(buffer_size);

    return new(memory_block_ptr) memory_block(memory_block_buffer_ptr, buffer_size, prev, depth);
  }

 private:
  const size_t max_cont_size_;
  const size_t num_conts_;

  /**
   * Managing the continuation chain.
   */
  memory_block *active_node_;

  /**
   * Managing falling through back to the scheduler.
   */
  bool fall_through_{false};
  bool fall_through_child_right{false};
  base_cont *fall_through_cont_{nullptr};
};

template<size_t NUM_CONTS, size_t MAX_CONT_SIZE>
class static_cont_manager {
 public:
  static_cont_manager()
      : static_cont_storage_{},
        cont_manager_(static_cont_storage_.get_stack(), cont_manager::template_args<NUM_CONTS, MAX_CONT_SIZE>{}) {}

  cont_manager &get_cont_manager() { return cont_manager_; }

 private:
  data_structures::static_aligned_stack<NUM_CONTS * MAX_CONT_SIZE> static_cont_storage_;
  cont_manager cont_manager_;
};

}
}
}

#endif //PLS_CONT_MANAGER_H_
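The decisive step in execute_fall_through_code() above is the atomic results_missing_ counter: both children decrement it, and only the thread whose fetch_add(-1) returns 1 (i.e. the one that delivered the second, final result) may run the continuation. A self-contained sketch of that "last finishing child executes" pattern:

#include <atomic>
#include <cstdio>
#include <thread>

int main() {
  std::atomic<unsigned short> results_missing{2};
  int left = 0, right = 0;

  // Whichever thread's decrement observes 1 delivered the *second* result and is
  // therefore the one in charge of running the continuation (mirrors results_missing_).
  auto finish_child = [&]() {
    if (results_missing.fetch_add(-1) == 1) {
      std::printf("last finishing child runs the continuation: %d\n", left + right);
    }
  };

  std::thread other_child([&]() {
    right = 2;
    finish_child();
  });

  left = 1;
  finish_child();

  other_child.join();
  return 0;
}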
lib/pls/include/pls/internal/scheduling/memory_block.h  —  deleted 100644 → 0  —  View file @ 5e0ce1f5

#ifndef PLS_INTERNAL_SCHEDULING_CONT_NODE_H_
#define PLS_INTERNAL_SCHEDULING_CONT_NODE_H_

namespace pls {
namespace internal {
namespace scheduling {

/**
 * A block of memory that can be used to store tasks and continuations.
 * Threads trade these blocks while executing and stealing tasks.
 *
 * Each block has an associated, raw memory buffer. The user can place his object
 * in this memory region as needed. He is responsible for calling deconstructors of the
 * placed objects.
 */
class memory_block {
 public:
  memory_block(char *memory_buffer,
               size_t memory_buffer_size,
               memory_block *prev,
               int depth)
      : prev_{prev},
        next_{nullptr},
        offered_chain_{nullptr},
        results_missing_{2},
        memory_buffer_{memory_buffer},
        memory_buffer_size_{memory_buffer_size},
        memory_buffer_used_{false},
        depth_{depth} {};

  template<typename T, typename ...ARGS>
  T *place_in_buffer(ARGS &&...args) {
    PLS_ASSERT(!memory_buffer_used_, "Must only allocate one continuation at once per node.");

    memory_buffer_used_ = true;
    auto *result = new(memory_buffer_) T(std::forward<ARGS>(args)...);
    continuation_ = result;
    return result;
  }

  void free_buffer() {
    PLS_ASSERT(memory_buffer_used_, "Can only free a memory spot when it is in use.")
    memory_buffer_used_ = false;
  }

  bool is_buffer_used() {
    return memory_buffer_used_;
  }

  base_cont *get_cont() {
    PLS_ASSERT(is_buffer_used(), "Can only read initialized buffer!");
    return continuation_;
  }

  memory_block *get_prev() {
    return prev_;
  }

  void set_prev(memory_block *prev) {
    prev_ = prev;
  }

  memory_block *get_next() {
    return next_;
  }

  void set_next(memory_block *next) {
    next_ = next;
  }

  std::atomic<memory_block *> &get_offered_chain() {
    return offered_chain_;
  }

  std::atomic<unsigned short> &get_results_missing() {
    return results_missing_;
  }

  int get_depth() const noexcept {
    return depth_;
  }

  void reset_state() {
    offered_chain_.store(nullptr);
    results_missing_.store(2);
  }

 private:
  // Linked list property of memory blocks (a complete list represents a threads currently owned memory).
  // Each block knows its chain start to allow stealing a whole chain in O(1)
  // without the need to traverse back to the chain start.
  memory_block *prev_, *next_;

  // When blocked on this chain element, we need to know what other chain of memory we
  // got offered by the stealing thread.
  // For this we need the offered chain's element up to the point we can steal.
  std::atomic<memory_block *> offered_chain_;

  // Management for coordinating concurrent result writing and stealing.
  // The result count decides atomically who gets to execute the continuation
  // and who therefore get's to own this memory block chain.
  std::atomic<unsigned short> results_missing_;

  // Pointer to memory region reserved for the companion continuation.
  // Must be a buffer big enough to hold any continuation encountered in the program.
  // This memory is managed explicitly by the continuation manager and runtime system
  // (they need to make sure to always call de-constructors and never allocate two continuations).
  char *memory_buffer_;
  base_cont *continuation_;

  // These two are only helper properties helping with bugs during development.
  size_t memory_buffer_size_;
  bool memory_buffer_used_;

  // Each element stays at a fixed depth for the entire application run.
  // Swapping parts of a memory chain will not reorder it, as always parts of
  // the same size are exchanged.
  const int depth_;
};

}
}
}

#endif //PLS_INTERNAL_SCHEDULING_CONT_NODE_H_
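place_in_buffer()/free_buffer() above boil down to placement-new into a pre-reserved raw buffer, with the runtime (not the buffer) responsible for calling the destructor, as the class comment notes. A standalone sketch of that pattern in plain C++:

#include <cstdio>
#include <new>

struct widget {
  explicit widget(int v) : value{v} { std::printf("constructed %d\n", value); }
  ~widget() { std::printf("destructed %d\n", value); }
  int value;
};

int main() {
  // Raw, reusable storage standing in for memory_block's memory_buffer_.
  alignas(widget) char buffer[sizeof(widget)];
  bool buffer_used = false;

  // place_in_buffer: construct the object directly inside the pre-reserved buffer.
  buffer_used = true;
  widget *object = new (buffer) widget(42);

  // The runtime, not the buffer, must run the destructor before the spot is reused.
  object->~widget();
  buffer_used = false;  // free_buffer: the slot may now hold the next object

  (void) buffer_used;
  return 0;
}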
lib/pls/include/pls/internal/scheduling/parallel_result.h  —  deleted 100644 → 0  —  View file @ 5e0ce1f5

#ifndef PLS_INTERNAL_SCHEDULING_PARALLEL_RESULT_H_
#define PLS_INTERNAL_SCHEDULING_PARALLEL_RESULT_H_

#include <utility>

#include "pls/internal/data_structures/delayed_initialization.h"

namespace pls {
namespace internal {
namespace scheduling {

// Used to more enforce the use of parallel_results
class parallel_result_base {};

template<typename T>
class parallel_result : public parallel_result_base {
 public:
  using value_type = T;

  parallel_result() = default;
  parallel_result(parallel_result &&other) noexcept : val_{std::move(other.val_)} {}
  parallel_result(const parallel_result &other) noexcept : val_{other.val_} {}

  parallel_result(T val) : val_{std::move(val)} {}

  bool fast_path() { return val_.initialized(); }
  T &value() { return val_.object(); }

 private:
  data_structures::delayed_initialization<T> val_;
};

}
}
}

#endif //PLS_INTERNAL_SCHEDULING_PARALLEL_RESULT_H_
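parallel_result<T> signals the fast path purely by whether its delayed_initialization slot ever received a value: default construction means "the value will arrive via the slow path", value construction means "the result is already here". A rough standalone analogue (using std::optional, C++17, in place of the library's delayed_initialization):

#include <cstdio>
#include <optional>
#include <utility>

// Simplified stand-in for parallel_result<T>: fast_path() just means "a value is present".
template<typename T>
class result {
 public:
  result() = default;                      // empty: the value will arrive on the slow path
  result(T val) : val_{std::move(val)} {}  // value known immediately: fast path

  bool fast_path() { return val_.has_value(); }
  T &value() { return *val_; }

 private:
  std::optional<T> val_;
};

int main() {
  result<int> fast{42};
  result<int> slow{};

  std::printf("fast_path(fast) = %d, fast_path(slow) = %d\n", (int) fast.fast_path(), (int) slow.fast_path());
  if (fast.fast_path()) {
    std::printf("value: %d\n", fast.value());
  }
  return 0;
}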
lib/pls/include/pls/internal/scheduling/scheduler.h  —  View file @ 83c6e622
...
@@ -53,23 +53,20 @@ class scheduler {
   template<typename Function>
   void perform_work(Function work_section);

+  template<typename Function>
+  void spawn(Function &&lambda) {
+    // TODO: place function on next active
+    // TODO: capture continuation in current active
+    // TODO: advance current active
+    // TODO: after finish, return to last active (if not stolen)
+    // TODO: revert current active
+  }
+
   /**
    * Explicitly terminate the worker threads. Scheduler must not be used after this.
    */
   void terminate();

-  /**
-   * Temporary object used for the parallel(...).then(...) API.
-   */
-  template<typename F1, typename F2>
-  struct starter;
-
-  template<typename F1, typename F2>
-  starter<F1, F2> invoke_parallel(F1 &&function_1, F2 &&function_2);
-
-  template<typename F1, typename F2>
-  static starter<F1, F2> par(F1 &&function_1, F2 &&function_2);
-
   unsigned int num_threads() const { return num_threads_; }

 private:
...
@@ -77,7 +74,6 @@ class scheduler {
   void work_thread_work_section();

   thread_state &thread_state_for(size_t id);

-  friend class base_task;
   const unsigned int num_threads_;
   const bool reuse_thread_;
   scheduler_memory &memory_;
...
lib/pls/include/pls/internal/scheduling/scheduler_impl.h  —  View file @ 83c6e622
...
@@ -3,134 +3,17 @@
 #define PLS_SCHEDULER_IMPL_H

 #include <utility>

-#include "pls/internal/scheduling/cont.h"
-#include "pls/internal/scheduling/parallel_result.h"
-#include "pls/internal/scheduling/task.h"
+#include "context_switcher/context_switcher.h"
+#include "context_switcher/continuation.h"
+
+#include "pls/internal/scheduling/task.h"
 #include "pls/internal/helpers/profiler.h"

 namespace pls {
 namespace internal {
 namespace scheduling {

-template<typename F1, typename F2>
-struct scheduler::starter {
-  F1 function_1_;
-  F2 function_2_;
-  using return_type_1 = decltype(function_1_());
-  using return_type_2 = decltype(function_2_());
-  // Enforce correct return types of lambdas (parallel_result)
-  static_assert(std::is_base_of<parallel_result_base, return_type_1>::value,
-                "Must only return parallel results in parallel code");
-  static_assert(std::is_base_of<parallel_result_base, return_type_2>::value,
-                "Must only return parallel results in parallel code");
-
-  template<typename F1ARG, typename F2ARG>
-  explicit starter(F1ARG &&function_1, F2ARG &&function_2)
-      : function_1_{std::forward<F1ARG>(function_1)},
-        function_2_{std::forward<F2ARG>(function_2)} {};
-
-  template<typename FCONT>
-  auto then(FCONT &&cont_function)
-  -> decltype(cont_function(std::declval<typename return_type_1::value_type>(),
-                            std::declval<typename return_type_2::value_type>())) {
-    PROFILE_FAST_PATH("then");
-    using continuation_type = cont<task<F2>, return_type_1, return_type_2, FCONT>;
-    using result_type = decltype(cont_function(std::declval<typename return_type_1::value_type>(),
-                                               std::declval<typename return_type_2::value_type>()));
-    auto &my_state = thread_state::get();
-    auto &cont_manager = my_state.get_cont_manager();
-
-    // Select current memory block.
-    // For now directly copy both the continuation function and the second task.
-    // (We might optimize this in the future to require less memory copies.)
-    auto *current_memory_block = cont_manager.get_next_memory_block();
-
-    // We keep track of the last spawn to build up the parent_cont chain
-    const bool is_right_cont = my_state.right_spawn_;
-    base_cont *parent_cont = my_state.parent_cont_;
-    continuation_type *current_cont = current_memory_block->place_in_buffer<continuation_type>(parent_cont,
-                                                                                               current_memory_block,
-                                                                                               is_right_cont,
-                                                                                               cont_function,
-                                                                                               function_2_);
-    my_state.parent_cont_ = current_cont;
-
-    // Publish the second task.
-    my_state.get_task_manager().publish_task(current_cont->get_task());
-
-    // Call first function on fast path
-    my_state.right_spawn_ = false;
-    return_type_1 result_1 = function_1_();
-    if (!result_1.fast_path()) {
-      // Get our replacement from the task stack and store it for later use when we are actually blocked.
-      auto traded_memory = my_state.get_task_manager().try_pop_local();
-      current_cont->get_memory_block()->get_offered_chain().store(*traded_memory);
-
-      // Unwind stack...
-      return result_type{};
-    }
-
-    // Try to call second function on fast path
-    auto traded_memory = my_state.get_task_manager().try_pop_local();
-    if (traded_memory) {
-      // The task got stolen...
-      // ...but we got a memory block that can be used if we block on this one.
-      current_cont->get_memory_block()->get_offered_chain().store(*traded_memory);
-
-      // Main scheduling loop is responsible for entering the result to the slow path...
-      current_cont->store_left_result(std::move(result_1));
-      cont_manager.fall_through_and_notify_cont(current_cont, false);
-      // Unwind stack...
-      return result_type{};
-    } else {
-      my_state.right_spawn_ = true;
-      return_type_2 result_2 = function_2_();
-      if (!result_2.fast_path()) {
-        // Main scheduling loop is responsible for entering the result to the slow path...
-        current_cont->store_left_result(std::move(result_1));
-        current_cont->get_memory_block()->get_results_missing().fetch_add(-1);
-        // Unwind stack...
-        return result_type{};
-      }
-
-      // We fully got all results, inline as good as possible.
-      // This is the common case, branch prediction should be rather good here.
-      // Just return the cont object unused and directly call the function.
-      current_cont->~continuation_type();
-      current_memory_block->free_buffer();
-      cont_manager.return_memory_block();
-
-      // The continuation has the same execution environment as we had for the children.
-      // We need this to allow spawns in there.
-      my_state.parent_cont_ = parent_cont;
-      my_state.right_spawn_ = is_right_cont;
-
-      auto cont_result = cont_function(result_1.value(), result_2.value());
-      if (!cont_result.fast_path()) {
-        // Unwind stack...
-        return result_type{};
-      }
-      return cont_result;
-    }
-  };
-};
-
-template<typename F1, typename F2>
-scheduler::starter<F1, F2> scheduler::invoke_parallel(F1 &&function_1, F2 &&function_2) {
-  return scheduler::starter<F1, F2>{std::forward<F1>(function_1), std::forward<F2>(function_2)};
-}
-
-template<typename F1, typename F2>
-scheduler::starter<F1, F2> scheduler::par(F1 &&function_1, F2 &&function_2) {
-  return thread_state::get().get_scheduler().invoke_parallel(std::forward<F1>(function_1),
-                                                             std::forward<F2>(function_2));
-}
-
 class scheduler::init_function {
  public:
  virtual void run() = 0;
...
@@ -140,15 +23,12 @@ class scheduler::init_function_impl : public init_function {
 public:
  explicit init_function_impl(F &function) : function_{function} {}
  void run() override {
-    scheduler::par([]() {
-      return parallel_result<int>{0};
-    }, [=]() {
-      return function_();
-    }).then([=](int, int b) {
-      thread_state::get().get_scheduler().work_section_done_ = true;
-      return parallel_result<int>{b};
-    });
+    auto &thread_state = thread_state::get();
+    thread_state.get_task_manager().get_active_task().run_as_task([&](context_switcher::continuation cont) {
+      function_();
+      return std::move(cont);
+    });
+    thread_state.get_scheduler().work_section_done_.store(true);
  }

 private:
  F &function_;
...
lib/pls/include/pls/internal/scheduling/scheduler_memory.h  —  View file @ 83c6e622
...
@@ -11,8 +11,6 @@ namespace pls {
 namespace internal {
 namespace scheduling {

-void worker_routine();
-
 class scheduler_memory {
  // Note: scheduler_memory is a pure interface and has no data.
  // By not having an initialization routine we can do our 'static and heap specialization'
...
@@ -29,7 +27,7 @@ class scheduler_memory {
  }
 };

-template<size_t MAX_THREADS, size_t NUM_TASKS, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
+template<size_t MAX_THREADS, size_t NUM_TASKS, size_t STACK_SIZE>
 class static_scheduler_memory : public scheduler_memory {
  public:
  static_scheduler_memory() : scheduler_memory{} {
...
@@ -47,14 +45,14 @@ class static_scheduler_memory : public scheduler_memory {
    return threads_[id];
  }

 private:
-  using thread_state_type = thread_state_static<NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE>;
+  using thread_state_type = thread_state_static<NUM_TASKS, STACK_SIZE>;

  alignas(base::system_details::CACHE_LINE_SIZE) std::array<base::thread, MAX_THREADS> threads_;
  alignas(base::system_details::CACHE_LINE_SIZE) std::array<thread_state_type, MAX_THREADS> thread_states_;
  alignas(base::system_details::CACHE_LINE_SIZE) std::array<thread_state *, MAX_THREADS> thread_state_pointers_;
 };

-template<size_t NUM_TASKS, size_t MAX_TASK_STACK_SIZE, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
+template<size_t NUM_TASKS, size_t STACK_SIZE>
 class heap_scheduler_memory : public scheduler_memory {
  public:
  explicit heap_scheduler_memory(size_t max_threads) : max_threads_{max_threads},
...
@@ -80,7 +78,7 @@ class heap_scheduler_memory : public scheduler_memory {
    return thread_vector_[id];
  }

 private:
-  using thread_state_type = thread_state_static<NUM_TASKS, NUM_CONTS, MAX_CONT_SIZE>;
+  using thread_state_type = thread_state_static<NUM_TASKS, STACK_SIZE>;

  // thread_state_type is aligned at the cache line and therefore overaligned (C++ 11 does not require
  // the new operator to obey alignments bigger than 16, cache lines are usually 64).
  // To allow this object to be allocated using 'new' (which the vector does internally),
...
lib/pls/include/pls/internal/scheduling/task.h  —  View file @ 83c6e622

 #ifndef PLS_TASK_H
 #define PLS_TASK_H

-#include "pls/internal/scheduling/cont.h"
-#include "pls/internal/scheduling/memory_block.h"
+#include <utility>

 #include "pls/internal/helpers/profiler.h"
+#include "context_switcher/continuation.h"
+#include "context_switcher/context_switcher.h"
 #include "pls/internal/base/system_details.h"

 namespace pls {
 namespace internal {
 namespace scheduling {

 /**
  * A task to be executed by the runtime system.
  * Tasks are guaranteed to be executed exactly once.
  * A task is the smallest unit of execution seen by the runtime system.
  *
- * Override the execute_internal() method for your custom code.
+ * Tasks represent a action dispatched by a potentially parallel call.
+ * Tasks have their own execution context (stack and register state), making them stackefull coroutines.
+ * Tasks can be suspended and resumed (stealing happens by resuming a task).
+ *
+ * Being coroutines tasks go through a very deliberate state machine:
+ *  - initialized (no execution state)
+ *  - running (currently executing user code)
+ *  - suspended (suspended by switching to a different task).
  */
-class base_task {
- public:
-  /**
-   * Executes the task and stores its result in the correct continuation.
-   * The caller must handle finishing the continuation/informing it that task two was finished.
-   */
-  void execute() {
-    execute_internal();
-  }
-
-  base_cont *get_cont() {
-    return cont_;
-  }
-
- protected:
-  explicit base_task(base_cont *cont) : cont_{cont} {};
-
-  /**
-   * Overwrite this with the actual behaviour of concrete tasks.
-   */
-  virtual void execute_internal() = 0;
-
-  base_cont *cont_;
-};
-
-template<typename F>
-class task : public base_task {
- public:
-  template<typename FARG>
-  explicit task(FARG &&function, base_cont *cont)
-      : base_task{cont}, function_{std::forward<FARG>(function)} {}
-
-  void execute_internal() override {
-    PROFILE_TASK("execute_internal")
-    auto result = function_();
-    if (result.fast_path()) {
-      cont_->store_right_result(std::move(result));
-    }
-  }
-
- private:
-  F function_;
-};
+struct alignas(base::system_details::CACHE_LINE_SIZE) task {
+  void init(char *stack_memory, size_t stack_size, unsigned depth, unsigned thread_id) {
+    stack_memory_ = stack_memory;
+    stack_size_ = stack_size;
+    depth_ = depth;
+    thread_id_ = thread_id;
+  }
+
+  unsigned get_thread_id() const {
+    return thread_id_;
+  }
+  void set_thread_id(unsigned thread_id) {
+    thread_id_ = thread_id;
+  }
+
+  task *get_prev() const {
+    return prev_;
+  }
+  void set_prev(task *prev) {
+    prev_ = prev;
+  }
+
+  task *get_next() const {
+    return next_;
+  }
+  void set_next(task *next) {
+    next_ = next;
+  }
+
+  task *get_parent_task() const {
+    return parent_task_;
+  }
+  void set_parent_task(task *parent_task) {
+    parent_task_ = parent_task;
+  }
+
+  template<typename F>
+  context_switcher::continuation run_as_task(F &&lambda) {
+    return context_switcher::enter_context(stack_memory_, stack_size_, std::forward<F>(lambda));
+  }
+
+  // Stack/Continuation Management
+  char *stack_memory_;
+  size_t stack_size_;
+  // TODO: We do not need this in every single task...
+  context_switcher::continuation continuation_;
+
+  // Task Tree (we have a parent that we want to continue when we finish)
+  task *parent_task_;
+  unsigned depth_;
+  unsigned thread_id_;
+
+  // Memory Linked List
+  task *prev_;
+  task *next_;
+};

 }
...
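A minimal usage sketch of the reworked task type (not part of this commit; it assumes the pls and context_switcher headers are on the include path and that a default-constructed task is valid until init() fills in its members, as its use inside std::array in task_manager.h suggests): hand the task its stack via init() and run user code on that stack through run_as_task(), just like the new init_function_impl::run() in scheduler_impl.h does.

#include <cstdio>
#include <utility>

#include "pls/internal/scheduling/task.h"

int main() {
  static char stack_memory[4096];  // backing stack for the task (size chosen for the sketch)

  // Assumption: a default-constructed task is usable once init() has been called.
  pls::internal::scheduling::task my_task;
  my_task.init(stack_memory, sizeof(stack_memory), /*depth=*/0, /*thread_id=*/0);

  // Run user code as a stackful coroutine on the task's own stack; handing the caller's
  // continuation straight back means "finished, switch back to the calling context".
  my_task.run_as_task([](context_switcher::continuation cont) {
    std::printf("user code running inside the task's context\n");
    return std::move(cont);
  });

  return 0;
}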
lib/pls/include/pls/internal/scheduling/task_manager.h  —  View file @ 83c6e622
...
@@ -5,18 +5,11 @@
 #include <memory>
 #include <utility>
 #include <array>
-#include <mutex>
-#include <atomic>

 #include "pls/internal/scheduling/task.h"
-#include "pls/internal/scheduling/cont_manager.h"
-#include "pls/internal/scheduling/memory_block.h"
-#include "pls/internal/data_structures/bounded_trading_deque.h"
-#include "pls/internal/data_structures/stamped_integer.h"
-#include "pls/internal/data_structures/optional.h"
-#include "pls/internal/base/spin_lock.h"
+#include "pls/internal/data_structures/aligned_stack.h"

 namespace pls {
 namespace internal {
...
@@ -28,58 +21,59 @@ namespace scheduling {
  */
 class task_manager {
  public:
-  // Publishes a task on the stack, i.e. makes it visible for other threads to steal.
-  void publish_task(base_task *task) {
-    task_deque_.push_bot(task->get_cont()->get_memory_block());
-  }
-
-  // Try to pop a local task from this task managers stack.
-  data_structures::optional<memory_block *> try_pop_local() {
-    return task_deque_.pop_bot().traded_;
-  }
-
-  // Try to steal a task from a remote task_manager instance. The stolen task must be stored locally.
-  // Returns a pair containing the actual task and if the steal was successful.
-  base_task *steal_remote_task(cont_manager &stealing_cont_manager) {
-    auto peek = task_deque_.peek_top();
-    if (std::get<0>(peek)) {
-      memory_block *peeked_memory_block = (*std::get<0>(peek));
-      auto peeked_depth = peeked_memory_block->get_depth();
-
-      stealing_cont_manager.move_active_node(peeked_depth);
-      auto offered_chain = stealing_cont_manager.get_active_node();
-      stealing_cont_manager.move_active_node(1);
-
-      auto stolen_memory_block = task_deque_.pop_top(offered_chain, std::get<1>(peek));
-      if (stolen_memory_block) {
-        PLS_ASSERT(*stolen_memory_block == peeked_memory_block, "Steal must only work if it is equal!");
-        return (*stolen_memory_block)->get_cont()->get_task();
-      } else {
-        stealing_cont_manager.move_active_node(-(peeked_depth + 1));
-        return nullptr;
-      }
-    }
-
-    return nullptr;
-  }
-
-  explicit task_manager(data_structures::bounded_trading_deque<memory_block, memory_block> &task_deque)
-      : task_deque_{task_deque} {}
-
- private:
-  data_structures::bounded_trading_deque<memory_block, memory_block> &task_deque_;
+  explicit task_manager(task *tasks,
+                        data_structures::aligned_stack static_stack_space,
+                        size_t num_tasks,
+                        size_t stack_size) {
+    for (size_t i = 0; i < num_tasks - 1; i++) {
+      tasks[i].init(static_stack_space.push_bytes(stack_size), stack_size, i, 0);
+      if (i > 0) {
+        tasks[i].set_prev(&tasks[i - 1]);
+      }
+      if (i < num_tasks - 2) {
+        tasks[i].set_next(&tasks[i + 1]);
+      }
+    }
+
+    num_tasks_ = num_tasks;
+    this_thread_tasks_ = tasks;
+    active_task_ = &tasks[0];
+  }
+
+  task &get_this_thread_task(size_t depth) {
+    return this_thread_tasks_[depth];
+  }
+
+  void set_thread_id(unsigned id) {
+    for (size_t i = 0; i < num_tasks_; i++) {
+      this_thread_tasks_[i].set_thread_id(id);
+    }
+  }
+
+  task &get_active_task() {
+    return *active_task_;
+  }
+
+ private:
+  size_t num_tasks_;
+  task *this_thread_tasks_;
+  task *active_task_;
 };

-template<size_t NUM_TASKS>
+template<size_t NUM_TASKS, size_t STACK_SIZE>
 class static_task_manager {
 public:
-  static_task_manager() : task_deque_{}, task_manager_{task_deque_.get_deque()} {};
+  static_task_manager()
+      : tasks_{},
+        static_stack_storage_{},
+        task_manager_{tasks_.data(), static_stack_storage_.get_stack(), NUM_TASKS, STACK_SIZE} {};

  task_manager &get_task_manager() { return task_manager_; }

 private:
-  data_structures::static_bounded_trading_deque<memory_block, memory_block, NUM_TASKS> task_deque_;
+  std::array<task, NUM_TASKS> tasks_;
+  data_structures::static_aligned_stack<NUM_TASKS * STACK_SIZE> static_stack_storage_;
  task_manager task_manager_;
 };
...
lib/pls/include/pls/internal/scheduling/thread_state.h  —  View file @ 83c6e622
...
@@ -3,54 +3,34 @@
 #define PLS_THREAD_STATE_H

 #include <random>
 #include <memory>
 #include <array>
 #include <chrono>

 #include "pls/internal/scheduling/task_manager.h"

 namespace pls {
 namespace internal {
 namespace scheduling {

 // forward declaration
 class task_manager;
-class cont_manager;
 class scheduler;
-class base_task;
-class base_cont;
+struct task;

 struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
  private:
  scheduler *scheduler_;
-  size_t id_;
-
-  // Keep track of the last spawn state (needed to chain tasks/conts correctly)
-  bool right_spawn_;
-  base_cont *parent_cont_;
-  // TODO: Set this when spawning!
-  // See if we should move this to the cont manager...seems like a better fit!
+  unsigned id_;

  task_manager &task_manager_;
-  cont_manager &cont_manager_;

-  alignas(base::system_details::CACHE_LINE_SIZE) base_task *current_task_;
+  alignas(base::system_details::CACHE_LINE_SIZE) task *current_task_;
  alignas(base::system_details::CACHE_LINE_SIZE) std::minstd_rand random_;

 public:
-  thread_state(task_manager &task_manager, cont_manager &cont_manager) :
+  explicit thread_state(task_manager &task_manager) :
      scheduler_{nullptr},
      id_{0},
-      right_spawn_{false},
-      parent_cont_{nullptr},
      task_manager_{task_manager},
-      cont_manager_{cont_manager},
      current_task_{nullptr},
      random_{static_cast<unsigned long>(std::chrono::steady_clock::now().time_since_epoch().count())} {};

-  void reset() {
-    right_spawn_ = false;
-    parent_cont_ = nullptr;
-  }
-
  /**
   * Convenience helper to get the thread_state instance associated with this thread.
   * Must only be called on threads that are associated with a thread_state,
...
@@ -60,10 +40,19 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
   */
  static thread_state &get() { return *base::this_thread::state<thread_state>(); }

-  size_t get_id() { return id_; }
+  unsigned get_id() { return id_; }
+  void set_id(unsigned id) {
+    id_ = id;
+    task_manager_.set_thread_id(id);
+  }

  task_manager &get_task_manager() { return task_manager_; }
-  cont_manager &get_cont_manager() { return cont_manager_; }
+
+  scheduler &get_scheduler() { return *scheduler_; }
+  void set_scheduler(scheduler *scheduler) { scheduler_ = scheduler; }
+
+  long get_rand() { return random_(); }

  // Do not allow move/copy operations.
  // State is a pure memory container with references/pointers into it from all over the code.
...
@@ -73,6 +62,7 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state {
  thread_state(const thread_state &) = delete;
  thread_state &operator=(const thread_state &) = delete;
 };

 }
...
lib/pls/include/pls/internal/scheduling/thread_state_static.h  —  View file @ 83c6e622
...
@@ -3,8 +3,6 @@
 #define PLS_INTERNAL_SCHEDULING_THREAD_STATE_STATIC_H_

 #include "pls/internal/scheduling/task_manager.h"
-#include "pls/internal/scheduling/cont_manager.h"
-
 #include "pls/internal/base/system_details.h"

 #include "thread_state.h"
...
@@ -13,18 +11,16 @@ namespace pls {
 namespace internal {
 namespace scheduling {

-template<size_t NUM_TASKS, size_t NUM_CONTS, size_t MAX_CONT_SIZE>
+template<size_t NUM_TASKS, size_t STACK_SIZE>
 struct alignas(base::system_details::CACHE_LINE_SIZE) thread_state_static {
  public:
  thread_state_static()
      : static_task_manager_{},
-        static_cont_manager_{},
-        thread_state_{static_task_manager_.get_task_manager(), static_cont_manager_.get_cont_manager()} {}
+        thread_state_{static_task_manager_.get_task_manager()} {}

  thread_state &get_thread_state() { return thread_state_; }

 private:
-  alignas(base::system_details::CACHE_LINE_SIZE) static_task_manager<NUM_TASKS> static_task_manager_;
-  alignas(base::system_details::CACHE_LINE_SIZE) static_cont_manager<NUM_CONTS, MAX_CONT_SIZE> static_cont_manager_;
+  alignas(base::system_details::CACHE_LINE_SIZE) static_task_manager<NUM_TASKS, STACK_SIZE> static_task_manager_;
  alignas(base::system_details::CACHE_LINE_SIZE) thread_state thread_state_;
 };
...
lib/pls/src/internal/scheduling/scheduler.cpp  —  View file @ 83c6e622
...
@@ -21,8 +21,8 @@ scheduler::scheduler(scheduler_memory &memory, const unsigned int num_threads, b
 for (unsigned int i = 0; i < num_threads_; i++) {
    // Placement new is required, as the memory of `memory_` is not required to be initialized.
-    memory.thread_state_for(i).scheduler_ = this;
-    memory.thread_state_for(i).id_ = i;
+    memory.thread_state_for(i).set_scheduler(this);
+    memory.thread_state_for(i).set_id(i);

    if (reuse_thread && i == 0) {
      continue; // Skip over first/main thread when re-using the users thread, as this one will replace the first one.
...
@@ -55,8 +55,7 @@ void scheduler::work_thread_main_loop() {
 void scheduler::work_thread_work_section() {
  auto &my_state = thread_state::get();
-  my_state.reset();
-  auto &my_cont_manager = my_state.get_cont_manager();
+  auto &my_task_manager = my_state.get_task_manager();

  auto const num_threads = my_state.get_scheduler().num_threads();
  auto const my_id = my_state.get_id();
...
@@ -67,41 +66,23 @@ void scheduler::work_thread_work_section() {
  }

  do {
-    // Work off pending continuations we need to execute locally
-    while (my_cont_manager.falling_through()) {
-      my_cont_manager.execute_fall_through_code();
-    }
-
-    // Steal Routine (will be continuously executed when there are no more fall through's).
-    // TODO: move into separate function
-    const size_t offset = my_state.random_() % num_threads;
-    const size_t max_tries = num_threads;
-    for (size_t i = 0; i < max_tries; i++) {
-      size_t target = (offset + i) % num_threads;
-      auto &target_state = my_state.get_scheduler().thread_state_for(target);
-
-      PLS_ASSERT(my_cont_manager.is_clean(), "Only steal with clean chain!");
-
-      PROFILE_STEALING("steal")
-      auto *stolen_task = target_state.get_task_manager().steal_remote_task(my_cont_manager);
-      PROFILE_END_BLOCK;
-      if (stolen_task != nullptr) {
-        my_state.parent_cont_ = stolen_task->get_cont();
-        my_state.right_spawn_ = true;
-        stolen_task->execute();
-        if (my_cont_manager.falling_through()) {
-          break;
-        } else {
-          my_cont_manager.fall_through_and_notify_cont(stolen_task->get_cont(), true);
-          break;
-        }
-      }
-    }
+//    const size_t offset = my_state.get_rand() % num_threads;
+//    const size_t max_tries = num_threads;
+//    for (size_t i = 0; i < max_tries; i++) {
+//      size_t target = (offset + i) % num_threads;
+//      auto &target_state = my_state.get_scheduler().thread_state_for(target);
+//
+//      auto *stolen_task = target_state.get_task_manager().steal_remote_task(my_cont_manager);
+//      if (stolen_task != nullptr) {
+//        stolen_task->execute();
+//      }
+//    }
+//    if (!my_cont_manager.falling_through()) {
+//      base::this_thread::sleep(5);
+//    }
  } while (!work_section_done_);

-  PLS_ASSERT(my_cont_manager.is_clean(), "Only finish work section with clean chain!");
 }

 void scheduler::terminate() {
...
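The steal routine that this draft removes (and re-adds only as a commented-out placeholder) starts its victim search at a random offset and wraps once over all workers, so repeated steal attempts do not always hit worker 0 first. A standalone sketch of that victim-selection pattern:

#include <cstddef>
#include <cstdio>
#include <random>

int main() {
  std::minstd_rand random{std::random_device{}()};
  const size_t num_threads = 4;

  // Start at a random victim and wrap around once over all workers.
  const size_t offset = random() % num_threads;
  const size_t max_tries = num_threads;
  for (size_t i = 0; i < max_tries; i++) {
    size_t target = (offset + i) % num_threads;
    std::printf("try stealing from worker %zu\n", target);
  }
  return 0;
}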