Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
las3_pub
/
predictable_parallel_patterns
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
89b6e3cb
authored
Feb 09, 2020
by
FritzFlorian
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add cscontext ARMv7 assembly and fast path optimization.
parent
3c60e8d7
Pipeline
#1404
failed with stages
in 37 seconds
Changes
12
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
167 additions
and
64 deletions
+167
-64
app/benchmark_fft/main.cpp
+6
-14
app/benchmark_fib/main.cpp
+4
-11
app/benchmark_matrix/main.cpp
+8
-16
cmake/SetupOptimizationLevel.cmake
+1
-1
lib/context_switcher/asm/cscontext/SelectAssemblyFiles.cmake
+5
-0
lib/context_switcher/asm/cscontext/enter_context_arm32_sysv_elf.s
+63
-0
lib/context_switcher/asm/cscontext/switch_context_arm32_sysv_elf.s
+48
-0
lib/pls/include/pls/algorithms/for_each_impl.h
+1
-4
lib/pls/include/pls/internal/scheduling/task.h
+5
-1
lib/pls/include/pls/internal/scheduling/task_manager_impl.h
+1
-0
lib/pls/src/internal/scheduling/scheduler.cpp
+1
-0
lib/pls/src/internal/scheduling/task_manager.cpp
+24
-17
No files found.
app/benchmark_fft/main.cpp
View file @
89b6e3cb
...
...
@@ -37,7 +37,7 @@ void conquer(fft::complex_vector::iterator data, int n) {
constexpr
int
MAX_NUM_THREADS
=
8
;
constexpr
int
MAX_NUM_TASKS
=
32
;
constexpr
int
MAX_STACK_SIZE
=
1024
*
64
;
constexpr
int
MAX_STACK_SIZE
=
1024
*
32
;
static_scheduler_memory
<
MAX_NUM_THREADS
,
MAX_NUM_TASKS
,
...
...
@@ -56,19 +56,11 @@ int main(int argc, char **argv) {
scheduler
scheduler
{
global_scheduler_memory
,
(
unsigned
)
num_threads
};
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
fft
::
NUM_WARMUP_ITERATIONS
;
i
++
)
{
conquer
(
data
.
begin
(),
fft
::
SIZE
);
}
});
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
fft
::
NUM_ITERATIONS
;
i
++
)
{
runner
.
start_iteration
();
conquer
(
data
.
begin
(),
fft
::
SIZE
);
runner
.
end_iteration
();
}
});
runner
.
run_iterations
(
fft
::
NUM_ITERATIONS
,
[
&
]()
{
scheduler
.
perform_work
([
&
]()
{
conquer
(
data
.
begin
(),
fft
::
SIZE
);;
});
},
fft
::
NUM_WARMUP_ITERATIONS
);
runner
.
commit_results
(
true
);
return
0
;
...
...
app/benchmark_fib/main.cpp
View file @
89b6e3cb
...
...
@@ -51,19 +51,12 @@ int main(int argc, char **argv) {
scheduler
scheduler
{
global_scheduler_memory
,
(
unsigned
)
num_threads
};
volatile
int
res
;
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
fib
::
NUM_WARMUP_ITERATIONS
;
i
++
)
{
res
=
pls_fib
(
fib
::
INPUT_N
);
}
});
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
fib
::
NUM_ITERATIONS
;
i
++
)
{
runner
.
start_iteration
();
runner
.
run_iterations
(
fib
::
NUM_ITERATIONS
,
[
&
]()
{
scheduler
.
perform_work
([
&
]()
{
res
=
pls_fib
(
fib
::
INPUT_N
);
runner
.
end_iteration
();
}
});
});
},
fib
::
NUM_WARMUP_ITERATIONS
);
runner
.
commit_results
(
true
);
return
0
;
...
...
app/benchmark_matrix/main.cpp
View file @
89b6e3cb
...
...
@@ -14,8 +14,8 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
public
:
pls_matrix
()
:
matrix
::
matrix
<
T
,
SIZE
>
()
{}
void
pls_multiply
(
const
matrix
::
matrix
<
T
,
SIZE
>
&
a
,
const
matrix
::
matrix
<
T
,
SIZE
>
&
b
)
{
pls
::
algorithm
::
for_each_range
(
0
,
SIZE
,
[
this
,
&
a
,
&
b
](
int
i
)
{
void
multiply
(
const
matrix
::
matrix
<
T
,
SIZE
>
&
a
,
const
matrix
::
matrix
<
T
,
SIZE
>
&
b
)
override
{
pls
::
algorithm
::
for_each_range
(
0
,
SIZE
,
[
&
](
int
i
)
{
this
->
multiply_column
(
i
,
a
,
b
);
});
}
...
...
@@ -23,7 +23,7 @@ class pls_matrix : public matrix::matrix<T, SIZE> {
constexpr
int
MAX_NUM_THREADS
=
8
;
constexpr
int
MAX_NUM_TASKS
=
32
;
constexpr
int
MAX_STACK_SIZE
=
1024
*
4
;
constexpr
int
MAX_STACK_SIZE
=
1024
*
1
;
static_scheduler_memory
<
MAX_NUM_THREADS
,
MAX_NUM_TASKS
,
...
...
@@ -44,18 +44,10 @@ int main(int argc, char **argv) {
scheduler
scheduler
{
global_scheduler_memory
,
(
unsigned
)
num_threads
};
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
matrix
::
WARMUP_ITERATIONS
;
i
++
)
{
result
.
pls_multiply
(
a
,
b
);
}
});
scheduler
.
perform_work
([
&
]()
{
for
(
int
i
=
0
;
i
<
matrix
::
NUM_ITERATIONS
;
i
++
)
{
runner
.
start_iteration
();
result
.
pls_multiply
(
a
,
b
);
runner
.
end_iteration
();
}
});
runner
.
run_iterations
(
matrix
::
NUM_ITERATIONS
,
[
&
]()
{
scheduler
.
perform_work
([
&
]()
{
result
.
multiply
(
a
,
b
);
});
},
matrix
::
WARMUP_ITERATIONS
);
runner
.
commit_results
(
true
);
}
cmake/SetupOptimizationLevel.cmake
View file @
89b6e3cb
...
...
@@ -18,7 +18,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
# but inlining functions and SIMD/Vectorization is
# only enabled by -O3, thus it's way faster in some
# array calculations.
set
(
CMAKE_CXX_FLAGS_RELEASE
"
${
CMAKE_CXX_FLAGS_RELEASE
}
-O
2
-march=native"
)
set
(
CMAKE_CXX_FLAGS_RELEASE
"
${
CMAKE_CXX_FLAGS_RELEASE
}
-O
3
-march=native"
)
set
(
CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE
)
else
()
set
(
CMAKE_CXX_FLAGS_DEBUG
"-g -O0"
)
...
...
lib/context_switcher/asm/cscontext/SelectAssemblyFiles.cmake
View file @
89b6e3cb
...
...
@@ -9,6 +9,11 @@ if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" AND CMAKE_SYSTEM_NAME STREQUAL "Lin
SET
(
CS_CSCONTEXT_ASSEMBLY
asm/cscontext/enter_context_x86_64_sysv_elf.s
asm/cscontext/switch_context_x86_64_sysv_elf.s
)
elseif
(
CMAKE_SYSTEM_PROCESSOR STREQUAL
"armv7l"
AND CMAKE_SYSTEM_NAME STREQUAL
"Linux"
)
# Typical Linux running on ARMv7
SET
(
CS_CSCONTEXT_ASSEMBLY
asm/cscontext/enter_context_arm32_sysv_elf.s
asm/cscontext/switch_context_arm32_sysv_elf.s
)
else
()
SET
(
CS_CSCONTEXT_FOUND FALSE
)
endif
()
...
...
lib/context_switcher/asm/cscontext/enter_context_arm32_sysv_elf.s
0 → 100644
View file @
89b6e3cb
.arm
.text
.global __cs_enter_context
.type __cs_enter_context, %function
__cs_enter_context:
/* Saves the caller's register state on the current stack, switches to a new
* stack and invokes a callback there.
*
* Parameter List (in order)
* r0 = new stack pointer (rounded down to a 16 byte boundary before use)
* r1 = pointer handed through unchanged to the callback as its second argument
* r2 = callback function pointer
* r3 = new stack limit (not read by this implementation)
*
* Callback
* r0 = old stack pointer, i.e. the continuation of the caller of this function
* r1 = the pointer passed in r1 above
* The value the callback returns in r0 is used as the stack pointer from
* which the saved state is restored afterwards.
*
* Return
* r0 = continuation that returned control back to the caller (null if fallthrough)
*
* Variables
* r4 = temporary for the old stack pointer
*
* NOTE(review): the #if guards below require the C preprocessor. This file is
* named *.s; GCC only preprocesses *.S by default - confirm that the build
* runs the preprocessor on this file. */
/* ========== Save State ========== */
/* Store the return address (lr) so the final 'pop {pc}' can return to the caller. */
push {lr}
/* Store r4-r12 and lr. */
push {r4-r12,lr}
/* Store the floating point extension registers d8-d15 (8 * 8 = 64 bytes). */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
sub sp, sp, #64
vstmia sp, {d8-d15}
#endif
/* ========== Save State ========== */
/* Perform change to new stack */
/* Keep the old stack pointer in r4; it becomes the first argument of the callback. */
mov r4, sp
/* Make sure that the start of the new stack is aligned to 16 bytes. */
and r0, r0, #-16
/* Switch to new stack pointer. */
mov sp, r0
/* Perform the actual function call; it now runs on the new stack. */
/* r0 = first parameter to callback (continuation, i.e. the old stack pointer) */
/* r1 = second parameter to callback (arbitrary pointer, untouched since entry) */
mov r0, r4
blx r2
/* Restore the state of the returned continuation. */
/* To do so we first reset the stack pointer (which the callback returned in r0). */
/* After that we execute our standard restore procedure to pop the state from the stack. */
mov sp, r0
/* ========== Restore State ========== */
/* Restore the floating point extension registers d8-d15. */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
vldmia sp, {d8-d15}
add sp, sp, #64
#endif
/* Restore r4-r12 and lr. */
pop {r4-r12,lr}
/* ========== Restore State ========== */
/* Just return back from the call. */
/* This is the end of a fiber, so we have no continuation: return null (r0 = 0). */
eor r0, r0, r0
pop {pc}
lib/context_switcher/asm/cscontext/switch_context_arm32_sysv_elf.s
0 → 100644
View file @
89b6e3cb
.arm
.text
.global __cs_switch_context
.type __cs_switch_context, %function
__cs_switch_context:
/* Saves the caller's register state on the current stack, then restores the
* register state stored on the target stack and returns into that context.
*
* Parameter List (in order)
* r0 = continuation to switch to: the stack pointer of the target context,
*      pointing at the state that was saved there earlier
*
* Return
* r0 = continuation that returned control back to the caller (null if fallthrough)
*      As seen by the context being resumed, this is the stack pointer of the
*      context that was just suspended.
*
* Variables
* r1 = temporary for the old stack pointer
*
* NOTE(review): the #if guards below require the C preprocessor. This file is
* named *.s; GCC only preprocesses *.S by default - confirm that the build
* runs the preprocessor on this file. */
/* ========== Save State ========== */
/* Store the return address (lr) so the final 'pop {pc}' can return to the caller. */
push {lr}
/* Store r4-r12 and lr. */
push {r4-r12,lr}
/* Store the floating point extension registers d8-d15 (8 * 8 = 64 bytes). */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
sub sp, sp, #64
vstmia sp, {d8-d15}
#endif
/* ========== Save State ========== */
/* Perform change to new stack */
/* Keep the old stack pointer; it is handed to the resumed context as the result. */
mov r1, sp
/* Switch to new stack pointer. */
mov sp, r0
/* ========== Restore State ========== */
/* Restore the floating point extension registers d8-d15. */
#if (defined(__VFP_FP__) && !defined(__SOFTFP__))
vldmia sp, {d8-d15}
add sp, sp, #64
#endif
/* Restore r4-r12 and lr. */
pop {r4-r12,lr}
/* ========== Restore State ========== */
/* Return into the resumed context. */
/* r0 = old stack pointer, i.e. the continuation of the context we just left. */
mov r0, r1
pop {pc}
lib/pls/include/pls/algorithms/for_each_impl.h
View file @
89b6e3cb
...
...
@@ -77,10 +77,7 @@ void for_each(RandomIt
execution_strategy
)
{
long
num_elements
=
std
::
distance
(
first
,
last
);
return
internal
::
for_each
(
first
,
last
,
function
,
execution_strategy
.
calculate_min_elements
(
num_elements
)
);
internal
::
for_each
(
first
,
last
,
function
,
execution_strategy
.
calculate_min_elements
(
num_elements
));
}
template
<
typename
RandomIt
,
typename
Function
>
...
...
lib/pls/include/pls/internal/scheduling/task.h
View file @
89b6e3cb
...
...
@@ -34,6 +34,8 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
depth_
=
depth
;
thread_id_
=
thread_id
;
is_synchronized_
=
false
;
}
template
<
typename
F
>
...
...
@@ -44,9 +46,11 @@ struct alignas(base::system_details::CACHE_LINE_SIZE) task {
// TODO: Proper access control and split it up into responsibilities
// Stack/Continuation Management
char
*
stack_memory_
;
size_t
stack_size_
;
size_t
stack_size_
;
// TODO: maybe remove it, not needed in here
context_switcher
::
continuation
continuation_
;
bool
is_synchronized_
;
// TODO: Clean up responsibilities
// Work-Stealing
std
::
atomic
<
traded_cas_field
>
external_trading_deque_cas_
{};
std
::
atomic
<
task
*>
resource_stack_next_
{};
...
...
lib/pls/include/pls/internal/scheduling/task_manager_impl.h
View file @
89b6e3cb
...
...
@@ -28,6 +28,7 @@ void task_manager::spawn_child(F &&lambda) {
last_task
->
continuation_
=
std
::
move
(
cont
);
// we are now executing the new task, allow others to steal the last task continuation.
spawned_task
->
is_synchronized_
=
true
;
spawning_task_manager
->
active_task_
=
spawned_task
;
spawning_task_manager
->
deque_
.
push_bot
(
last_task
);
...
...
lib/pls/src/internal/scheduling/scheduler.cpp
View file @
89b6e3cb
...
...
@@ -108,6 +108,7 @@ void scheduler::work_thread_work_section() {
// Execute the stolen task by jumping to its continuation.
PLS_ASSERT
(
stolen_task
->
continuation_
.
valid
(),
"A task that we can steal must have a valid continuation for us to start working."
);
stolen_task
->
is_synchronized_
=
false
;
context_switcher
::
switch_context
(
std
::
move
(
stolen_task
->
continuation_
));
// We will continue execution at this line once the stolen work has finished.
}
...
...
lib/pls/src/internal/scheduling/task_manager.cpp
View file @
89b6e3cb
...
...
@@ -128,23 +128,29 @@ void task_manager::sync() {
auto
*
last_task
=
spawning_task_manager
->
active_task_
;
auto
*
spawned_task
=
spawning_task_manager
->
active_task_
->
next_
;
auto
continuation
=
spawned_task
->
run_as_task
([
=
](
context_switcher
::
continuation
cont
)
{
last_task
->
continuation_
=
std
::
move
(
cont
);
spawning_task_manager
->
active_task_
=
spawned_task
;
context_switcher
::
continuation
result_cont
;
if
(
spawning_task_manager
->
try_clean_return
(
result_cont
))
{
// We return back to the main scheduling loop
return
result_cont
;
}
else
{
// We finish up the last task
return
result_cont
;
}
});
PLS_ASSERT
(
!
continuation
.
valid
(),
"We only return to a sync point, never jump to it directly."
"This must therefore never return an unfinished fiber/continuation."
);
if
(
last_task
->
is_synchronized_
)
{
return
;
// We are already the sole owner of last_task
}
else
{
auto
continuation
=
spawned_task
->
run_as_task
([
=
](
context_switcher
::
continuation
cont
)
{
last_task
->
continuation_
=
std
::
move
(
cont
);
spawning_task_manager
->
active_task_
=
spawned_task
;
context_switcher
::
continuation
result_cont
;
if
(
spawning_task_manager
->
try_clean_return
(
result_cont
))
{
// We return back to the main scheduling loop
return
result_cont
;
}
else
{
// We finish up the last task
return
result_cont
;
}
});
PLS_ASSERT
(
!
continuation
.
valid
(),
"We only return to a sync point, never jump to it directly."
"This must therefore never return an unfinished fiber/continuation."
);
return
;
// We cleanly synced to the last one finishing work on last_task
}
}
bool
task_manager
::
try_clean_return
(
context_switcher
::
continuation
&
result_cont
)
{
...
...
@@ -195,6 +201,7 @@ bool task_manager::try_clean_return(context_switcher::continuation &result_cont)
// We are the last one working on this task. Thus the sync must be finished, continue working.
active_task_
=
last_task
;
last_task
->
is_synchronized_
=
true
;
result_cont
=
std
::
move
(
last_task
->
continuation_
);
PLS_ASSERT
(
result_cont
.
valid
(),
"Must return a valid continuation."
);
return
false
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment