Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
las3_pub
/
predictable_parallel_patterns
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
4bb8c2e5
authored
5 years ago
by
FritzFlorian
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Working version of parallel scan
parent
a5bb074c
Pipeline
#1261
failed with stages
in 50 seconds
Changes
16
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
166 additions
and
26 deletions
+166
-26
CMakeLists.txt
+1
-0
app/benchmark_matrix/main.cpp
+1
-4
app/benchmark_prefix/CMakeLists.txt
+5
-0
app/benchmark_prefix/main.cpp
+25
-0
lib/pls/CMakeLists.txt
+2
-2
lib/pls/include/pls/algorithms/parallel_for.h
+9
-0
lib/pls/include/pls/algorithms/parallel_for_impl.h
+4
-2
lib/pls/include/pls/algorithms/parallel_scan.h
+15
-0
lib/pls/include/pls/algorithms/parallel_scan_impl.h
+81
-0
lib/pls/include/pls/internal/data_structures/aligned_stack.h
+2
-1
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
+3
-11
lib/pls/include/pls/internal/data_structures/work_stealing_deque_impl.h
+1
-1
lib/pls/include/pls/internal/helpers/mini_benchmark.h
+1
-1
lib/pls/include/pls/internal/scheduling/task.h
+2
-3
lib/pls/include/pls/pls.h
+2
-1
lib/pls/src/internal/data_structures/aligned_stack.cpp
+12
-0
No files found.
CMakeLists.txt
View file @
4bb8c2e5
...
...
@@ -36,6 +36,7 @@ add_subdirectory(app/invoke_parallel)
add_subdirectory
(
app/benchmark_fft
)
add_subdirectory
(
app/benchmark_unbalanced
)
add_subdirectory
(
app/benchmark_matrix
)
add_subdirectory
(
app/benchmark_prefix
)
# Add optional tests
option
(
PACKAGE_TESTS
"Build the tests"
ON
)
...
...
This diff is collapsed.
Click to expand it.
app/benchmark_matrix/main.cpp
View file @
4bb8c2e5
...
...
@@ -2,8 +2,6 @@
#include <pls/internal/helpers/profiler.h>
#include <pls/internal/helpers/mini_benchmark.h>
#include <boost/range/irange.hpp>
const
int
MATRIX_SIZE
=
128
;
template
<
typename
T
,
int
SIZE
>
...
...
@@ -16,8 +14,7 @@ class matrix {
}
void
multiply
(
const
matrix
<
T
,
SIZE
>
&
a
,
const
matrix
<
T
,
SIZE
>
&
b
)
{
auto
range
=
boost
::
irange
(
0
,
SIZE
);
pls
::
algorithm
::
parallel_for
(
range
.
begin
(),
range
.
end
(),
[
&
](
int
i
)
{
pls
::
algorithm
::
parallel_for
(
0
,
SIZE
,
[
&
](
int
i
)
{
this
->
multiply_column
(
i
,
a
,
b
);
});
}
...
...
This diff is collapsed.
Click to expand it.
app/benchmark_prefix/CMakeLists.txt
0 → 100644
View file @
4bb8c2e5
add_executable
(
benchmark_prefix main.cpp
)
target_link_libraries
(
benchmark_prefix pls
)
if
(
EASY_PROFILER
)
target_link_libraries
(
benchmark_prefix easy_profiler
)
endif
()
This diff is collapsed.
Click to expand it.
app/benchmark_prefix/main.cpp
0 → 100644
View file @
4bb8c2e5
#include <pls/pls.h>
#include <pls/internal/helpers/profiler.h>
#include <pls/internal/helpers/mini_benchmark.h>
#include <iostream>
#include <vector>
#include <functional>
static
constexpr
int
INPUT_SIZE
=
1000000
;
int
main
()
{
PROFILE_ENABLE
std
::
vector
<
double
>
vec
(
INPUT_SIZE
,
1
);
std
::
vector
<
double
>
out
(
INPUT_SIZE
);
for
(
int
i
=
0
;
i
<
INPUT_SIZE
;
i
++
)
{
vec
[
i
]
=
i
;
}
pls
::
internal
::
helpers
::
run_mini_benchmark
([
&
]
{
pls
::
parallel_scan
(
vec
.
begin
(),
vec
.
end
(),
out
.
begin
(),
std
::
plus
<
double
>
(),
0.0
);
},
8
,
1000
);
PROFILE_SAVE
(
"test_profile.prof"
)
}
This diff is collapsed.
Click to expand it.
lib/pls/CMakeLists.txt
View file @
4bb8c2e5
...
...
@@ -6,6 +6,8 @@ add_library(pls STATIC
include/pls/algorithms/invoke_parallel_impl.h
include/pls/algorithms/parallel_for.h
include/pls/algorithms/parallel_for_impl.h
include/pls/algorithms/parallel_scan.h
include/pls/algorithms/parallel_scan_impl.h
include/pls/internal/base/spin_lock.h
include/pls/internal/base/tas_spin_lock.h src/internal/base/tas_spin_lock.cpp
...
...
@@ -35,8 +37,6 @@ add_library(pls STATIC
include/pls/internal/scheduling/task.h src/internal/scheduling/task.cpp
include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
include/pls/internal/scheduling/lambda_task.h include/pls/internal/data_structures/deque.h
# include/pls/algorithms/parallel_scan.h include/pls/algorithms/parallel_scan_impl.h)
)
# Add everything in `./include` to be in the include path of this project
target_include_directories
(
pls
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/algorithms/parallel_for.h
View file @
4bb8c2e5
...
...
@@ -2,12 +2,21 @@
#ifndef PLS_PARALLEL_FOR_H
#define PLS_PARALLEL_FOR_H
// TODO: Replace with own integer iterator to remove dependency
#include <boost/range/irange.hpp>
namespace
pls
{
namespace
algorithm
{
template
<
typename
RandomIt
,
typename
Function
>
void
parallel_for
(
RandomIt
first
,
RandomIt
last
,
const
Function
&
function
);
template
<
typename
Function
>
void
parallel_for
(
size_t
first
,
size_t
last
,
const
Function
&
function
)
{
auto
range
=
boost
::
irange
(
first
,
last
);
parallel_for
(
range
.
begin
(),
range
.
end
(),
function
);
}
}
}
#include "parallel_for_impl.h"
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/algorithms/parallel_for_impl.h
View file @
4bb8c2e5
...
...
@@ -25,11 +25,13 @@ void parallel_for(RandomIt first, RandomIt last, const Function &function) {
// Cut in half recursively
long
middle_index
=
num_elements
/
2
;
auto
second_half_body
=
[
=
]
{
parallel_for
(
first
+
middle_index
,
last
,
function
);
};
auto
second_half_body
=
[
first
,
middle_index
,
last
,
&
function
]
{
parallel_for
(
first
+
middle_index
,
last
,
function
);
};
using
second_half_t
=
lambda_task_by_reference
<
decltype
(
second_half_body
)
>
;
scheduler
::
spawn_child
<
second_half_t
>
(
std
::
move
(
second_half_body
));
auto
first_half_body
=
[
=
]
{
parallel_for
(
first
,
first
+
middle_index
,
function
);
};
auto
first_half_body
=
[
first
,
middle_index
,
last
,
&
function
]
{
parallel_for
(
first
,
first
+
middle_index
,
function
);
};
using
first_half_t
=
lambda_task_by_reference
<
decltype
(
first_half_body
)
>
;
scheduler
::
spawn_child_and_wait
<
first_half_t
>
(
std
::
move
(
first_half_body
));
}
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/algorithms/parallel_scan.h
0 → 100644
View file @
4bb8c2e5
#ifndef PLS_PARALLEL_SCAN_H_
#define PLS_PARALLEL_SCAN_H_
namespace
pls
{
namespace
algorithm
{
template
<
typename
InIter
,
typename
OutIter
,
typename
BinaryOp
,
typename
Type
>
void
parallel_scan
(
InIter
input_start
,
const
InIter
input_end
,
OutIter
output
,
BinaryOp
op
,
Type
neutral_element
);
}
}
#include "parallel_scan_impl.h"
#endif //PLS_PARALLEL_SCAN_H_
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/algorithms/parallel_scan_impl.h
0 → 100644
View file @
4bb8c2e5
#ifndef PLS_PARALLEL_SCAN_IMPL_H_
#define PLS_PARALLEL_SCAN_IMPL_H_
#include <memory>
#include <functional>
#include "pls/pls.h"
#include "pls/internal/scheduling/thread_state.h"
namespace
pls
{
namespace
algorithm
{
namespace
internal
{
template
<
typename
InIter
,
typename
OutIter
,
typename
BinaryOp
,
typename
Type
>
void
serial_scan
(
InIter
input_start
,
const
InIter
input_end
,
OutIter
output
,
BinaryOp
op
,
Type
neutral_element
)
{
auto
current_input
=
input_start
;
auto
current_output
=
output
;
auto
last_value
=
neutral_element
;
while
(
current_input
!=
input_end
)
{
last_value
=
op
(
last_value
,
*
current_input
);
*
current_output
=
last_value
;
current_input
++
;
current_output
++
;
}
}
}
template
<
typename
InIter
,
typename
OutIter
,
typename
BinaryOp
,
typename
Type
>
void
parallel_scan
(
InIter
input_start
,
const
InIter
input_end
,
OutIter
output
,
BinaryOp
op
,
Type
neutral_element
)
{
constexpr
auto
chunks_per_thread
=
4
;
using
namespace
pls
::
internal
::
scheduling
;
auto
size
=
std
::
distance
(
input_start
,
input_end
);
auto
my_state
=
thread_state
::
get
();
auto
num_threads
=
thread_state
::
get
()
->
scheduler_
->
num_threads
();
auto
chunks
=
num_threads
*
chunks_per_thread
;
auto
items_per_chunk
=
std
::
max
(
1l
,
size
/
chunks
);
// First Pass -> each chunk isolated
// TODO: Put this onto the stack, not into heap allocated memory
Type
*
tmp
=
new
Type
[
chunks
];
//reinterpret_cast<Type *>(my_state->task_stack_->push_bytes(sizeof(Type) * (chunks)));
parallel_for
(
0
,
chunks
,
[
tmp
,
input_start
,
input_end
,
output
,
items_per_chunk
,
&
op
,
&
neutral_element
](
int
i
)
{
auto
local_start
=
input_start
+
items_per_chunk
*
i
;
auto
local_end
=
local_start
+
items_per_chunk
;
if
(
local_end
>
input_end
)
{
local_end
=
input_end
;
}
// TODO: This MUST be dynamic to make sense, as it has a far bigger influence than any other cutoff
//
internal
::
serial_scan
(
local_start
,
local_end
,
output
+
items_per_chunk
*
i
,
op
,
neutral_element
);
tmp
[
i
]
=
*
(
output
+
std
::
distance
(
local_start
,
local_end
)
-
1
);
});
internal
::
serial_scan
(
tmp
,
tmp
+
chunks
,
tmp
,
std
::
plus
<
int
>
(),
0
);
// Sum up pivot elements...
auto
output_start
=
output
;
auto
output_end
=
output
+
size
;
parallel_for
(
1
,
chunks
,
[
tmp
,
output_start
,
output_end
,
items_per_chunk
,
&
op
,
&
neutral_element
](
int
i
)
{
auto
local_start
=
output_start
+
items_per_chunk
*
i
;
auto
local_end
=
local_start
+
items_per_chunk
;
if
(
local_end
>
output_end
)
{
local_end
=
output_end
;
}
for
(;
local_start
!=
local_end
;
local_start
++
)
{
*
local_start
=
op
(
*
local_start
,
tmp
[
i
-
1
]);
}
});
// scheduler::wait_for_all();
delete
[]
tmp
;
}
}
}
#endif //PLS_PARALLEL_SCAN_IMPL_H_
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/data_structures/aligned_stack.h
View file @
4bb8c2e5
...
...
@@ -43,7 +43,8 @@ class aligned_stack {
template
<
typename
T
,
typename
...
ARGS
>
T
*
push
(
ARGS
&&
...
args
);
template
<
typename
T
>
void
*
push
();
void
*
push_bytes
();
void
*
push_bytes
(
size_t
size
);
template
<
typename
T
>
T
pop
();
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/data_structures/aligned_stack_impl.h
View file @
4bb8c2e5
...
...
@@ -9,20 +9,12 @@ namespace data_structures {
template
<
typename
T
,
typename
...
ARGS
>
T
*
aligned_stack
::
push
(
ARGS
&&
...
args
)
{
// Perfect-Forward construct
return
new
(
push
<
T
>
())
T
(
std
::
forward
<
ARGS
>
(
args
)...);
return
new
(
push
_bytes
<
T
>
())
T
(
std
::
forward
<
ARGS
>
(
args
)...);
}
template
<
typename
T
>
void
*
aligned_stack
::
push
()
{
void
*
result
=
reinterpret_cast
<
T
*>
(
head_
);
// Move head to next aligned position after new object
head_
=
base
::
alignment
::
next_alignment
(
head_
+
sizeof
(
T
));
if
(
head_
>=
memory_end_
)
{
PLS_ERROR
(
"Tried to allocate object on alligned_stack without sufficient memory!"
);
}
return
result
;
void
*
aligned_stack
::
push_bytes
()
{
return
push_bytes
(
sizeof
(
T
));
}
template
<
typename
T
>
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/data_structures/work_stealing_deque_impl.h
View file @
4bb8c2e5
...
...
@@ -28,7 +28,7 @@ std::pair<work_stealing_deque_item, T> *work_stealing_deque<Item>::allocate_item
// 'Union' type to push both on stack
using
pair_t
=
std
::
pair
<
work_stealing_deque_item
,
T
>
;
// Allocate space on stack
auto
new_pair
=
reinterpret_cast
<
pair_t
*>
(
stack_
->
push
<
pair_t
>
());
auto
new_pair
=
reinterpret_cast
<
pair_t
*>
(
stack_
->
push
_bytes
<
pair_t
>
());
// Initialize memory on stack
new
((
void
*
)
&
(
new_pair
->
first
))
work_stealing_deque_item
();
new
((
void
*
)
&
(
new_pair
->
second
))
T
(
std
::
forward
<
ARGS
>
(
args
)...);
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/helpers/mini_benchmark.h
View file @
4bb8c2e5
...
...
@@ -18,7 +18,7 @@ void run_mini_benchmark(const Function &lambda, size_t max_threads, unsigned lon
using
namespace
std
;
using
namespace
pls
::
internal
::
scheduling
;
malloc_scheduler_memory
scheduler_memory
{
max_threads
};
malloc_scheduler_memory
scheduler_memory
{
max_threads
,
2
<<
12
};
for
(
unsigned
int
num_threads
=
1
;
num_threads
<=
max_threads
;
num_threads
++
)
{
scheduler
local_scheduler
{
&
scheduler_memory
,
num_threads
};
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/task.h
View file @
4bb8c2e5
...
...
@@ -50,11 +50,10 @@ void task::spawn_child(ARGS &&... args) {
ref_count_
++
;
// Push on our deque
auto
deque_state
=
thread_state
::
get
()
->
deque_
.
save_state
();
thread_state
::
get
()
->
deque_
.
push_tail
<
T
>
([
this
,
deque_state
](
T
*
item
)
{
thread_state
::
get
()
->
deque_
.
push_tail
<
T
>
([
this
](
T
*
item
)
{
// Assign forced values (for stack and parent management)
item
->
parent_
=
this
;
item
->
deque_state_
=
deque_state
;
item
->
deque_state_
=
thread_state
::
get
()
->
deque_
.
save_state
()
;
},
std
::
forward
<
ARGS
>
(
args
)...);
}
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/pls.h
View file @
4bb8c2e5
...
...
@@ -3,6 +3,7 @@
#include "pls/algorithms/invoke_parallel.h"
#include "pls/algorithms/parallel_for.h"
#include "pls/algorithms/parallel_scan.h"
#include "pls/internal/scheduling/task.h"
#include "pls/internal/scheduling/scheduler.h"
#include "pls/internal/helpers/unique_id.h"
...
...
@@ -23,7 +24,7 @@ using internal::scheduling::task;
using
algorithm
::
invoke_parallel
;
using
algorithm
::
parallel_for
;
using
algorithm
::
parallel_scan
;
}
#endif
This diff is collapsed.
Click to expand it.
lib/pls/src/internal/data_structures/aligned_stack.cpp
View file @
4bb8c2e5
...
...
@@ -15,6 +15,18 @@ aligned_stack::aligned_stack(char *memory_region, const std::size_t size) :
memory_end_
{(
pointer_t
)
memory_region
+
size
},
head_
{
base
::
alignment
::
next_alignment
(
memory_start_
)}
{}
void
*
aligned_stack
::
push_bytes
(
size_t
size
)
{
void
*
result
=
reinterpret_cast
<
void
*>
(
head_
);
// Move head to next aligned position after new object
head_
=
base
::
alignment
::
next_alignment
(
head_
+
size
);
if
(
head_
>=
memory_end_
)
{
PLS_ERROR
(
"Tried to allocate object on alligned_stack without sufficient memory!"
);
}
return
result
;
}
}
}
}
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment