Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
las3_pub
/
predictable_parallel_patterns
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
86333a60
authored
4 years ago
by
FritzFlorian
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Final preparations for single-app benchmark runs.
parent
e2f584c4
Pipeline
#1515
passed with stages
in 4 minutes 37 seconds
Changes
4
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
47 additions
and
26 deletions
+47
-26
app/benchmark_matrix_div_conquer/main.cpp
+40
-24
lib/pls/include/pls/internal/scheduling/scheduler.h
+1
-1
lib/pls/include/pls/internal/scheduling/strain_local_resource.h
+3
-1
lib/pls/include/pls/internal/scheduling/thread_state.h
+3
-0
No files found.
app/benchmark_matrix_div_conquer/main.cpp
View file @
86333a60
...
...
@@ -8,30 +8,39 @@ using namespace comparison_benchmarks::base;
#include <memory>
#include <array>
#include <algorithm>
#include <vector>
void
multiply_div_conquer
(
const
std
::
vector
<
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
double
[]
>>>>
&
tmp_arrays
,
pls
::
strain_local_resource
&
local_indices
,
size_t
size
,
size_t
depth
,
size_t
branch
,
matrix_div_conquer
::
blocked_matrix_view
&
result
,
matrix_div_conquer
::
blocked_matrix_view
&
a
,
matrix_div_conquer
::
blocked_matrix_view
&
b
)
{
if
(
size
<=
8
)
{
if
(
size
<=
matrix_div_conquer
::
CUTOFF_SIZE
)
{
multiply_naive
(
size
,
result
,
a
,
b
);
return
;
}
// Temporary storage required for the intermediate results
auto
strain_local_index
=
local_indices
.
get_item
(
depth
);
std
::
unique_ptr
<
double
[]
>
const
&
data_1_1_a
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
0
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_1_b
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
1
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_2_a
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
2
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_2_b
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
3
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_1_a
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
4
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_1_b
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
5
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_2_a
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
6
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_2_b
=
tmp_arrays
[
depth
][
strain_local_index
.
get_strain_index
()][
7
];
size_t
index
;
if
(
depth
==
0
||
(
8u
<<
(
depth
-
1u
))
<=
local_indices
.
get_num_threads
())
{
index
=
branch
;
}
else
{
index
=
strain_local_index
.
get_strain_index
();
}
std
::
unique_ptr
<
double
[]
>
const
&
data_1_1_a
=
tmp_arrays
[
depth
][
index
][
0
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_1_b
=
tmp_arrays
[
depth
][
index
][
1
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_2_a
=
tmp_arrays
[
depth
][
index
][
2
];
std
::
unique_ptr
<
double
[]
>
const
&
data_1_2_b
=
tmp_arrays
[
depth
][
index
][
3
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_1_a
=
tmp_arrays
[
depth
][
index
][
4
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_1_b
=
tmp_arrays
[
depth
][
index
][
5
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_2_a
=
tmp_arrays
[
depth
][
index
][
6
];
std
::
unique_ptr
<
double
[]
>
const
&
data_2_2_b
=
tmp_arrays
[
depth
][
index
][
7
];
// Handles to sub-matrices used
matrix_div_conquer
::
blocked_matrix_view
result_1_1
=
result
.
quadrant_1_1
();
...
...
@@ -60,31 +69,31 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
// Divide Work Into Sub-Calls
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_1_1_a
,
a_1_1
,
b_1_1
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
0
,
result_1_1_a
,
a_1_1
,
b_1_1
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_1_1_b
,
a_1_2
,
b_2_1
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
1
,
result_1_1_b
,
a_1_2
,
b_2_1
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_1_2_a
,
a_1_1
,
b_1_2
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
2
,
result_1_2_a
,
a_1_1
,
b_1_2
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_1_2_b
,
a_1_2
,
b_2_2
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
3
,
result_1_2_b
,
a_1_2
,
b_2_2
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_2_1_a
,
a_2_1
,
b_1_1
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
4
,
result_2_1_a
,
a_2_1
,
b_1_1
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_2_1_b
,
a_2_2
,
b_2_1
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
5
,
result_2_1_b
,
a_2_2
,
b_2_1
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_2_2_a
,
a_2_1
,
b_1_2
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
6
,
result_2_2_a
,
a_2_1
,
b_1_2
);
}
);
pls
::
spawn
(
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
result_2_2_b
,
a_2_2
,
b_2_2
);
}
[
&
]()
{
multiply_div_conquer
(
tmp_arrays
,
local_indices
,
size
/
2
,
depth
+
1
,
7
,
result_2_2_b
,
a_2_2
,
b_2_2
);
}
);
pls
::
sync
();
...
...
@@ -99,8 +108,8 @@ void multiply_div_conquer(const std::vector<std::vector<std::vector<std::unique_
}
}
constexpr
int
MAX_NUM_TASKS
=
1
6
;
constexpr
int
MAX_STACK_SIZE
=
4096
*
2
;
constexpr
int
MAX_NUM_TASKS
=
1
0
;
constexpr
int
MAX_STACK_SIZE
=
4096
*
1
;
int
main
(
int
argc
,
char
**
argv
)
{
auto
settings
=
benchmark_runner
::
parse_parameters
(
argc
,
argv
);
...
...
@@ -124,17 +133,24 @@ int main(int argc, char **argv) {
// Strain local data
std
::
vector
<
std
::
vector
<
std
::
vector
<
std
::
unique_ptr
<
double
[]
>>>>
div_conquer_temp_arrays
;
size_t
max_depth
=
0
;
size_t
buffers_needed
=
1
;
size_t
remaining_size
=
size
;
while
(
remaining_size
>
1
)
{
while
(
remaining_size
>
matrix_div_conquer
::
CUTOFF_SIZE
)
{
auto
&
depth_buffers
=
div_conquer_temp_arrays
.
emplace_back
();
for
(
int
thread_id
=
0
;
thread_id
<
8
;
thread_id
++
)
{
buffers_needed
=
std
::
min
(
buffers_needed
,
(
size_t
)
settings
.
num_threads_
);
for
(
int
thread_id
=
0
;
thread_id
<
buffers_needed
;
thread_id
++
)
{
auto
&
depth_thread_buffers
=
depth_buffers
.
emplace_back
();
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
depth_thread_buffers
.
emplace_back
(
new
double
[(
remaining_size
/
2
)
*
(
remaining_size
/
2
)]);
size_t
matrix_elements
=
(
remaining_size
/
2
)
*
(
remaining_size
/
2
);
depth_thread_buffers
.
emplace_back
(
new
double
[
matrix_elements
]);
for
(
size_t
j
=
0
;
j
<
matrix_elements
;
j
+=
32
)
{
depth_thread_buffers
[
i
][
j
]
=
1.0
;
// Touch memory
}
}
}
max_depth
++
;
buffers_needed
*=
8
;
remaining_size
=
remaining_size
/
2
;
}
pls
::
strain_local_resource
local_indices
{(
unsigned
)
settings
.
num_threads_
,
(
unsigned
)
max_depth
};
...
...
@@ -152,7 +168,7 @@ int main(int argc, char **argv) {
runner
.
run_iterations
(
settings
.
iterations_
,
[
&
]()
{
scheduler
.
perform_work
([
&
]()
{
multiply_div_conquer
(
div_conquer_temp_arrays
,
local_indices
,
size
,
0
,
result
,
a
,
b
);
multiply_div_conquer
(
div_conquer_temp_arrays
,
local_indices
,
size
,
0
,
0
,
result
,
a
,
b
);
});
});
runner
.
commit_results
(
true
);
...
...
@@ -163,7 +179,7 @@ int main(int argc, char **argv) {
runner
.
run_periodic
(
settings
.
iterations_
,
settings
.
interval_period_
,
settings
.
interval_deadline_
,
[
&
]()
{
scheduler
.
perform_work
([
&
]()
{
multiply_div_conquer
(
div_conquer_temp_arrays
,
local_indices
,
size
,
0
,
result
,
a
,
b
);
multiply_div_conquer
(
div_conquer_temp_arrays
,
local_indices
,
size
,
0
,
0
,
result
,
a
,
b
);
});
});
runner
.
commit_results
(
true
);
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/scheduler.h
View file @
86333a60
...
...
@@ -45,7 +45,7 @@ class scheduler {
size_t
computation_depth
,
size_t
stack_size
,
bool
reuse_thread
=
true
,
size_t
serial_stack_size
=
4096
*
8
);
size_t
serial_stack_size
=
4096
*
1
);
template
<
typename
ALLOC
>
explicit
scheduler
(
unsigned
int
num_threads
,
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/strain_local_resource.h
View file @
86333a60
...
...
@@ -60,7 +60,7 @@ class strain_local_resource {
};
strain_local_resource
(
unsigned
num_threads
,
unsigned
depth
)
:
local_items_
(
num_threads
)
{
unsigned
depth
)
:
num_threads_
{
num_threads
},
local_items_
(
num_threads
)
{
for
(
unsigned
thread_id
=
0
;
thread_id
<
num_threads
;
thread_id
++
)
{
local_items_
[
thread_id
].
reserve
(
depth
);
for
(
unsigned
i
=
0
;
i
<
depth
;
i
++
)
{
...
...
@@ -70,11 +70,13 @@ class strain_local_resource {
}
}
[[
nodiscard
]]
unsigned
get_num_threads
()
const
{
return
num_threads_
;
}
item_handle
get_item
(
unsigned
depth
);
static
strain_resource
*
get_local_copy
(
strain_resource
*
other_resources
,
unsigned
thread_id
);
static
void
acquire_locally
(
strain_resource
*
other_resources
,
unsigned
thread_id
);
private
:
const
unsigned
num_threads_
;
std
::
vector
<
std
::
vector
<
local_item
>>
local_items_
;
};
...
...
This diff is collapsed.
Click to expand it.
lib/pls/include/pls/internal/scheduling/thread_state.h
View file @
86333a60
...
...
@@ -69,6 +69,9 @@ struct PLS_CACHE_ALIGN thread_state {
stack_allocator_
{
stack_allocator
},
serial_call_stack_size_
{
serial_call_stack_size
}
{
serial_call_stack_
=
stack_allocator
->
allocate_stack
(
serial_call_stack_size_
);
for
(
size_t
i
=
0
;
i
<
serial_call_stack_size
;
i
+=
base
::
system_details
::
CACHE_LINE_SIZE
)
{
serial_call_stack_
[
i
]
=
'a'
;
// Touch the stack
}
};
~
thread_state
()
{
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment