Commit 79ac0243 by FritzFlorian

Add two 'standardized' benchmarks.

parent 2f539691
Pipeline #1371 failed in 37 seconds
@@ -26,6 +26,9 @@ list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Each library has its own CMakeLists.txt that should make it available as a library target,
 # thus allowing one to include it as any other cmake dependency later on.
 add_subdirectory(extern/catch2)
+add_subdirectory(extern/picosha2)
+add_subdirectory(extern/benchmark_base)
+add_subdirectory(extern/benchmark_runner)

 # Include all internal subprojects (library, examples, testing).
 add_subdirectory(lib/pls)
...
-add_executable(benchmark_fft main.cpp)
-target_link_libraries(benchmark_fft pls)
-if(EASY_PROFILER)
-  target_link_libraries(benchmark_fft easy_profiler)
-endif()
+add_executable(benchmark_fft_pls_v2 main.cpp)
+target_link_libraries(benchmark_fft_pls_v2 pls benchmark_runner benchmark_base)
+if (EASY_PROFILER)
+  target_link_libraries(benchmark_fft_pls_v2 easy_profiler)
+endif ()
@@ -2,142 +2,91 @@
#include "pls/internal/scheduling/parallel_result.h"
#include "pls/internal/scheduling/scheduler_memory.h"
#include "pls/internal/helpers/profiler.h"

using namespace pls::internal::scheduling;

#include <iostream>
#include <complex>
#include <vector>

#include "benchmark_runner.h"
#include "benchmark_base/fft.h"

using namespace comparison_benchmarks::base;

parallel_result<short> conquer(fft::complex_vector::iterator data, int n) {
  if (n < 2) {
    return parallel_result<short>{0};
  }

  fft::divide(data, n);
  if (n <= fft::RECURSIVE_CUTOFF) {
    fft::conquer(data, n / 2);
    fft::conquer(data + n / 2, n / 2);
    fft::combine(data, n);
    return parallel_result<short>{0};
  } else {
    return scheduler::par([=]() {
      return conquer(data, n / 2);
    }, [=]() {
      return conquer(data + n / 2, n / 2);
    }).then([=](int, int) {
      fft::combine(data, n);
      return parallel_result<short>{0};
    });
  }
}

constexpr int MAX_NUM_THREADS = 8;
constexpr int MAX_NUM_TASKS = 64;
constexpr int MAX_NUM_CONTS = 64;
constexpr int MAX_CONT_SIZE = 256;

int main(int argc, char **argv) {
  int num_threads;
  string directory;
  benchmark_runner::read_args(argc, argv, num_threads, directory);

  string test_name = to_string(num_threads) + ".csv";
  string full_directory = directory + "/PLS_v2/";
  benchmark_runner runner{full_directory, test_name};

  fft::complex_vector data = fft::generate_input();

  static_scheduler_memory<MAX_NUM_THREADS,
                          MAX_NUM_TASKS,
                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;

  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};

  for (int i = 0; i < fft::NUM_WARMUP_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
      return scheduler::par([&]() {
        return conquer(data.begin(), fft::SIZE);
      }, []() {
        return parallel_result<short>{0};
      }).then([&](short, short) {
        return parallel_result<int>{0};
      });
    });
  }

  for (int i = 0; i < fft::NUM_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
      runner.start_iteration();
      return scheduler::par([&]() {
        return conquer(data.begin(), fft::SIZE);
      }, []() {
        return parallel_result<short>{0};
      }).then([&](short, short) {
        runner.end_iteration();
        return parallel_result<int>{0};
      });
    });
  }
  runner.commit_results(true);

  return 0;
}
-add_executable(benchmark_matrix main.cpp)
-target_link_libraries(benchmark_matrix pls)
-if (EASY_PROFILER)
-  target_link_libraries(benchmark_matrix easy_profiler)
-endif ()
+add_executable(benchmark_matrix_pls_v2 main.cpp)
+target_link_libraries(benchmark_matrix_pls_v2 pls benchmark_runner benchmark_base)
+if (EASY_PROFILER)
+  target_link_libraries(benchmark_matrix_pls_v2 easy_profiler)
+endif ()
@@ -2,112 +2,78 @@
#include "pls/internal/scheduling/parallel_result.h"
#include "pls/internal/scheduling/scheduler_memory.h"
#include "pls/algorithms/for_each.h"

using namespace pls::internal::scheduling;

#include "benchmark_runner.h"
#include "benchmark_base/matrix.h"

using namespace comparison_benchmarks::base;

template<typename T, int SIZE>
class pls_matrix : public matrix::matrix<T, SIZE> {
 public:
  pls_matrix() : matrix::matrix<T, SIZE>() {}

  parallel_result<int> pls_multiply(const matrix::matrix<T, SIZE> &a, const matrix::matrix<T, SIZE> &b) {
    return pls::algorithm::for_each_range(0, SIZE, [this, &a, &b](int i) {
      this->multiply_column(i, a, b);
    });
  }
};

constexpr size_t MAX_NUM_THREADS = 8;
constexpr size_t MAX_NUM_TASKS = 32;
constexpr size_t MAX_NUM_CONTS = 32;
constexpr size_t MAX_CONT_SIZE = 512;

int main(int argc, char **argv) {
  int num_threads;
  string directory;
  benchmark_runner::read_args(argc, argv, num_threads, directory);

  string test_name = to_string(num_threads) + ".csv";
  string full_directory = directory + "/PLS_v2/";
  benchmark_runner runner{full_directory, test_name};

  pls_matrix<double, matrix::MATRIX_SIZE> a;
  pls_matrix<double, matrix::MATRIX_SIZE> b;
  pls_matrix<double, matrix::MATRIX_SIZE> result;

  static_scheduler_memory<MAX_NUM_THREADS,
                          MAX_NUM_TASKS,
                          MAX_NUM_CONTS,
                          MAX_CONT_SIZE> static_scheduler_memory;

  scheduler scheduler{static_scheduler_memory, (unsigned int) num_threads};

  for (int i = 0; i < matrix::WARMUP_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
      return scheduler::par([&]() {
        return result.pls_multiply(a, b);
      }, []() {
        return parallel_result<int>{0};
      }).then([&](int, int) {
        return parallel_result<int>{0};
      });
    });
  }

  for (int i = 0; i < matrix::NUM_ITERATIONS; i++) {
    scheduler.perform_work([&]() {
      runner.start_iteration();
      return scheduler::par([&]() {
        return result.pls_multiply(a, b);
      }, []() {
        return parallel_result<int>{0};
      }).then([&](int, int) {
        runner.end_iteration();
        return parallel_result<int>{0};
      });
    });
  }
  runner.commit_results(true);
}
@@ -51,22 +51,22 @@ parallel_result<int> unbalanced_tree_search(int seed, int root_children, double
   return result;
 }

-constexpr size_t NUM_THREADS = 5;
-constexpr size_t NUM_TASKS = 128;
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_THREADS = 5;
+constexpr size_t MAX_NUM_TASKS = 128;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 512;

 volatile int result;

 int main() {
   PROFILE_ENABLE
-  static_scheduler_memory<NUM_THREADS,
-                          NUM_TASKS,
-                          NUM_CONTS,
+  static_scheduler_memory<MAX_NUM_THREADS,
+                          MAX_NUM_TASKS,
+                          MAX_NUM_CONTS,
                           MAX_CONT_SIZE> static_scheduler_memory;

-  scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};

   scheduler.perform_work([&]() {
     return scheduler::par([&]() {
...
@@ -8,12 +8,12 @@
 using namespace pls::internal;

-constexpr size_t NUM_THREADS = 4;
-constexpr size_t NUM_TASKS = 128;
-static constexpr int NUM_ITERATIONS = 100;
-constexpr size_t NUM_CONTS = 128;
+constexpr size_t MAX_NUM_THREADS = 1;
+constexpr size_t MAX_NUM_TASKS = 128;
+static constexpr int NUM_ITERATIONS = 10;
+constexpr size_t MAX_NUM_CONTS = 128;
 constexpr size_t MAX_CONT_SIZE = 256;

 int fib_normal(int n) {
@@ -29,8 +29,13 @@ int fib_normal(int n) {
 }

 scheduling::parallel_result<int> fib(int n) {
-  if (n <= 10) {
-    return fib_normal(n);
+  pls::variable<int> i;
+  pls::array<int> a{10};
+
+  if (n == 0) {
+    return 0;
+  }
+  if (n == 1) {
+    return 1;
   }

   return scheduling::scheduler::par([=]() {
@@ -45,12 +50,12 @@ scheduling::parallel_result<int> fib(int n) {
 static volatile int result;
 int main() {
   PROFILE_ENABLE;
-  scheduling::static_scheduler_memory<NUM_THREADS,
-                                      NUM_TASKS,
-                                      NUM_CONTS,
+  scheduling::static_scheduler_memory<MAX_NUM_THREADS,
+                                      MAX_NUM_TASKS,
+                                      MAX_NUM_CONTS,
                                       MAX_CONT_SIZE> static_scheduler_memory;

-  scheduling::scheduler scheduler{static_scheduler_memory, NUM_THREADS};
+  scheduling::scheduler scheduler{static_scheduler_memory, MAX_NUM_THREADS};

   auto start = std::chrono::steady_clock::now();
   for (int i = 0; i < NUM_ITERATIONS; i++) {
...
# Configuration and common algorithm pieces for benchmarks
configure_file(src/sample_images.cpp.in sample_images.cpp)
add_library(benchmark_base STATIC
${CMAKE_CURRENT_BINARY_DIR}/sample_images.cpp
src/fft.cpp include/benchmark_base/fft.h
include/benchmark_base/heat.h
include/benchmark_base/matrix.h
include/benchmark_base/unbalanced.h src/unbalanced.cpp
include/benchmark_base/range.h)
target_include_directories(benchmark_base
PUBLIC
$<INSTALL_INTERFACE:include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
)
target_link_libraries(benchmark_base picosha2)
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
#ifndef COMPARISON_BENCHMARKS_BASE_FFT_H
#define COMPARISON_BENCHMARKS_BASE_FFT_H
#include <complex>
#include <string>
#include <vector>
namespace comparison_benchmarks {
namespace base {
namespace fft {
const int SIZE = 8192;
const int NUM_ITERATIONS = 1000;
const int NUM_WARMUP_ITERATIONS = 100;
const int RECURSIVE_CUTOFF = 32;
typedef std::vector<std::complex<double>> complex_vector;
complex_vector generate_input();
void divide(complex_vector::iterator data, int n);
void conquer(complex_vector::iterator data, int n);
void combine(complex_vector::iterator data, int n);
}
}
}
#endif //COMPARISON_BENCHMARKS_BASE_FFT_H
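The header above only declares the shared FFT pieces; their serial definitions follow in src/fft.cpp further below. As orientation, a minimal serial driver could look like the following sketch. It is not part of this commit; the main function and the printed sanity check are illustrative, assuming the program is linked against benchmark_base.

#include <iostream>

#include "benchmark_base/fft.h"

using namespace comparison_benchmarks::base;

int main() {
  // Build the standard input signal (a sum of the known test frequencies).
  fft::complex_vector data = fft::generate_input();

  // Run the plain recursive FFT once over the whole input.
  fft::conquer(data.begin(), fft::SIZE);

  // Inspect a few output bins as a rough sanity check.
  for (int i = 0; i < 8; i++) {
    std::cout << i << ": " << std::abs(data[i]) << std::endl;
  }
  return 0;
}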
#ifndef COMPARISON_BENCHMARKS_BASE_HEAT_H
#define COMPARISON_BENCHMARKS_BASE_HEAT_H
#include <array>
#include <iostream>
#include <memory>
namespace comparison_benchmarks {
namespace base {
namespace heat {
const int DIFFUSION_SIZE = 256;
const int DIFFUSION_STEPS = 256;
const int NUM_ITERATIONS = 100;
const int WARMUP_ITERATIONS = 20;
template<typename T, int SIZE>
class heat_diffusion {
// Center portion is SIZExSIZE, borders are fixed temperature values
using matrix = std::array<std::array<T, SIZE + 2>, SIZE + 2>;
protected:
// Sane default values for the simulation (from the paper).
// This is not about perfect simulation results but the speedup of the workload.
double c = 0.1;
double d_s = 1.0 / (SIZE + 1);
double d_t = (d_s * d_s) / (4 * c);
public:
matrix *current_data;
matrix *next_data;
explicit heat_diffusion() {
current_data = new matrix;
next_data = new matrix;
reset_data();
}
~heat_diffusion() {
delete current_data;
delete next_data;
}
virtual void run_simulation(int n) {
for (int i = 0; i < n; i++) {
for (int row = 1; row <= SIZE; row++) {
for (int column = 1; column <= SIZE; column++) {
update_element(row, column);
}
}
// Synchronization point needed to coordinate the calculation!
swap_data_arrays();
}
}
protected:
void update_element(int row, int column) {
(*next_data)[row][column] = (*current_data)[row][column] + ((c * d_t) / (d_s * d_s)) *
((*current_data)[row + 1][column] + (*current_data)[row - 1][column]
- 4 * (*current_data)[row][column]
+ (*current_data)[row][column + 1] + (*current_data)[row][column - 1]);
}
void swap_data_arrays() {
matrix *tmp = current_data;
current_data = next_data;
next_data = tmp;
}
void reset_data() {
for (int row = 0; row < SIZE + 2; row++) {
for (int column = 0; column < SIZE + 2; column++) {
(*current_data)[row][column] = 0.0;
(*next_data)[row][column] = 0.0;
// Edges are a fixed, hot temperature
if (row == 0 || row == SIZE + 1) {
(*current_data)[row][column] = 1.0;
(*next_data)[row][column] = 1.0;
}
}
}
}
};
template<typename T, int SIZE>
std::ostream &operator<<(std::ostream &strm, const heat_diffusion<T, SIZE> &simulation) {
for (int i = 0; i < SIZE + 2; i++) {
for (int j = 0; j < SIZE + 2; j++) {
// 'color' our output according to temperature
char out;
      if ((*simulation.current_data)[i][j] < 0.1) {
        out = ' ';
      } else if ((*simulation.current_data)[i][j] < 0.2) {
        out = '-';
      } else if ((*simulation.current_data)[i][j] < 0.5) {
        out = '=';
      } else {
        out = '#';
      }
strm << out << "\t";
}
strm << std::endl;
}
return strm;
}
}
}
}
#endif //COMPARISON_BENCHMARKS_BASE_HEAT_H
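The heat kernel above is self-contained and driven purely through run_simulation. A minimal serial driver could look like the sketch below; it is illustrative only and not taken from the commit, assuming a standalone test program linked against benchmark_base.

#include <iostream>

#include "benchmark_base/heat.h"

using namespace comparison_benchmarks::base;

int main() {
  // DIFFUSION_SIZE x DIFFUSION_SIZE inner grid; top and bottom borders start out hot (see reset_data()).
  heat::heat_diffusion<double, heat::DIFFUSION_SIZE> simulation;

  // Advance the explicit finite-difference scheme by the configured number of steps.
  simulation.run_simulation(heat::DIFFUSION_STEPS);

  // The stream operator renders the temperature field as coarse ASCII art.
  std::cout << simulation;
  return 0;
}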
#ifndef COMPARISON_BENCHMARKS_BASE_MATRIX_H
#define COMPARISON_BENCHMARKS_BASE_MATRIX_H
#include <algorithm>
#include <iostream>
namespace comparison_benchmarks {
namespace base {
namespace matrix {
const int MATRIX_SIZE = 128;
const int NUM_ITERATIONS = 5000;
const int WARMUP_ITERATIONS = 1000;
template<typename T, int SIZE>
class matrix {
public:
T data[SIZE][SIZE];
  explicit matrix() {
    for (int i = 0; i < SIZE; i++) {
      for (int j = 0; j < SIZE; j++) {
        data[i][j] = i;
      }
    }
  }
virtual void multiply(const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
for (int i = 0; i < SIZE; i++) {
multiply_column(i, a, b);
}
}
protected:
void multiply_column(int i, const matrix<T, SIZE> &a, const matrix<T, SIZE> &b) {
for (int j = 0; j < SIZE; ++j) {
data[i][j] = 0;
}
for (int k = 0; k < SIZE; ++k) {
for (int j = 0; j < SIZE; ++j) {
T a_data = a.data[i][k];
T b_data = b.data[k][j];
data[i][j] += a_data * b_data;
}
}
}
};
template<typename T, int SIZE>
std::ostream &operator<<(std::ostream &strm, const matrix<T, SIZE> &matrix) {
for (int i = 0; i < SIZE; i++) {
for (int j = 0; j < SIZE; j++) {
strm << matrix.data[i][j] << "\t";
}
strm << std::endl;
}
return strm;
}
}
}
}
#endif //COMPARISON_BENCHMARKS_BASE_MATRIX_H
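For comparison with the parallel pls_matrix wrapper shown earlier, the base class can also be driven serially. A small sketch follows; it is illustrative and not part of the commit, assuming a standalone test program linked against benchmark_base.

#include <iostream>

#include "benchmark_base/matrix.h"

using namespace comparison_benchmarks::base;

int main() {
  // The constructor fills every element of row i with the value i.
  matrix::matrix<double, matrix::MATRIX_SIZE> a;
  matrix::matrix<double, matrix::MATRIX_SIZE> b;
  matrix::matrix<double, matrix::MATRIX_SIZE> result;

  // Plain row-by-row multiplication; multiply() is virtual so schedulers can substitute a parallel version.
  result.multiply(a, b);

  std::cout << result.data[0][0] << std::endl;
  return 0;
}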
#ifndef COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
#define COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
#include <cstdint>
#include <array>
#include <vector>
#include "picosha2.h"
namespace comparison_benchmarks {
namespace base {
namespace unbalanced {
const int SEED = 42;
const int ROOT_CHILDREN = 140;
const double Q = 0.124875;
const int NORMAL_CHILDREN = 8;
const int NUM_NODES = 71069;
const int NUM_ITERATIONS = 50;
const int WARMUP_ITERATIONS = 5;
using node_state = std::array<uint8_t, 20>;
/**
 * Node of an unbalanced binomial tree (https://www.cs.unc.edu/~olivier/LCPC06.pdf).
 * To build up the tree, recursively call spawn_child_node on each node until the leaves are reached.
* The tree is not built up directly in memory, but rather by the recursive calls.
*/
class node {
// The state is used to allow a deterministic tree construction using sha256 hashes.
node_state state_;
// Number of children for the current node
int num_children_;
// Set this to a positive number for the root node to start the tree with a specific size
int root_children_;
// general branching factors
double q_;
int b_;
// Private constructor for children
node(node_state state, double q, int b) : state_{state},
num_children_{0},
root_children_{-1},
q_{q},
b_{b} { init_num_children(); }
std::array<uint8_t, 20> generate_child_state(uint32_t index);
double get_state_random();
void init_num_children() {
double state_random = get_state_random();
if (root_children_ > 0) {
num_children_ = root_children_; // Root always spawns children
} else if (state_random < q_) {
num_children_ = b_;
} else {
num_children_ = 0;
}
}
public:
node(uint32_t seed, int root_children, double q, int b)
: state_({{}}), num_children_{0}, root_children_{root_children}, q_{q}, b_{b} {
for (int i = 0; i < 16; i++) {
state_[i] = 0;
}
state_[16] = static_cast<uint8_t>(0xFFu & (seed >> 24u));
state_[17] = static_cast<uint8_t>(0xFFu & (seed >> 16u));
state_[18] = static_cast<uint8_t>(0xFFu & (seed >> 8u));
state_[19] = static_cast<uint8_t>(0xFFu & (seed >> 0u));
picosha2::hash256_one_by_one hasher;
hasher.process(state_.begin(), state_.end());
hasher.finish();
hasher.get_hash_bytes(state_.begin(), state_.end());
init_num_children();
}
int get_num_children() const { return num_children_; }
node spawn_child_node(int index) {
return {generate_child_state(index), q_, b_};
}
};
}
}
}
#endif //COMPARISON_BENCHMARKS_BASE_UNBALANCED_H_
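The node class encodes the unbalanced tree implicitly: each node derives its child count and child states deterministically from its SHA-256 state, so traversing the tree means recursively spawning children. A minimal serial traversal that counts the tree is sketched below; count_child_nodes and the main function are illustrative names under the assumption of a standalone test program, while the parallel unbalanced_tree_search in the benchmark follows the same recursion.

#include <iostream>

#include "benchmark_base/unbalanced.h"

using namespace comparison_benchmarks::base;

// Recursively visit a node and all of its children, returning the subtree size.
int count_child_nodes(unbalanced::node n) {
  int count = 1;
  for (int i = 0; i < n.get_num_children(); i++) {
    count += count_child_nodes(n.spawn_child_node(i));
  }
  return count;
}

int main() {
  // Root node with the benchmark's standard parameters.
  unbalanced::node root{unbalanced::SEED, unbalanced::ROOT_CHILDREN,
                        unbalanced::Q, unbalanced::NORMAL_CHILDREN};

  // For these parameters the tree should contain about unbalanced::NUM_NODES nodes.
  std::cout << count_child_nodes(root) << std::endl;
  return 0;
}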
#include "benchmark_base/fft.h"
namespace comparison_benchmarks {
namespace base {
namespace fft {
complex_vector generate_input() {
std::vector<double> known_frequencies{2, 11, 52, 88, 256};
fft::complex_vector data(SIZE);
// Set our input data to match a time series of the known_frequencies.
// When applying fft to this time-series we should find these frequencies.
for (int i = 0; i < SIZE; i++) {
data[i] = std::complex<double>(0.0, 0.0);
for (auto frequencie : known_frequencies) {
data[i] += sin(2 * M_PI * frequencie * i / SIZE);
}
}
return data;
}
void divide(complex_vector::iterator data, int n) {
complex_vector tmp_odd_elements(n / 2);
for (int i = 0; i < n / 2; i++) {
tmp_odd_elements[i] = data[i * 2 + 1];
}
for (int i = 0; i < n / 2; i++) {
data[i] = data[i * 2];
}
for (int i = 0; i < n / 2; i++) {
data[i + n / 2] = tmp_odd_elements[i];
}
}
void combine(complex_vector::iterator data, int n) {
for (int i = 0; i < n / 2; i++) {
std::complex<double> even = data[i];
std::complex<double> odd = data[i + n / 2];
// w is the "twiddle-factor".
// this could be cached, but we run the same 'base' algorithm parallel/serial,
// so it won't impact the performance comparison.
std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
data[i] = even + w * odd;
data[i + n / 2] = even - w * odd;
}
}
void conquer(complex_vector::iterator data, int n) {
if (n < 2) {
return;
}
divide(data, n);
conquer(data, n / 2);
conquer(data + n / 2, n / 2);
combine(data, n);
}
}
}
}
#include <vector>
#include <string>
#include <iostream>
#include <sstream>
using namespace std;
namespace comparison_benchmarks {
namespace base {
vector<string> get_sample_image_paths() {
const int num_images = 19;
vector<string> result(num_images);
for (int i = 0; i < num_images; i++) {
ostringstream string_stream;
string_stream << "@CMAKE_CURRENT_SOURCE_DIR@/sample_images/" << i << ".jpg";
result[i] = string_stream.str();
}
return result;
}
}
}
#include "benchmark_base/unbalanced.h"
namespace comparison_benchmarks {
namespace base {
namespace unbalanced {
node_state node::generate_child_state(uint32_t index) {
node_state result;
picosha2::hash256_one_by_one hasher;
hasher.process(state_.begin(), state_.end());
auto index_begin = reinterpret_cast<uint8_t *>(&index);
hasher.process(index_begin, index_begin + 4);
hasher.finish();
hasher.get_hash_bytes(result.begin(), result.end());
return result;
}
double node::get_state_random() {
int32_t state_random_integer;
uint32_t b = ((uint32_t) state_[16] << 24u) |
((uint32_t) state_[17] << 16u) |
((uint32_t) state_[18] << 8u) |
((uint32_t) state_[19] << 0u);
b = b & 0x7fffffff; // Mask out negative values
state_random_integer = static_cast<int32_t>(b);
return (double) state_random_integer / (double) INT32_MAX;
}
}
}
}
add_library(benchmark_runner INTERFACE)
target_include_directories(benchmark_runner INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
#ifndef BENCHMARK_RUNNER_H
#define BENCHMARK_RUNNER_H
#include <string>
#include <cstdlib>
#include <vector>
#include <chrono>
#include <numeric>
#include <iostream>
#include <fstream>
#include <functional>
using namespace std;
class benchmark_runner {
private:
string csv_path_;
string csv_name_;
chrono::steady_clock::time_point last_start_time_;
vector<long> times_;
void print_statistics() {
long time_sum = std::accumulate(times_.begin(), times_.end(), 0l);
cout << "Average Runtime (us): " << (time_sum / times_.size()) << endl;
}
inline bool file_exists(const std::string &name) {
ifstream f(name);
return f.good();
}
public:
benchmark_runner(string csv_path, string csv_name) : csv_path_{std::move(csv_path)},
csv_name_{std::move(csv_name)},
times_{} {
string command = "mkdir -p " + csv_path_;
int res = system(command.c_str());
if (res) {
cout << "Error while creating directory!" << endl;
exit(1);
}
}
static void read_args(int argc, char **argv, int &num_threads, string &path) {
if (argc < 3) {
cout << "Must Specifiy concurrency and output directory! (usage: `benchmark <output_directory> <num_threads>`)"
<< endl;
exit(1);
}
string tmp = argv[1];
path = tmp;
num_threads = atoi(argv[2]);
}
void start_iteration() {
last_start_time_ = chrono::steady_clock::now();
}
void end_iteration() {
auto end_time = chrono::steady_clock::now();
long time = chrono::duration_cast<chrono::microseconds>(end_time - last_start_time_).count();
times_.emplace_back(time);
}
void run_iterations(int count, function<void(void)> f, int warmup_count) {
for (int i = 0; i < warmup_count; i++) {
f();
}
for (int i = 0; i < count; i++) {
start_iteration();
f();
end_iteration();
}
}
void commit_results(bool print_stats) {
if (print_stats) {
print_statistics();
}
string full_filename = csv_path_ + csv_name_;
bool write_header = !file_exists(full_filename);
{ // Scope for output file
ofstream o(full_filename, std::fstream::out | std::fstream::app);
if (write_header) {
o << "runtime_us" << endl;
}
for (auto time : times_) {
o << time << endl;
}
} // End Scope for output file
times_.clear();
}
};
#endif //BENCHMARK_RUNNER_H
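The benchmark mains above call start_iteration()/end_iteration() by hand because the measured region sits inside scheduler.perform_work(); for simple serial baselines the run_iterations() helper covers the same bookkeeping. A minimal usage sketch follows, where run_workload and MySchedulerName are illustrative placeholders rather than names from this commit.

#include "benchmark_runner.h"

// Placeholder for whatever should be measured.
void run_workload() { /* ... */ }

int main(int argc, char **argv) {
  int num_threads;
  string directory;
  // Usage: ./benchmark <output_directory> <num_threads>
  benchmark_runner::read_args(argc, argv, num_threads, directory);

  // One CSV per thread count, e.g. <output_directory>/MySchedulerName/8.csv
  benchmark_runner runner{directory + "/MySchedulerName/", to_string(num_threads) + ".csv"};

  // 5 warmup runs that are not recorded, then 50 timed runs.
  runner.run_iterations(50, run_workload, 5);

  // Print the average and append the per-iteration times to the CSV file.
  runner.commit_results(true);
  return 0;
}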
add_library(picosha2 INTERFACE)
target_include_directories(picosha2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
\ No newline at end of file
MIT License
Copyright (c) 2017 okdshin
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
@@ -29,12 +29,12 @@ pls::internal::scheduling::parallel_result<int> for_each(const RandomIt first,
   // Cut in half recursively
   const long middle_index = num_elements / 2;

-  return scheduler::par([first, middle_index, last, &function, min_elements] {
+  return scheduler::par([first, middle_index, last, function, min_elements] {
     return internal::for_each(first,
                               first + middle_index,
                               function,
                               min_elements);
-  }, [first, middle_index, last, &function, min_elements] {
+  }, [first, middle_index, last, function, min_elements] {
     return internal::for_each(first + middle_index,
                               last,
                               function,
...
@@ -112,7 +112,7 @@ struct basic_range {
       : r(rhs.r), index(rhs.index) {}

   const_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-      : r(p_range), index(p_index) {}
+      : r(*p_range), index(p_index) {}

   const_iterator_impl &operator=(const const_iterator_impl &rhs) {
     r = rhs.r;
@@ -121,7 +121,7 @@ struct basic_range {
   }

   bool operator==(const const_iterator_impl &rhs) const {
-    return *r == *(rhs.r) && index == rhs.index;
+    return r == rhs.r && index == rhs.index;
   }

   bool operator!=(const const_iterator_impl &rhs) const {
@@ -145,7 +145,7 @@ struct basic_range {
   }

   value_type operator*() const {
-    return r->m_first_element + r->m_step * index;
+    return r.m_first_element + r.m_step * index;
   }

   // operator->
@@ -212,11 +212,11 @@ struct basic_range {
   value_type operator[](difference_type offset) const {
     size_type new_index = index + offset;
-    return r->m_first_element + r->m_step * new_index;
+    return r.m_first_element + r.m_step * new_index;
   }

  private:
-  basic_range<IntegerType> const *r;
+  basic_range<IntegerType> r;
   size_type index;
 };
@@ -236,7 +236,7 @@ struct basic_range {
       : r(rhs.r), index(rhs.index) {}

   const_reverse_iterator_impl(basic_range<IntegerType> const *p_range, size_type p_index)
-      : r(p_range), index(p_index) {}
+      : r(*p_range), index(p_index) {}

   const_reverse_iterator_impl &operator=(const const_reverse_iterator_impl &rhs) {
     r = rhs.r;
@@ -245,7 +245,7 @@ struct basic_range {
   }

   bool operator==(const const_reverse_iterator_impl &rhs) const {
-    return *r == *(rhs.r) && index == rhs.index;
+    return r == rhs.r && index == rhs.index;
   }

   bool operator!=(const const_reverse_iterator_impl &rhs) const {
@@ -270,8 +270,8 @@ struct basic_range {
   value_type operator*() const {
     size_type reverse_index
-        = (r->m_element_count - 1) - index;
-    return r->m_first_element + r->m_step * reverse_index;
+        = (r.m_element_count - 1) - index;
+    return r.m_first_element + r.m_step * reverse_index;
   }

   // operator->
@@ -338,12 +338,12 @@ struct basic_range {
   value_type operator[](difference_type offset) const {
     size_type new_reverse_index
-        = (r->m_element_count - 1) - (index + offset);
-    return r->m_first_element + r->m_step * new_reverse_index;
+        = (r.m_element_count - 1) - (index + offset);
+    return r.m_first_element + r.m_step * new_reverse_index;
   }

  private:
-  basic_range<IntegerType> const *r;
+  basic_range<IntegerType> r;
   size_type index;
 };
...