diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a41659..f0f2c75 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,12 +37,9 @@ add_subdirectory(lib/pls)
 
 # Include examples
 add_subdirectory(app/playground)
-add_subdirectory(app/test_for_new)
 add_subdirectory(app/benchmark_fft)
 add_subdirectory(app/benchmark_unbalanced)
 add_subdirectory(app/benchmark_matrix)
-add_subdirectory(app/benchmark_prefix)
-add_subdirectory(app/benchmark_pipeline)
 add_subdirectory(app/benchmark_fib)
 add_subdirectory(app/context_switch)
diff --git a/app/benchmark_pipeline/CMakeLists.txt b/app/benchmark_pipeline/CMakeLists.txt
deleted file mode 100644
index d531b74..0000000
--- a/app/benchmark_pipeline/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_executable(benchmark_pipeline main.cpp)
-target_link_libraries(benchmark_pipeline pls)
-if (EASY_PROFILER)
-    target_link_libraries(benchmark_pipeline easy_profiler)
-endif ()
diff --git a/app/benchmark_pipeline/main.cpp b/app/benchmark_pipeline/main.cpp
deleted file mode 100644
index 6752d17..0000000
--- a/app/benchmark_pipeline/main.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-static constexpr int INPUT_SIZE = 8192;
-typedef std::vector<std::complex<double>> complex_vector;
-
-using namespace pls::dataflow;
-
-void divide(complex_vector::iterator data, int n) {
-  complex_vector tmp_odd_elements(n / 2);
-  for (int i = 0; i < n / 2; i++) {
-    tmp_odd_elements[i] = data[i * 2 + 1];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i] = data[i * 2];
-  }
-  for (int i = 0; i < n / 2; i++) {
-    data[i + n / 2] = tmp_odd_elements[i];
-  }
-}
-
-void combine(complex_vector::iterator data, int n) {
-  for (int i = 0; i < n / 2; i++) {
-    std::complex<double> even = data[i];
-    std::complex<double> odd = data[i + n / 2];
-
-    // w is the "twiddle-factor".
-    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
-    // so it won't impact the performance comparison.
-    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));
-
-    data[i] = even + w * odd;
-    data[i + n / 2] = even - w * odd;
-  }
-}
-
-void fft(complex_vector::iterator data, int n) {
-  if (n < 2) {
-    return;
-  }
-
-  divide(data, n);
-  fft(data, n / 2);
-  fft(data + n / 2, n / 2);
-  combine(data, n);
-}
-
-complex_vector prepare_input(int input_size) {
-  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
-  complex_vector data(input_size);
-
-  // Set our input data to match a time series of the known_frequencies.
-  // When applying fft to this time-series we should find these frequencies.
-  for (int i = 0; i < input_size; i++) {
-    data[i] = std::complex<double>(0.0, 0.0);
-    for (auto frequencie : known_frequencies) {
-      data[i] += sin(2 * M_PI * frequencie * i / input_size);
-    }
-  }
-
-  return data;
-}
-
-int main() {
-  PROFILE_ENABLE
-  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
-  pls::scheduler scheduler{&my_scheduler_memory, 4};
-
-  graph<inputs<int>, outputs<int>> graph;
-  std::atomic<int> count{0};
-  auto lambda = [&](const int &in, int &out) {
-    PROFILE_WORK_BLOCK("Work Lambda")
-    auto tmp = in;
-    out = tmp;
-    complex_vector input = prepare_input(INPUT_SIZE);
-    fft(input.begin(), input.size());
-    count++;
-  };
-  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_1{lambda};
-  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_2{lambda};
-  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_3{lambda};
-  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_4{lambda};
-
-  graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph;
-  graph.build();
-
-  const int num_elements = 10;
-  std::vector<std::tuple<int>> results(num_elements);
-
-  pls::internal::helpers::run_mini_benchmark([&] {
-    PROFILE_WORK_BLOCK("Top Level")
-    for (int j = 0; j < num_elements; j++) {
-      graph.run(std::tuple<int>{j}, &results[j]);
-    }
-    pls::scheduler::wait_for_all();
-  }, 8, 1000);
-
-  PROFILE_SAVE("test_profile.prof")
-}
-
-//int main() {
-//  PROFILE_ENABLE
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
-//  pls::scheduler scheduler{&my_scheduler_memory, 4};
-//
-//  graph<inputs<int>, outputs<int>> graph;
-//  std::atomic<int> count{0};
-//  auto lambda = [&](const int &in, int &out) {
-//    PROFILE_WORK_BLOCK("Work Lambda")
-//    out = in;
-//    complex_vector input = prepare_input(INPUT_SIZE);
-//    fft(input.begin(), input.size());
-//    count++;
-//  };
-//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_1{lambda};
-//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_2{lambda};
-//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_3{lambda};
-//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_4{lambda};
-//
-//  graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph;
-//  graph.build();
-//
-//  const int num_elements = 10;
-//  std::vector<std::tuple<int>> results(num_elements);
-//
-//  scheduler.perform_work([&] {
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 10; i++) {
-//      PROFILE_WORK_BLOCK("Top Level")
-//      for (int j = 0; j < num_elements; j++) {
-//        graph.run(std::tuple<int>{j}, &results[j]);
-//      }
-//      pls::scheduler::wait_for_all();
-//    }
-//  });
-//
-//  std::cout << count << std::endl;
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
diff --git a/app/benchmark_prefix/CMakeLists.txt b/app/benchmark_prefix/CMakeLists.txt
deleted file mode 100644
index f4f705b..0000000
--- a/app/benchmark_prefix/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_executable(benchmark_prefix main.cpp)
-target_link_libraries(benchmark_prefix pls)
-if (EASY_PROFILER)
-    target_link_libraries(benchmark_prefix easy_profiler)
-endif ()
diff --git a/app/benchmark_prefix/main.cpp b/app/benchmark_prefix/main.cpp
deleted file mode 100644
index a7cd7be..0000000
--- a/app/benchmark_prefix/main.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include
-#include
-#include
-
-#include
-#include
-#include
-
-static constexpr int INPUT_SIZE = 10e7;
-
-int main() {
-  PROFILE_ENABLE
-  std::vector<double> vec(INPUT_SIZE, 1);
-  std::vector<double> out(INPUT_SIZE);
-
-  for (int i = 0; i < INPUT_SIZE; i++) {
-    vec[i] = i;
-  }
-
-  pls::internal::helpers::run_mini_benchmark([&] {
-    pls::scan(vec.begin(), vec.end(), out.begin(), std::plus<double>(), 0.0);
-  }, 8, 1000);
-
-  PROFILE_SAVE("test_profile.prof")
-}
-
-//int main() {
-//  PROFILE_ENABLE
-//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
-//  pls::scheduler scheduler{&my_scheduler_memory, 8};
-//
-//  std::vector<double> vec(INPUT_SIZE, 1);
-//  std::vector<double> out(INPUT_SIZE);
-//
-//  for (int i = 0; i < INPUT_SIZE; i++) {
-//    vec[i] = 1;
-//  }
-//
-//  scheduler.perform_work([&] {
-//    PROFILE_MAIN_THREAD
-//    for (int i = 0; i < 100; i++) {
-//      pls::scan(vec.begin(), vec.end(), out.begin(), std::plus<double>(), 0.0);
-//    }
-//  });
-//
-//  PROFILE_SAVE("test_profile.prof")
-//}
diff --git a/app/playground/CMakeLists.txt b/app/playground/CMakeLists.txt
index 34b4ede..b825f68 100644
--- a/app/playground/CMakeLists.txt
+++ b/app/playground/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_executable(playground
-        barrier.h barrier.cpp
         main.cpp)
 
 # Example for adding the library to your app (as a cmake project dependency)
-target_link_libraries(playground pls context_switcher Threads::Threads)
+target_link_libraries(playground pls Threads::Threads)
diff --git a/app/playground/barrier.cpp b/app/playground/barrier.cpp
deleted file mode 100644
index 78b8ba2..0000000
--- a/app/playground/barrier.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "barrier.h"
-
-barrier::barrier(const unsigned int count) : barrier_{} {
-  pthread_barrier_init(&barrier_, nullptr, count);
-}
-
-barrier::~barrier() {
-  pthread_barrier_destroy(&barrier_);
-}
-
-void barrier::wait() {
-  pthread_barrier_wait(&barrier_);
-}
diff --git a/app/playground/barrier.h b/app/playground/barrier.h
deleted file mode 100644
index 88b5c0e..0000000
--- a/app/playground/barrier.h
+++ /dev/null
@@ -1,27 +0,0 @@
-
-#ifndef PLS_BARRIER_H
-#define PLS_BARRIER_H
-
-#include <pthread.h>
-
-
-/**
- * Provides standard barrier behaviour.
- * `count` threads have to call `wait()` before any of the `wait()` calls returns,
- * thus blocking all threads until everyone reached the barrier.
- *
- * PORTABILITY:
- * Current implementation is based on pthreads.
- */
-class barrier {
-  pthread_barrier_t barrier_;
-
- public:
-  explicit barrier(unsigned int count);
-  ~barrier();
-
-  void wait();
-};
-
-
-#endif //PLS_BARRIER_H
diff --git a/app/playground/main.cpp b/app/playground/main.cpp
index 2ca61ee..d6e3b6c 100644
--- a/app/playground/main.cpp
+++ b/app/playground/main.cpp
@@ -1,42 +1,4 @@
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include "tsan_support.h"
-
-using namespace std;
-
-long count_memory_mappings() {
-  pid_t my_pid = getpid();
-  ifstream proc_file{"/proc/" + to_string(my_pid) + "/maps"};
-
-  string line;
-  long line_count{0};
-  while (getline(proc_file, line)) {
-    line_count++;
-  }
-
-  return line_count;
-}
-
 int main() {
-  mutex mut;
-
-  int count = 0;
-  while (true) {
-    printf("iteration: %d, mappings: %ld\n", count++, count_memory_mappings());
-    void *main_fiber = __tsan_get_current_fiber();
-    void *other_fiber = __tsan_create_fiber(0);
-    __tsan_switch_to_fiber(other_fiber, 0);
-    mut.lock();
-    mut.unlock();
-    __tsan_switch_to_fiber(main_fiber, 0);
-    __tsan_destroy_fiber(other_fiber);
-
-  }
   return 0;
 }
diff --git a/app/playground/tsan_support.h b/app/playground/tsan_support.h
deleted file mode 100644
index a5f60a0..0000000
--- a/app/playground/tsan_support.h
+++ /dev/null
@@ -1,21 +0,0 @@
-
-#ifndef CONTEXT_SWITCHER_TSAN_SUPPORT
-#define CONTEXT_SWITCHER_TSAN_SUPPORT
-
-extern "C" {
-// Fiber switching API.
-// - TSAN context for fiber can be created by __tsan_create_fiber
-//   and freed by __tsan_destroy_fiber.
-// - TSAN context of current fiber or thread can be obtained
-//   by calling __tsan_get_current_fiber.
-// - __tsan_switch_to_fiber should be called immediatly before switch
-//   to fiber, such as call of swapcontext.
-// - Fiber name can be set by __tsan_set_fiber_name.
-void *__tsan_get_current_fiber(void);
-void *__tsan_create_fiber(unsigned flags);
-void __tsan_destroy_fiber(void *fiber);
-void __tsan_switch_to_fiber(void *fiber, unsigned flags);
-void __tsan_set_fiber_name(void *fiber, const char *name);
-};
-
-#endif //CONTEXT_SWITCHER_TSAN_SUPPORT
diff --git a/app/test_for_new/CMakeLists.txt b/app/test_for_new/CMakeLists.txt
deleted file mode 100644
index 06a77bd..0000000
--- a/app/test_for_new/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_executable(test_for_new main.cpp)
-
-# Example for adding the library to your app (as a cmake project dependency)
-target_link_libraries(test_for_new pls)
diff --git a/app/test_for_new/main.cpp b/app/test_for_new/main.cpp
deleted file mode 100644
index 18596e1..0000000
--- a/app/test_for_new/main.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include
-#include
-
-using namespace pls::internal::base;
-
-int global = 0;
-
-int main() {
-  // Try to use every feature, to trigger the prohibited use of new if found somewhere
-  thread t1{[]() {}};
-  t1.join();
-}
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index e2f59f2..3311e62 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -25,7 +25,6 @@ add_library(pls STATIC
         include/pls/internal/helpers/prohibit_new.h
         include/pls/internal/helpers/profiler.h
-        include/pls/internal/helpers/mini_benchmark.h
         include/pls/internal/helpers/unique_id.h
         include/pls/internal/helpers/range.h
         include/pls/internal/helpers/seqence.h
diff --git a/lib/pls/include/pls/internal/base/backoff.h b/lib/pls/include/pls/internal/base/backoff.h
index c78ef3e..f0625b2 100644
--- a/lib/pls/include/pls/internal/base/backoff.h
+++ b/lib/pls/include/pls/internal/base/backoff.h
@@ -4,10 +4,11 @@
 #include "pls/internal/base/system_details.h"
 #include "pls/internal/helpers/profiler.h"
-#include "pls/internal/base/thread.h"
 
 #include
-#include
+#include
+#include
+#include
 
 namespace pls::internal::base {
@@ -34,7 +35,8 @@ class backoff {
 
     if (current_ >= YELD_ITERS) {
       PROFILE_LOCK("Yield")
-      this_thread::sleep(5);
+      using namespace std::chrono_literals;
+      std::this_thread::sleep_for(5us);
     }
 
     current_ = std::min(current_ * 2, MAX_ITERS);
diff --git a/lib/pls/include/pls/internal/base/tas_spin_lock.h b/lib/pls/include/pls/internal/base/tas_spin_lock.h
index 74a11a5..e4438e6 100644
--- a/lib/pls/include/pls/internal/base/tas_spin_lock.h
+++ b/lib/pls/include/pls/internal/base/tas_spin_lock.h
@@ -7,8 +7,6 @@
 #include
 #include
 
-#include "pls/internal/base/thread.h"
-
 namespace pls {
 namespace internal {
 namespace base {
diff --git a/lib/pls/include/pls/internal/base/ttas_spin_lock.h b/lib/pls/include/pls/internal/base/ttas_spin_lock.h
index 787f772..abd67fd 100644
--- a/lib/pls/include/pls/internal/base/ttas_spin_lock.h
+++ b/lib/pls/include/pls/internal/base/ttas_spin_lock.h
@@ -5,7 +5,6 @@
 #include
 #include
-#include "pls/internal/base/thread.h"
 #include "pls/internal/base/backoff.h"
 
 namespace pls {
 namespace internal {
 namespace base {
diff --git a/lib/pls/include/pls/internal/helpers/mini_benchmark.h b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
deleted file mode 100644
index 0b7fa63..0000000
--- a/lib/pls/include/pls/internal/helpers/mini_benchmark.h
+++ /dev/null
@@ -1,70 +0,0 @@
-
-#ifndef PLS_MINI_BENCHMARK_H
-#define PLS_MINI_BENCHMARK_H
-
-#include "pls/internal/scheduling/scheduler_memory.h"
-#include "pls/internal/scheduling/scheduler.h"
-
-#include
-#include
-
-namespace pls {
-namespace internal {
-namespace helpers {
-
-// TODO: Clean up (separate into small functions and .cpp file)
-template<typename Function>
-void run_mini_benchmark(const Function &lambda,
-                        size_t max_threads,
-                        unsigned long max_runtime_ms = 1000,
-                        unsigned long warmup_time_ms = 100) {
-  using namespace std;
-  using namespace pls::internal::scheduling;
-
-  malloc_scheduler_memory scheduler_memory{max_threads, 2u << 17u};
-  for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
-    scheduler local_scheduler{&scheduler_memory, num_threads};
-
-    chrono::high_resolution_clock::time_point start_time;
-    chrono::high_resolution_clock::time_point end_time;
-    long max_local_time = 0;
-    long total_time = 0;
-    long iterations = 0;
-
-    local_scheduler.perform_work([&] {
-      start_time = chrono::high_resolution_clock::now();
-      end_time = start_time;
-      chrono::high_resolution_clock::time_point planned_end_time = start_time + chrono::milliseconds(max_runtime_ms);
-      chrono::high_resolution_clock::time_point planned_warmup_time = start_time + chrono::milliseconds(warmup_time_ms);
-
-      while (end_time < planned_end_time) {
-        if (end_time < planned_warmup_time) {
-          lambda();
-        } else {
-          auto local_start_time = chrono::high_resolution_clock::now();
-          lambda();
-          auto local_end_time = chrono::high_resolution_clock::now();
-          long local_time = chrono::duration_cast(local_end_time - local_start_time).count();
-
-          total_time += local_time;
-          max_local_time = std::max(local_time, max_local_time);
-          iterations++;
-        }
-        end_time = chrono::high_resolution_clock::now();
-      }
-    });
-    double time_per_iteration = (double) total_time / iterations;
-
-    std::cout << (long) time_per_iteration << " (" << max_local_time << ")";
-    if (num_threads < max_threads) {
-      std::cout << "\t\t";
-    }
-  }
-  std::cout << std::endl;
-}
-
-}
-}
-}
-
-#endif //PLS_MINI_BENCHMARK_H