diff --git a/app/benchmark_matrix/main.cpp b/app/benchmark_matrix/main.cpp
index c6fc793..ed6d874 100644
--- a/app/benchmark_matrix/main.cpp
+++ b/app/benchmark_matrix/main.cpp
@@ -2,6 +2,8 @@
 #include
 #include

+#include <chrono>
+
 const int MATRIX_SIZE = 128;

 template
@@ -58,8 +60,8 @@ int main() {
 //int main() {
 //    PROFILE_ENABLE
-//    pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18};
-//    pls::scheduler scheduler{&my_scheduler_memory, 8};
+//    pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
+//    pls::scheduler scheduler{&my_scheduler_memory, 4};
 //
 //    matrix a;
 //    matrix b;
@@ -67,13 +69,17 @@ int main() {
 //    fill_with_data(a, b);
 //
 //    scheduler.perform_work([&] {
+//        auto start_time = std::chrono::high_resolution_clock::now();
 //        PROFILE_MAIN_THREAD
-//        for (int i = 0; i < 5000; i++) {
+//        for (int i = 0; i < 10000; i++) {
 //            PROFILE_WORK_BLOCK("Top Level")
 //            result.multiply(a, b);
 //        }
+//        auto end_time = std::chrono::high_resolution_clock::now();
+//        long time = std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+//        std::cout << "Runtime: " << time << "us" << std::endl;
 //    });
 //
 //    PROFILE_SAVE("test_profile.prof")
 //}
-//
+
diff --git a/lib/pls/include/pls/internal/helpers/mini_benchmark.h b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
index 5a5ecc9..0868cf6 100644
--- a/lib/pls/include/pls/internal/helpers/mini_benchmark.h
+++ b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
@@ -14,36 +14,46 @@ namespace helpers {
 // TODO: Clean up (separate into small functions and .cpp file)
 template<typename Function>
-void run_mini_benchmark(const Function &lambda, size_t max_threads, unsigned long max_runtime_ms = 1000) {
+void run_mini_benchmark(const Function &lambda,
+                        size_t max_threads,
+                        unsigned long max_runtime_ms = 1000,
+                        unsigned long warmup_time_ms = 100) {
   using namespace std;
   using namespace pls::internal::scheduling;

-  malloc_scheduler_memory scheduler_memory{max_threads, 2 << 12};
+  malloc_scheduler_memory scheduler_memory{max_threads, 2u << 14};
   for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
     scheduler local_scheduler{&scheduler_memory, num_threads};

     chrono::high_resolution_clock::time_point start_time;
     chrono::high_resolution_clock::time_point end_time;
     long max_local_time = 0;
-    unsigned long iterations = 0;
+    long total_time = 0;
+    long iterations = 0;
+
     local_scheduler.perform_work([&] {
       start_time = chrono::high_resolution_clock::now();
       end_time = start_time;
       chrono::high_resolution_clock::time_point planned_end_time = start_time + chrono::milliseconds(max_runtime_ms);
+      chrono::high_resolution_clock::time_point planned_warmup_time = start_time + chrono::milliseconds(warmup_time_ms);

       while (end_time < planned_end_time) {
-        auto local_start_time = chrono::high_resolution_clock::now();
-        lambda();
-        auto local_end_time = chrono::high_resolution_clock::now();
-        long local_time = chrono::duration_cast<chrono::microseconds>(local_end_time - local_start_time).count();
-        max_local_time = std::max(local_time, max_local_time);
+        if (end_time < planned_warmup_time) {
+          lambda();
+        } else {
+          auto local_start_time = chrono::high_resolution_clock::now();
+          lambda();
+          auto local_end_time = chrono::high_resolution_clock::now();
+          long local_time = chrono::duration_cast<chrono::microseconds>(local_end_time - local_start_time).count();
+
+          total_time += local_time;
+          max_local_time = std::max(local_time, max_local_time);
+          iterations++;
+        }
         end_time = chrono::high_resolution_clock::now();
-        iterations++;
       }
     });
-
-    long time = chrono::duration_cast<chrono::microseconds>(end_time - start_time).count();
-    double time_per_iteration = (double) time / iterations;
+    double time_per_iteration = (double) total_time / iterations;

     std::cout << (long) time_per_iteration << " (" << max_local_time << ")";
     if (num_threads < max_threads) {
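
Usage sketch (not part of the diff): with the extended signature, a caller now passes an explicit warmup window in addition to the measured runtime, and only post-warmup iterations contribute to the reported average and maximum. The include path and namespace below follow the header's location in this diff; the workload lambda and the thread count are hypothetical placeholders.

    #include <pls/internal/helpers/mini_benchmark.h>

    int main() {
      // Hypothetical workload; any lambda running one iteration of the
      // parallel computation under test fits here.
      auto workload = [] { /* ... perform the benchmarked work ... */ };

      // Sweep 1..8 threads, measure for 1000 ms after a 100 ms warmup
      // (100 ms is the new default for warmup_time_ms).
      pls::internal::helpers::run_mini_benchmark(workload, 8, 1000, 100);
      return 0;
    }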