#include #include #include #include #include #include #include #include #include static constexpr int INPUT_SIZE = 8192; typedef std::vector> complex_vector; using namespace pls::dataflow; void divide(complex_vector::iterator data, int n) { complex_vector tmp_odd_elements(n / 2); for (int i = 0; i < n / 2; i++) { tmp_odd_elements[i] = data[i * 2 + 1]; } for (int i = 0; i < n / 2; i++) { data[i] = data[i * 2]; } for (int i = 0; i < n / 2; i++) { data[i + n / 2] = tmp_odd_elements[i]; } } void combine(complex_vector::iterator data, int n) { for (int i = 0; i < n / 2; i++) { std::complex even = data[i]; std::complex odd = data[i + n / 2]; // w is the "twiddle-factor". // this could be cached, but we run the same 'data_structures' algorithm parallel/serial, // so it won't impact the performance comparison. std::complex w = exp(std::complex(0, -2. * M_PI * i / n)); data[i] = even + w * odd; data[i + n / 2] = even - w * odd; } } void fft(complex_vector::iterator data, int n) { if (n < 2) { return; } divide(data, n); fft(data, n / 2); fft(data + n / 2, n / 2); combine(data, n); } complex_vector prepare_input(int input_size) { std::vector known_frequencies{2, 11, 52, 88, 256}; complex_vector data(input_size); // Set our input data to match a time series of the known_frequencies. // When applying fft to this time-series we should find these frequencies. for (int i = 0; i < input_size; i++) { data[i] = std::complex(0.0, 0.0); for (auto frequencie : known_frequencies) { data[i] += sin(2 * M_PI * frequencie * i / input_size); } } return data; } int main() { PROFILE_ENABLE pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u}; pls::scheduler scheduler{&my_scheduler_memory, 4}; graph, outputs> graph; std::atomic count{0}; auto lambda = [&](const int &in, int &out) { PROFILE_WORK_BLOCK("Work Lambda") auto tmp = in; out = tmp; complex_vector input = prepare_input(INPUT_SIZE); fft(input.begin(), input.size()); count++; }; function_node, outputs, decltype(lambda)> step_1{lambda}; function_node, outputs, decltype(lambda)> step_2{lambda}; function_node, outputs, decltype(lambda)> step_3{lambda}; function_node, outputs, decltype(lambda)> step_4{lambda}; graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph; graph.build(); const int num_elements = 10; std::vector> results(num_elements); pls::internal::helpers::run_mini_benchmark([&] { PROFILE_WORK_BLOCK("Top Level") for (int j = 0; j < num_elements; j++) { graph.run(std::tuple{j}, &results[j]); } pls::scheduler::wait_for_all(); }, 8, 1000); PROFILE_SAVE("test_profile.prof") } //int main() { // PROFILE_ENABLE // pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u}; // pls::scheduler scheduler{&my_scheduler_memory, 4}; // // graph, outputs> graph; // std::atomic count{0}; // auto lambda = [&](const int &in, int &out) { // PROFILE_WORK_BLOCK("Work Lambda") // out = in; // complex_vector input = prepare_input(INPUT_SIZE); // fft(input.begin(), input.size()); // count++; // }; // function_node, outputs, decltype(lambda)> step_1{lambda}; // function_node, outputs, decltype(lambda)> step_2{lambda}; // function_node, outputs, decltype(lambda)> step_3{lambda}; // function_node, outputs, decltype(lambda)> step_4{lambda}; // // graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph; // graph.build(); // // const int num_elements = 10; // std::vector> results(num_elements); // // scheduler.perform_work([&] { // PROFILE_MAIN_THREAD // for (int i = 0; i < 10; i++) { // PROFILE_WORK_BLOCK("Top Level") // for (int j = 0; j < num_elements; j++) { // graph.run(std::tuple{j}, &results[j]); // } // pls::scheduler::wait_for_all(); // } // }); // // std::cout << count << std::endl; // // PROFILE_SAVE("test_profile.prof") //}