main.cpp 4.35 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
#include <pls/pls.h>
#include <pls/dataflow/dataflow.h>
#include <pls/internal/helpers/profiler.h>
#include <pls/internal/helpers/mini_benchmark.h>

#include <iostream>
#include <complex>
#include <vector>
#include <tuple>
#include <atomic>

static constexpr int INPUT_SIZE = 8192;
typedef std::vector<std::complex<double>> complex_vector;

using namespace pls::dataflow;

void divide(complex_vector::iterator data, int n) {
  complex_vector tmp_odd_elements(n / 2);
  for (int i = 0; i < n / 2; i++) {
    tmp_odd_elements[i] = data[i * 2 + 1];
  }
  for (int i = 0; i < n / 2; i++) {
    data[i] = data[i * 2];
  }
  for (int i = 0; i < n / 2; i++) {
    data[i + n / 2] = tmp_odd_elements[i];
  }
}

void combine(complex_vector::iterator data, int n) {
  for (int i = 0; i < n / 2; i++) {
    std::complex<double> even = data[i];
    std::complex<double> odd = data[i + n / 2];

    // w is the "twiddle-factor".
    // this could be cached, but we run the same 'data_structures' algorithm parallel/serial,
    // so it won't impact the performance comparison.
    std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * i / n));

    data[i] = even + w * odd;
    data[i + n / 2] = even - w * odd;
  }
}

void fft(complex_vector::iterator data, int n) {
  if (n < 2) {
    return;
  }

  divide(data, n);
  fft(data, n / 2);
  fft(data + n / 2, n / 2);
  combine(data, n);
}

complex_vector prepare_input(int input_size) {
  std::vector<double> known_frequencies{2, 11, 52, 88, 256};
  complex_vector data(input_size);

  // Set our input data to match a time series of the known_frequencies.
  // When applying fft to this time-series we should find these frequencies.
  for (int i = 0; i < input_size; i++) {
    data[i] = std::complex<double>(0.0, 0.0);
    for (auto frequencie : known_frequencies) {
      data[i] += sin(2 * M_PI * frequencie * i / input_size);
    }
  }

  return data;
}

int main() {
  PROFILE_ENABLE
  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
  pls::scheduler scheduler{&my_scheduler_memory, 4};

  graph<inputs<int>, outputs<int>> graph;
  std::atomic<int> count{0};
  auto lambda = [&](const int &in, int &out) {
    PROFILE_WORK_BLOCK("Work Lambda")
    auto tmp = in;
    out = tmp;
    complex_vector input = prepare_input(INPUT_SIZE);
    fft(input.begin(), input.size());
    count++;
  };
  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_1{lambda};
  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_2{lambda};
  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_3{lambda};
  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_4{lambda};

  graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph;
  graph.build();

  const int num_elements = 10;
  std::vector<std::tuple<int>> results(num_elements);

  pls::internal::helpers::run_mini_benchmark([&] {
    PROFILE_WORK_BLOCK("Top Level")
    for (int j = 0; j < num_elements; j++) {
      graph.run(std::tuple<int>{j}, &results[j]);
    }
    pls::scheduler::wait_for_all();
  }, 8, 1000);

  PROFILE_SAVE("test_profile.prof")
}

//int main() {
//  PROFILE_ENABLE
//  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 18u};
//  pls::scheduler scheduler{&my_scheduler_memory, 4};
//
//  graph<inputs<int>, outputs<int>> graph;
//  std::atomic<int> count{0};
//  auto lambda = [&](const int &in, int &out) {
//    PROFILE_WORK_BLOCK("Work Lambda")
//    out = in;
//    complex_vector input = prepare_input(INPUT_SIZE);
//    fft(input.begin(), input.size());
//    count++;
//  };
//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_1{lambda};
//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_2{lambda};
//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_3{lambda};
//  function_node<inputs<int>, outputs<int>, decltype(lambda)> step_4{lambda};
//
//  graph >> step_1 >> step_2 >> step_3 >> step_4 >> graph;
//  graph.build();
//
//  const int num_elements = 10;
//  std::vector<std::tuple<int>> results(num_elements);
//
//  scheduler.perform_work([&] {
//    PROFILE_MAIN_THREAD
//    for (int i = 0; i < 10; i++) {
//      PROFILE_WORK_BLOCK("Top Level")
//      for (int j = 0; j < num_elements; j++) {
//        graph.run(std::tuple<int>{j}, &results[j]);
//      }
//      pls::scheduler::wait_for_all();
//    }
//  });
//
//  std::cout << count << std::endl;
//
//  PROFILE_SAVE("test_profile.prof")
//}