// main.cpp

#include <pls/pls.h>
#include <pls/internal/helpers/profiler.h>

#include <iostream>
#include <complex>
#include <vector>

namespace {

// Sub-problem size at (and below) which fft() recurses serially instead of
// handing the two halves to pls::invoke_parallel.
constexpr int CUTOFF = 16;

// NOTE(review): not referenced anywhere in the visible source; the benchmark
// loop in main() uses a literal iteration count instead - confirm intent.
constexpr int NUM_ITERATIONS = 1000;

// Number of samples generated by prepare_input() for the benchmark.
// NOTE(review): 2064 is not a power of two, so the repeated n / 2 halving in
// fft() reaches an odd size (129) where the division truncates - confirm
// whether 2048 was intended.
constexpr int INPUT_SIZE = 2064;

}  // namespace

// Sample buffer type shared by all FFT helpers in this file.
using complex_vector = std::vector<std::complex<double>>;

// Reorders the first 2 * (n / 2) elements starting at `data` so that the
// even-indexed elements occupy [0, n / 2) and the odd-indexed elements occupy
// [n / 2, 2 * (n / 2)), each group keeping its relative order.
//
// data: iterator to the first element of the range to reorder (modified in place).
// n:    number of elements in the range; n / 2 uses integer division.
void divide(complex_vector::iterator data, int n) {
  const int half = n / 2;
  complex_vector odd_values(half);

  // Stash every odd element and compact the even ones towards the front in a
  // single pass. Writing slot k is safe: all later reads are at indices >= 2k + 2.
  for (int k = 0; k < half; ++k) {
    odd_values[k] = data[2 * k + 1];
    data[k] = data[2 * k];
  }

  // Place the stashed odd elements directly after the even ones.
  for (int k = 0; k < half; ++k) {
    data[half + k] = odd_values[k];
  }
}

// Merges two already-transformed halves stored back to back at `data` into
// one result of length 2 * (n / 2), in place.
//
// For every k in [0, n / 2) the pair (data[k], data[k + n / 2]) is replaced by
// (lower + w * upper, lower - w * upper) with w = exp(-2 * pi * i * k / n).
//
// data: iterator to the first element of the lower half (modified in place).
// n:    combined length; n / 2 uses integer division.
void combine(complex_vector::iterator data, int n) {
  const int half = n / 2;
  for (int k = 0; k < half; ++k) {
    // Twiddle factor for output bin k. It is recomputed on every call rather
    // than cached; the serial and parallel paths both go through this code,
    // so the extra cost is the same for both.
    const std::complex<double> twiddle =
        std::exp(std::complex<double>(0, -2. * M_PI * k / n));

    const std::complex<double> lower = data[k];
    const std::complex<double> rotated_upper = twiddle * data[k + half];

    data[k] = lower + rotated_upper;
    data[k + half] = lower - rotated_upper;
  }
}

// Recursive in-place FFT over the n elements starting at `data`.
//
// Each level splits the range into even/odd-indexed halves (divide), transforms
// both halves recursively, and merges them with twiddle factors (combine).
// Sub-problems larger than CUTOFF hand their two halves to pls::invoke_parallel;
// sub-problems of size CUTOFF or smaller recurse serially.
//
// data: iterator to the first element of the range (modified in place).
// n:    number of elements; halves are computed with integer division, so for
//       odd n the last element is not touched by divide/combine at that level.
void fft(complex_vector::iterator data, int n) {
  // Ranges of fewer than two elements are left unchanged; this ends the recursion.
  if (n < 2) {
    return;
  }

  PROFILE_WORK_BLOCK("Divide")
  divide(data, n);
  PROFILE_END_BLOCK
  PROFILE_WORK_BLOCK("Invoke Parallel")
  if (n == CUTOFF) {
    // Same serial recursion as the `n <= CUTOFF` branch below; the only
    // difference is the extra "FFT Serial" profiling block opened exactly at
    // the serial/parallel boundary.
    // NOTE(review): there is no PROFILE_END_BLOCK for "FFT Serial" in this
    // branch; whether the block is closed at scope exit depends on the macro
    // definition - confirm.
    PROFILE_WORK_BLOCK("FFT Serial")
    fft(data, n / 2);
    fft(data + n / 2, n / 2);
  } else if (n <= CUTOFF) {
    // Only reached for n < CUTOFF, because n == CUTOFF is handled above.
    fft(data, n / 2);
    fft(data + n / 2, n / 2);
  } else {
    // Both lambdas capture `data` and `n` by reference and work on disjoint
    // halves of the range.
    // NOTE(review): presumably invoke_parallel returns only after both lambdas
    // have finished, since combine() below reads both halves - confirm against
    // the pls API.
    pls::invoke_parallel(
        [&] { fft(data, n / 2); },
        [&] { fft(data + n / 2, n / 2); }
    );
  }
  PROFILE_END_BLOCK
  PROFILE_WORK_BLOCK("Combine")
  combine(data, n);
  PROFILE_END_BLOCK
}

// Generates the benchmark signal: `input_size` purely real samples, each the
// sum of unit-amplitude sine waves completing 2, 11, 52, 88 and 256 cycles
// over the whole buffer. Transforming this series should reveal exactly those
// frequencies.
//
// input_size: number of samples to generate.
// Returns the samples as complex values with zero imaginary part.
complex_vector prepare_input(int input_size) {
  const std::vector<double> tones{2, 11, 52, 88, 256};
  complex_vector samples(input_size);

  for (int t = 0; t < input_size; ++t) {
    // Sum the contribution of every tone at sample position t.
    double amplitude = 0.0;
    for (const double tone : tones) {
      amplitude += std::sin(2 * M_PI * tone * t / input_size);
    }
    samples[t] = std::complex<double>(amplitude, 0.0);
  }

  return samples;
}

// Benchmark driver: builds one input signal, runs the FFT on fresh copies of
// it inside the pls scheduler, and saves the collected profile.
int main() {
  PROFILE_ENABLE
  // NOTE(review): presumably 8 is the number of worker threads and
  // 2u << 14 (= 32768) the amount of memory reserved per thread - confirm
  // against the pls::malloc_scheduler_memory constructor.
  pls::malloc_scheduler_memory my_scheduler_memory{8, 2u << 14};
  // NOTE(review): the second argument presumably has to match the thread count
  // used for the scheduler memory above - confirm.
  pls::scheduler scheduler{&my_scheduler_memory, 8};

  complex_vector initial_input = prepare_input(INPUT_SIZE);
  scheduler.perform_work([&] {
    PROFILE_MAIN_THREAD
    // The fft() call itself looks the same as in serial code; the only
    // requirement is that it is enclosed in the perform_work lambda.
    // NOTE(review): the iteration count is the literal 10, while the file-level
    // constant NUM_ITERATIONS (1000) is unused - confirm which is intended.
    for (int i = 0; i < 10; i++) {
      PROFILE_WORK_BLOCK("Top Level FFT")
      // fft() works in place, so every iteration starts from a fresh copy of
      // the untouched input.
      complex_vector input = initial_input;
      // input.size() (an unsigned size type) is implicitly converted to the
      // int parameter of fft().
      fft(input.begin(), input.size());
    }
  });

  // NOTE(review): presumably writes the collected profiling data to this file
  // in the working directory - confirm against the profiler macro definition.
  PROFILE_SAVE("test_profile.prof")
}