diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1afba5a..be1f6ac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,7 @@ add_subdirectory(lib/pls)
 add_subdirectory(app/playground)
 add_subdirectory(app/test_for_new)
 add_subdirectory(app/invoke_parallel)
+add_subdirectory(app/benchmark_fft)
 
 # Add optional tests
 option(PACKAGE_TESTS "Build the tests" ON)
diff --git a/app/benchmark_fft/CMakeLists.txt b/app/benchmark_fft/CMakeLists.txt
new file mode 100644
index 0000000..41591e5
--- /dev/null
+++ b/app/benchmark_fft/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_executable(benchmark_fft main.cpp)  # benchmark executable built from main.cpp alone
+target_link_libraries(benchmark_fft pls)
+if(EASY_PROFILER)  # link easy_profiler only when EASY_PROFILER evaluates to true
+    target_link_libraries(benchmark_fft easy_profiler)
+endif()
diff --git a/app/benchmark_fft/main.cpp b/app/benchmark_fft/main.cpp
new file mode 100644
index 0000000..33cbd88
--- /dev/null
+++ b/app/benchmark_fft/main.cpp
@@ -0,0 +1,87 @@
+#include <pls/pls.h>
+#include <pls/internal/helpers/profiler.h>
+#include <pls/internal/helpers/mini_benchmark.h>
+
+#include <iostream>
+#include <complex>
+#include <vector>
+
+static constexpr int CUTOFF = 10;  // fft() recurses serially on ranges of at most CUTOFF elements
+static constexpr int NUM_ITERATIONS = 1000;
+static constexpr int INPUT_SIZE = 2048;  // must be a power of two: fft() halves the range with integer division
+typedef std::vector<std::complex<double>> complex_vector;
+
+// Reorders data[0, n) in place: even-indexed elements first, then the odd-indexed ones.
+void divide(complex_vector::iterator data, int n) {
+    const int half = n / 2;
+    complex_vector odd_part(half);  // scratch copy of the odd-indexed elements
+    for (int k = 0; k < half; ++k) {
+        odd_part[k] = data[2 * k + 1];
+        data[k] = data[2 * k];  // later reads use indices above k, so nothing unread is lost
+    }
+    for (int k = 0; k < half; ++k) {
+        data[half + k] = odd_part[k];
+    }
+}
+
+// For every i < n/2, replaces data[i] and data[i + n/2] with even + w*odd and even - w*odd.
+void combine(complex_vector::iterator data, int n) {
+    const int half = n / 2;
+    for (int k = 0; k < half; ++k) {
+        // w is the "twiddle-factor".
+        // It could be cached, but the parallel and serial runs share this 'base' algorithm,
+        // so recomputing it does not distort the performance comparison.
+        const std::complex<double> w = exp(std::complex<double>(0, -2. * M_PI * k / n));
+        const std::complex<double> even = data[k];
+        const std::complex<double> scaled_odd = w * data[k + half];
+        data[k] = even + scaled_odd;
+        data[k + half] = even - scaled_odd;
+    }
+}
+
+// In-place recursive FFT over data[0, n): reorder, transform both halves, then combine.
+// Ranges longer than CUTOFF run their halves through pls::invoke_parallel, others serially.
+void fft(complex_vector::iterator data, int n) {
+    if (n < 2) {
+        return;
+    }
+    const int half = n / 2;
+    const complex_vector::iterator upper_half = data + half;
+    divide(data, n);
+    if (n > CUTOFF) {
+        pls::invoke_parallel([&] { fft(data, half); }, [&] { fft(upper_half, half); });
+    } else {
+        fft(data, half);
+        fft(upper_half, half);
+    }
+    combine(data, n);
+}
+
+// Builds input_size real-valued samples, each the sum of one sine wave per known frequency.
+complex_vector prepare_input(int input_size) {
+    std::vector<double> known_frequencies{2, 11, 52, 88, 256};
+    complex_vector samples(input_size);
+
+    // The time series is meant to let fft recover exactly the known_frequencies.
+    for (int t = 0; t < input_size; ++t) {
+        double amplitude = 0.0;
+        for (double frequency : known_frequencies) {
+            amplitude += sin(2 * M_PI * frequency * t / input_size);
+        }
+        samples[t] = std::complex<double>(amplitude, 0.0);
+    }
+    return samples;
+}
+
+
+// Benchmarks fft on a fresh copy of one prepared input, passing 8 as the maximum thread count.
+int main() {
+    PROFILE_ENABLE
+    const complex_vector pristine_input = prepare_input(INPUT_SIZE);
+
+    pls::internal::helpers::run_mini_benchmark([&] {
+        complex_vector working_copy = pristine_input;  // fft transforms its argument in place
+        fft(working_copy.begin(), static_cast<int>(working_copy.size()));
+    }, 8);
+    PROFILE_SAVE("test_profile.prof")
+}
diff --git a/lib/pls/CMakeLists.txt b/lib/pls/CMakeLists.txt
index 758a947..6cee207 100644
--- a/lib/pls/CMakeLists.txt
+++ b/lib/pls/CMakeLists.txt
@@ -17,7 +17,8 @@ add_library(pls STATIC
             src/algorithms/invoke_parallel.cpp include/pls/algorithms/invoke_parallel.h
             include/pls/internal/base/error_handling.h
             include/pls/internal/scheduling/scheduler_memory.h src/internal/scheduling/scheduler_memory.cpp
-        include/pls/internal/helpers/profiler.h)
+            include/pls/internal/helpers/profiler.h
+            include/pls/internal/helpers/mini_benchmark.h)
 
 # Add everything in `./include` to be in the include path of this project
 target_include_directories(pls
diff --git a/lib/pls/include/pls/internal/helpers/mini_benchmark.h b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
new file mode 100644
index 0000000..c3742ad
--- /dev/null
+++ b/lib/pls/include/pls/internal/helpers/mini_benchmark.h
@@ -0,0 +1,53 @@
+
+#ifndef PLS_MINI_BENCHMARK_H
+#define PLS_MINI_BENCHMARK_H
+
+#include "pls/internal/scheduling/scheduler_memory.h"
+#include "pls/internal/scheduling/scheduler.h"
+
+#include <chrono>
+#include <iostream>
+
+namespace pls {
+    namespace internal {
+        namespace helpers {
+            // TODO: Clean up (separate into small functions and .cpp file)
+            // Runs `lambda` repeatedly inside a scheduler built for 1, 2, ..., max_threads threads and
+            // prints the mean microseconds per call of each configuration as one comma-separated line.
+            // Every configuration runs for roughly max_runtime_ms milliseconds and performs at
+            // least one call, so the mean is well defined even when max_runtime_ms <= 0.
+            template<typename Function>
+            void run_mini_benchmark(const Function& lambda, size_t max_threads, long max_runtime_ms=1000) {
+                using namespace std;
+                using namespace pls::internal::scheduling;
+
+                malloc_scheduler_memory scheduler_memory{max_threads};
+                for (unsigned int num_threads = 1; num_threads <= max_threads; num_threads++) {
+                    scheduler local_scheduler{&scheduler_memory, num_threads};
+
+                    chrono::high_resolution_clock::time_point start_time;
+                    chrono::high_resolution_clock::time_point end_time;
+                    unsigned long iterations = 0;
+                    local_scheduler.perform_work([&] {
+                        start_time = chrono::high_resolution_clock::now();
+                        const chrono::high_resolution_clock::time_point planned_end_time = start_time + chrono::milliseconds(max_runtime_ms);
+                        do {  // do-while: the first call always runs, so iterations is never 0 below
+                            lambda();
+                            end_time = chrono::high_resolution_clock::now();
+                            iterations++;
+                        } while (end_time < planned_end_time);
+                    });
+
+                    const auto elapsed_us = chrono::duration_cast<chrono::microseconds>(end_time - start_time).count();
+                    std::cout << static_cast<double>(elapsed_us) / iterations;
+                    if (num_threads < max_threads) {
+                        std::cout << ", ";
+                    }
+                }
+                std::cout << std::endl;
+            }
+        }
+    }
+}
+
+#endif //PLS_MINI_BENCHMARK_H