diff --git a/.gitignore b/.gitignore index 4c790d0..98f2979 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target /Cargo.lock .DS_Store +/.vscode \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index a6d8ad6..69cd0ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,10 @@ authors = ["Sven Vogel", "Felix L. Müller", "Elias Alexander", "Elias Schmidt"] png = "0.17.8" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" + +[dev-dependencies] +criterion = "0.5.1" + +[[bench]] +name = "multithreading" +harness = false \ No newline at end of file diff --git a/benches/multithreading.rs b/benches/multithreading.rs new file mode 100644 index 0000000..84a6014 --- /dev/null +++ b/benches/multithreading.rs @@ -0,0 +1,186 @@ +//! Benachmarking funcitonality for [Criterion.rs](https://github.com/bheisler/criterion.rs) +//! This benchmark will compare the performance of various thread pools launched with different amounts of +//! maximum threads. +//! Each thread will calculate a partial dot product of two different vectors composed of 1,000,000 64-bit +//! double precision floating point values. + +use std::{num::NonZeroUsize, sync::Arc}; + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; +use imsearch::multithreading::ThreadPool; + +/// Amount of elements per vector used to calculate the dot product +const VEC_ELEM_COUNT: usize = 1_000_000; +/// Number of threads to test +const THREAD_COUNTS: [usize; 17] = [ + 1, 2, 4, 6, 8, 10, 12, 16, 18, 20, 22, 26, 28, 32, 40, 56, 64, +]; +/// seeds used to scramble up the values produced by the hash function for each vector +/// these are just some pseudo random numbers +const VEC_SEEDS: [u64; 2] = [0xa3f8347abce16, 0xa273048ca9dea]; + +/// Compute the dot product of two vectors +/// # Panics +/// this function assumes both vectors to be of exactly the same length. +/// If this is not the case the function will panic. +fn dot(a: &[f64], b: &[f64]) -> f64 { + let mut sum = 0.0; + + for i in 0..a.len() { + sum += a[i] * b[i]; + } + + sum +} + +/// Computes the dot product using a thread pool with varying number of threads. The vectors will be both splitted into equally +/// sized slices which then get passed ot their own thread to compute the partial dot product. After all threads have +/// finished the partial dot products will be summed to create the final result. +fn dot_parallel(a: Arc>, b: Arc>, threads: usize) { + let mut pool = + ThreadPool::with_threads_and_drop_handles(NonZeroUsize::new(threads).unwrap(), true); + + // number of elements in each vector for each thread + let steps = a.len() / threads; + + for i in 0..threads { + // offset of the first element for the thread local vec + let chunk = i * steps; + // create a new strong reference to the vector + let aa = a.clone(); + let bb = b.clone(); + // launch a new thread + pool.enqueue(move || { + let a = &aa[chunk..(chunk + steps)]; + let b = &bb[chunk..(chunk + steps)]; + dot(a, b) + }); + } + + black_box( + // wait for the threads to finish + pool.join_all() + // iterate over the results and sum the parital dot products together + .into_iter() + .map(|r| r.unwrap()) + .reduce(|a, b| a + b), + ); +} + +/// Compute a simple hash value for the given index value. +/// This function will return a value between [0, 1]. +#[inline] +fn hash(x: f64) -> f64 { + ((x * 234.8743 + 3.8274).sin() * 87624.58376).fract() +} + +/// Create a vector filled with `size` elements of 64-bit floating point numbers +/// each initialized with the function `hash` and the given seed. The vector will +/// be filled with values between [0, 1]. +fn create_vec(size: usize, seed: u64) -> Arc> { + let mut vec = Vec::with_capacity(size); + + for i in 0..size { + vec.push(hash(i as f64 + seed as f64)); + } + + Arc::new(vec) +} + +/// Function for executing the thread pool benchmarks using criterion.rs. +/// It will create two different vectors and benchmark the single thread performance +/// as well as the multi threadded performance for varying amounts of threads. +pub fn bench_threadpool(c: &mut Criterion) { + let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]); + let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]); + + let mut group = c.benchmark_group("threadpool with various number of threads"); + + for threads in THREAD_COUNTS.iter() { + group.throughput(Throughput::Bytes(*threads as u64)); + group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| { + b.iter(|| { + dot_parallel(vec_a.clone(), vec_b.clone(), *threads); + }); + }); + } + group.finish(); +} + +/// Benchmark the effects of over and underusing a thread pools thread capacity. +/// The thread pool will automatically choose the number of threads to use. +/// We will then run a custom number of jobs with that pool that may be smaller or larger +/// than the amount of threads the pool can offer. +fn pool_overusage(a: Arc>, b: Arc>, threads: usize) { + // automatically choose the number of threads + let mut pool = ThreadPool::new(); + // drop the handles used by each thread after its done + pool.drop_finished_handles(); + + // number of elements in each vector for each thread + let steps = a.len() / threads; + + for i in 0..threads { + // offset of the first element for the thread local vec + let chunk = i * steps; + // create a new strong reference to the vector + let aa = a.clone(); + let bb = b.clone(); + // launch a new thread + pool.enqueue(move || { + let a = &aa[chunk..(chunk + steps)]; + let b = &bb[chunk..(chunk + steps)]; + dot(a, b) + }); + } + + black_box( + // wait for the threads to finish + pool.join_all() + // iterate over the results and sum the parital dot products together + .into_iter() + .map(|r| r.unwrap()) + .reduce(|a, b| a + b), + ); +} + +/// Benchmark the effects of over and underusing a thread pools thread capacity. +/// The thread pool will automatically choose the number of threads to use. +/// We will then run a custom number of jobs with that pool that may be smaller or larger +/// than the amount of threads the pool can offer. +pub fn bench_overusage(c: &mut Criterion) { + let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]); + let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]); + + let mut group = c.benchmark_group("threadpool overusage"); + + for threads in THREAD_COUNTS.iter() { + group.throughput(Throughput::Bytes(*threads as u64)); + group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| { + b.iter(|| { + pool_overusage(vec_a.clone(), vec_b.clone(), *threads); + }); + }); + } + group.finish(); +} + +/// Benchmark the performance of a single thread used to calculate the dot product. +pub fn bench_single_threaded(c: &mut Criterion) { + let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]); + let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]); + + c.bench_function("single threaded", |s| { + s.iter(|| { + black_box(dot(&vec_a, &vec_b)); + }); + }); +} + +criterion_group!( + benches, + bench_single_threaded, + bench_threadpool, + bench_overusage +); +criterion_main!(benches); diff --git a/src/multithreading/mod.rs b/src/multithreading/mod.rs index 71dabb6..647ea48 100644 --- a/src/multithreading/mod.rs +++ b/src/multithreading/mod.rs @@ -26,7 +26,7 @@ //! This implementation is not fully platform independent. This is due to the usage of [`std::sync::atomic::AtomicUsize`]. //! This type is used to remove some locks from otherwise used [`std::sync::Mutex`] wrapping a [`usize`]. //! Note that atomic primitives are not available on all platforms but "can generally be relied upon existing" -//! (see: https://doc.rust-lang.org/std/sync/atomic/index.html). +//! (see: ). //! Additionally this implementation relies on using the `load` and `store` operations //! instead of using more comfortable ones like `fetch_add` in order to avoid unnecessary calls //! to `unwrap` or `expected` from [`std::sync::MutexGuard`]. @@ -86,7 +86,7 @@ fn get_default_thread_count() -> usize { /// This implementation is not fully platform independent. This is due to the usage of [`std::sync::atomic::AtomicUsize`]. /// This type is used to remove some locks from otherwise used [`std::sync::Mutex`] wrapping a [`usize`]. /// Note that atomic primitives are not available on all platforms but "can generally be relied upon existing" -/// (see: https://doc.rust-lang.org/std/sync/atomic/index.html). +/// (see: ). /// Additionally this implementation relies on using the `load` and `store` operations /// instead of using more comfortable one like `fetch_add` in order to avoid unnecessary calls /// to `unwrap` or `expected` from [`std::sync::MutexGuard`]. @@ -112,7 +112,7 @@ where /// number of currently running threads /// new implementation relies on atomic primitives to avoid locking and possible /// guard errors. Note that atomic primitives are not available on all platforms "can generally be relied upon existing" - /// (see: https://doc.rust-lang.org/std/sync/atomic/index.html). + /// (see: ). /// Also this implementation relies on using the `load` and `store` operations /// instead of using more comfortable one like `fetch_add` threads: Arc, @@ -173,7 +173,7 @@ where /// supplying a number of threads to great may negatively impact performance as the system may not /// be able to full fill the required needs /// # Memory usage - /// if `drop_handles` is set to [`Bool::false`] the pool will continue to store the handles of + /// if `drop_handles` is set to `false` the pool will continue to store the handles of /// launched threads. This causes memory consumption to rise over time as more and more /// threads are launched. pub fn with_threads_and_drop_handles(