//! Benachmarking funcitonality for [Criterion.rs](https://github.com/bheisler/criterion.rs) //! This benchmark will compare the performance of various thread pools launched with different amounts of //! maximum threads. //! Each thread will calculate a partial dot product of two different vectors composed of 1,000,000 64-bit //! double precision floating point values. use std::sync::Arc; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use imsearch::multithreading::ThreadPool; /// Amount of elements per vector used to calculate the dot product const VEC_ELEM_COUNT: usize = 1_000_000; /// Number of threads to test const THREAD_COUNTS: [usize; 17] = [ 1, 2, 4, 6, 8, 10, 12, 16, 18, 20, 22, 26, 28, 32, 40, 56, 64, ]; /// seeds used to scramble up the values produced by the hash function for each vector /// these are just some pseudo random numbers const VEC_SEEDS: [u64; 2] = [0xa3f8347abce16, 0xa273048ca9dea]; /// Compute the dot product of two vectors /// # Panics /// this function assumes both vectors to be of exactly the same length. /// If this is not the case the function will panic. fn dot(a: &[f64], b: &[f64]) -> f64 { let mut sum = 0.0; for i in 0..a.len() { sum += a[i] * b[i]; } sum } /// Computes the dot product using a thread pool with varying number of threads. The vectors will be both splitted into equally /// sized slices which then get passed ot their own thread to compute the partial dot product. After all threads have /// finished the partial dot products will be summed to create the final result. 
fn dot_parallel(a: Arc>, b: Arc>, threads: usize) { let mut pool = ThreadPool::with_limit(threads); // number of elements in each vector for each thread let steps = a.len() / threads; for i in 0..threads { // offset of the first element for the thread local vec let chunk = i * steps; // create a new strong reference to the vector let aa = a.clone(); let bb = b.clone(); // launch a new thread pool.enqueue(move || { let a = &aa[chunk..(chunk + steps)]; let b = &bb[chunk..(chunk + steps)]; dot(a, b) }); } pool.join_all(); black_box(pool.get_results().iter().sum::()); } /// Compute a simple hash value for the given index value. /// This function will return a value between [0, 1]. #[inline] fn hash(x: f64) -> f64 { ((x * 234.8743 + 3.8274).sin() * 87624.58376).fract() } /// Create a vector filled with `size` elements of 64-bit floating point numbers /// each initialized with the function `hash` and the given seed. The vector will /// be filled with values between [0, 1]. fn create_vec(size: usize, seed: u64) -> Arc> { let mut vec = Vec::with_capacity(size); for i in 0..size { vec.push(hash(i as f64 + seed as f64)); } Arc::new(vec) } /// Function for executing the thread pool benchmarks using criterion.rs. /// It will create two different vectors and benchmark the single thread performance /// as well as the multi threadded performance for varying amounts of threads. pub fn bench_threadpool(c: &mut Criterion) { let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]); let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]); let mut group = c.benchmark_group("threadpool with various number of threads"); for threads in THREAD_COUNTS.iter() { group.throughput(Throughput::Bytes(*threads as u64)); group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| { b.iter(|| { dot_parallel(vec_a.clone(), vec_b.clone(), *threads); }); }); } group.finish(); } /// Benchmark the effects of over and underusing a thread pools thread capacity. 
/// The thread pool will automatically choose the number of threads to use.
/// We will then run a custom number of jobs with that pool that may be smaller or larger
/// than the amount of threads the pool can offer.
///
/// `threads` is the number of jobs enqueued. Both vectors are split into that many slices and
/// the slice handed to the last job also covers the elements that remain when the vector
/// length is not divisible by `threads`, so every element takes part in the result.
///
/// # Panics
/// Panics if `threads` is zero, because the slice length is computed by dividing by `threads`.
fn pool_overusage(a: Arc<Vec<f64>>, b: Arc<Vec<f64>>, threads: usize) {
    // automatically choose the number of threads
    let mut pool = ThreadPool::new();

    // number of elements in each vector for each job
    let steps = a.len() / threads;

    for i in 0..threads {
        // offset of the first element for the job local slice
        let start = i * steps;
        // the last job additionally takes the remainder of the division,
        // otherwise up to `threads - 1` trailing elements would be dropped
        let end = if i + 1 == threads {
            a.len()
        } else {
            start + steps
        };

        // create a new strong reference to the vector
        let aa = Arc::clone(&a);
        let bb = Arc::clone(&b);

        // hand the job to the pool
        pool.enqueue(move || dot(&aa[start..end], &bb[start..end]));
    }

    pool.join_all();

    // keep the optimizer from discarding the summed result
    black_box(pool.get_results().iter().sum::<f64>());
}

/// Benchmark the effects of over and underusing a thread pool's thread capacity.
/// The thread pool will automatically choose the number of threads to use.
/// We will then run a custom number of jobs with that pool that may be smaller or larger
/// than the amount of threads the pool can offer.
pub fn bench_overusage(c: &mut Criterion) {
    let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]);
    let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]);

    let mut group = c.benchmark_group("threadpool overusage");

    for threads in THREAD_COUNTS.iter() {
        // NOTE(review): this reports the job count as the number of bytes processed per
        // iteration; criterion expects the amount of data handled by one iteration.
        // Confirm whether a throughput based on `VEC_ELEM_COUNT` was intended.
        group.throughput(Throughput::Bytes(*threads as u64));
        group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| {
            b.iter(|| {
                // only the reference counts are bumped here, the vectors are not copied
                pool_overusage(Arc::clone(&vec_a), Arc::clone(&vec_b), *threads);
            });
        });
    }
    group.finish();
}

/// Benchmark the performance of a single thread used to calculate the dot product.
pub fn bench_single_threaded(c: &mut Criterion) {
    // build both operands once, outside of the measured closure
    let lhs = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]);
    let rhs = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]);

    c.bench_function("single threaded", |bencher| {
        bencher.iter(|| {
            let product = dot(lhs.as_slice(), rhs.as_slice());
            // keep the optimizer from discarding the computed product
            black_box(product);
        });
    });
}

// collect every benchmark function of this file into the `benches` group
criterion_group!(
    benches,
    bench_single_threaded,
    bench_threadpool,
    bench_overusage
);
// generate the `main` function that runs the `benches` group
criterion_main!(benches);