pr-ferrisgroup/benches/multithreading.rs

180 lines
6.3 KiB
Rust
Raw Normal View History

//! Benchmarking functionality for [Criterion.rs](https://github.com/bheisler/criterion.rs).
//! This benchmark compares the performance of thread pools launched with different maximum
//! numbers of threads.
//! Each task calculates a partial dot product of two different vectors composed of 1,000,000 64-bit
//! double precision floating point values.
use std::sync::Arc;
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use imsearch::multithreading::{Task, ThreadPool};
/// Number of elements in each of the two vectors whose dot product is benchmarked.
const VEC_ELEM_COUNT: usize = 1_000_000;
/// Values benchmarked one after another: the thread limit of the pool in the
/// thread pool benchmark, and the number of enqueued jobs in the overusage benchmark.
const THREAD_COUNTS: [usize; 17] = [
1, 2, 4, 6, 8, 10, 12, 16, 18, 20, 22, 26, 28, 32, 40, 56, 64,
];
/// Seeds, one per vector, that are added to the element index before it is hashed.
/// These are just some arbitrary pseudo-random numbers.
const VEC_SEEDS: [u64; 2] = [0xa3f8347abce16, 0xa273048ca9dea];
/// Computes the dot product of two equally long slices.
///
/// The element-wise products are accumulated front to back starting from `0.0`,
/// so a pair of empty slices yields `0.0`.
///
/// # Panics
/// Panics if `a` and `b` do not have exactly the same length.
fn dot(a: &[f64], b: &[f64]) -> f64 {
    // Check the contract explicitly: an index loop over `a` alone would only
    // notice a `b` that is too short and would silently ignore a longer one.
    assert_eq!(
        a.len(),
        b.len(),
        "dot: both vectors must have the same length"
    );
    // `fold` keeps the strict left-to-right summation order of a plain loop.
    a.iter().zip(b).fold(0.0, |sum, (x, y)| sum + x * y)
}
/// Computes the dot product of `a` and `b` on a thread pool limited to `threads` threads.
///
/// Both vectors are split into `threads` contiguous chunks of `a.len() / threads`
/// elements; the last chunk additionally takes the remainder of that division so
/// that every element contributes to the result. One task per chunk computes a
/// partial dot product. After all tasks have finished, the partial results are
/// summed; the sum is handed to `black_box` and then discarded.
///
/// # Panics
/// Panics if `threads` is zero, at the latest when the chunk size is computed
/// (integer division by zero).
fn dot_parallel(a: Arc<Vec<f64>>, b: Arc<Vec<f64>>, threads: usize) {
    let mut pool = ThreadPool::with_limit(threads);
    let len = a.len();
    // number of elements handled by every chunk except possibly the last one
    let chunk_len = len / threads;
    for i in 0..threads {
        // index of the first element of this chunk
        let start = i * chunk_len;
        // The last chunk runs to the end of the vector so the remainder of
        // `len / threads` is not dropped from the result.
        let end = if i + 1 == threads { len } else { start + chunk_len };
        // enqueue the task; it owns its own strong references to both vectors
        pool.enqueue(Task::new(
            (Arc::clone(&a), Arc::clone(&b), start, end),
            |(a, b, start, end)| dot(&a[start..end], &b[start..end]),
        ));
    }
    pool.join_all();
    black_box(pool.get_results().iter().sum::<f64>());
}
/// Computes a simple pseudo-random hash value for the given index value.
///
/// The result is the fractional part of a scaled sine, taken as `v - v.floor()`,
/// so it lies in `[0, 1]` whenever the intermediate value is finite.
/// (`f64::fract` keeps the sign of its operand and would yield values in
/// `(-1, 1)` here.) If the intermediate value is not finite the result is NaN.
#[inline]
fn hash(x: f64) -> f64 {
    let scaled = (x * 234.8743 + 3.8274).sin() * 87624.58376;
    // distance to the next lower integer; never negative, unlike `fract`
    scaled - scaled.floor()
}
/// Builds a shared vector holding `size` 64-bit floating point numbers.
///
/// The element at index `i` is `hash(i as f64 + seed as f64)`, i.e. the seed
/// offsets the index before it is hashed.
fn create_vec(size: usize, seed: u64) -> Arc<Vec<f64>> {
    let offset = seed as f64;
    let values: Vec<f64> = (0..size).map(|idx| hash(idx as f64 + offset)).collect();
    Arc::new(values)
}
/// Function for executing the thread pool benchmarks using criterion.rs.
/// It will create two different vectors and benchmark the single thread performance
/// as well as the multi threadded performance for varying amounts of threads.
pub fn bench_threadpool(c: &mut Criterion) {
let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]);
let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]);
let mut group = c.benchmark_group("threadpool with various number of threads");
for threads in THREAD_COUNTS.iter() {
group.throughput(Throughput::Bytes(*threads as u64));
group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| {
b.iter(|| {
dot_parallel(vec_a.clone(), vec_b.clone(), *threads);
});
});
}
group.finish();
}
/// Computes the dot product of `a` and `b` with `threads` jobs on a pool created
/// by `ThreadPool::new()`, i.e. without an explicit thread limit.
///
/// The number of jobs may therefore be smaller or larger than the number of
/// threads the pool offers. Both vectors are split into `threads` contiguous
/// chunks of `a.len() / threads` elements; the last chunk additionally takes the
/// remainder of that division so that every element contributes to the result.
/// After all jobs have finished, the partial results are summed; the sum is
/// handed to `black_box` and then discarded.
///
/// # Panics
/// Panics if `threads` is zero (integer division by zero when the chunk size is
/// computed).
fn pool_overusage(a: Arc<Vec<f64>>, b: Arc<Vec<f64>>, threads: usize) {
    // no explicit limit: the pool picks its own number of threads
    let mut pool = ThreadPool::new();
    let len = a.len();
    // number of elements handled by every job except possibly the last one
    let chunk_len = len / threads;
    for job in 0..threads {
        // index of the first element of this job's chunk
        let start = job * chunk_len;
        // The last job runs to the end of the vector so the remainder of
        // `len / threads` is not dropped from the result.
        let end = if job + 1 == threads { len } else { start + chunk_len };
        // enqueue the job; it owns its own strong references to both vectors
        pool.enqueue(Task::new(
            (Arc::clone(&a), Arc::clone(&b), start, end),
            |(a, b, start, end)| dot(&a[start..end], &b[start..end]),
        ));
    }
    pool.join_all();
    black_box(pool.get_results().iter().sum::<f64>());
}
/// Benchmark the effects of over and underusing a thread pools thread capacity.
/// The thread pool will automatically choose the number of threads to use.
/// We will then run a custom number of jobs with that pool that may be smaller or larger
/// than the amount of threads the pool can offer.
pub fn bench_overusage(c: &mut Criterion) {
let vec_a = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]);
let vec_b = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]);
let mut group = c.benchmark_group("threadpool overusage");
for threads in THREAD_COUNTS.iter() {
group.throughput(Throughput::Bytes(*threads as u64));
group.bench_with_input(BenchmarkId::from_parameter(threads), threads, |b, _| {
b.iter(|| {
pool_overusage(vec_a.clone(), vec_b.clone(), *threads);
});
});
}
group.finish();
}
/// Benchmarks `dot` called directly on the benchmarking thread, without any
/// thread pool, over two vectors of `VEC_ELEM_COUNT` elements.
pub fn bench_single_threaded(c: &mut Criterion) {
    let lhs = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[0]);
    let rhs = create_vec(VEC_ELEM_COUNT, VEC_SEEDS[1]);
    c.bench_function("single threaded", |bencher| {
        bencher.iter(|| {
            // keep the optimizer from discarding the otherwise unused result
            black_box(dot(lhs.as_slice(), rhs.as_slice()));
        });
    });
}
// Collect the three benchmark functions of this file into a Criterion group
// named `benches`.
criterion_group!(
benches,
bench_single_threaded,
bench_threadpool,
bench_overusage
);
// Hand the `benches` group to `criterion_main!`, which (per Criterion's
// documentation) generates the `main` function of the benchmark binary.
criterion_main!(benches);