added compressed index array

This commit is contained in:
Sven Vogel 2023-04-30 18:13:07 +02:00
parent 8b685b2128
commit 41b8a5d533
1 changed file with 24 additions and 9 deletions

View File

@ -10,10 +10,9 @@ use jemalloc_ctl::{stats, epoch};
#[global_allocator] #[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
/// Only stores more efficiently when at least 50% of all elements are zeros
pub struct SparseVec { pub struct SparseVec {
// One stored value per entry; the entry at position k pairs with `indices[k]`.
values: Vec<f64>, values: Vec<f64>,
// Position information parallel to `values`. The old side (left) declares it as
// `usize`; the new side (right) narrows it to `u16`, and the generator in this
// commit fills it with the gap between consecutive positions rather than the
// position itself.
indices: Vec<usize>, indices: Vec<u16>,
} }
impl SparseVec { impl SparseVec {
@ -22,8 +21,9 @@ impl SparseVec {
let mut sum = 0.0; let mut sum = 0.0;
for index in 0..other.indices.len() { for index in 0..other.indices.len() {
let uncompressed_index = decompress_index(index, &self.indices);
// binary search the second vector for an element with the same index // binary search the second vector for an element with the same index
sum += binary_search(self.indices[index], &other.indices, &other.values) * self.values[index]; sum += binary_search(uncompressed_index, &other.indices, &other.values) * self.values[index];
} }
sum sum
@ -37,11 +37,15 @@ impl SparseVec {
let mut rng = rand::thread_rng(); let mut rng = rand::thread_rng();
let mut last_idx = 0;
for i in 0..non_zero_elements { for i in 0..non_zero_elements {
values.push(0.5); values.push(0.5);
let idx = i as f32 / non_zero_elements as f32 * (elements as f32 - 4.0) + rng.gen_range(0.0..3.0); let new_idx = i as f32 / non_zero_elements as f32 * (elements as f32 - 4.0) + rng.gen_range(0.0..3.0);
indices.push(idx as usize); let compressed_idx = new_idx as usize - last_idx;
last_idx = new_idx as usize;
indices.push(compressed_idx as u16);
} }
Self { Self {
@ -52,7 +56,16 @@ impl SparseVec {
} }
#[inline] #[inline]
fn binary_search(target: usize, indices: &[usize], values: &[f64]) -> f64 { fn decompress_index(index: usize, indices: &[u16]) -> usize {
// Returns the running total of `indices[0..=index]`, widened to usize.
// With gap-encoded entries this total is the uncompressed position of
// entry `index`.
// The loop touches every element up to and including `index`, so the work
// grows linearly with `index`; slice indexing panics if `index` is not a
// valid position in `indices`.
let mut sum = 0;
for i in 0..=index {
// Widen the u16 entry before accumulating.
sum += indices[i] as usize;
}
sum
}
#[inline]
fn binary_search(target: usize, indices: &[u16], values: &[f64]) -> f64 {
let mut range = 0..indices.len(); let mut range = 0..indices.len();
loop { loop {
let mut median = (range.end - range.start) >> 1; let mut median = (range.end - range.start) >> 1;
@ -61,11 +74,13 @@ fn binary_search(target: usize, indices: &[usize], values: &[f64]) -> f64 {
} }
median += range.start; median += range.start;
if indices[median] == target { let index = decompress_index(median, indices);
if index == target {
return values[median]; return values[median];
} }
if indices[median] > target { if index > target {
range.end = median; range.end = median;
} else { } else {
range.start = median; range.start = median;
@ -94,7 +109,7 @@ fn main() {
const NULL_NON_NULL_RATIO: f64 = 0.02; const NULL_NON_NULL_RATIO: f64 = 0.02;
let non_zero_elements = (VECTOR_SIZE as f64 * NULL_NON_NULL_RATIO) as usize; let non_zero_elements = (VECTOR_SIZE as f64 * NULL_NON_NULL_RATIO) as usize;
let heap_element_size = std::mem::size_of::<f64>() + std::mem::size_of::<usize>(); let heap_element_size = std::mem::size_of::<f64>() + std::mem::size_of::<u16>();
println!("Estimated size on heap: {}", ByteSize::b((non_zero_elements * heap_element_size) as u64)); println!("Estimated size on heap: {}", ByteSize::b((non_zero_elements * heap_element_size) as u64));
println!("Size on stack: {} B", std::mem::size_of::<SparseVec>()); println!("Size on stack: {} B", std::mem::size_of::<SparseVec>());