Make number of layers and sizes deterministic

This commit is contained in:
Dirkjan Ochtman 2021-02-10 17:28:18 +01:00
parent a99c61fee5
commit 2f7ddad3a2
2 changed files with 35 additions and 44 deletions

View File

@ -9,7 +9,7 @@ use indicatif::ProgressBar;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use parking_lot::{Mutex, RwLock}; use parking_lot::{Mutex, RwLock};
use rand::rngs::SmallRng; use rand::rngs::SmallRng;
use rand::SeedableRng; use rand::{Rng, SeedableRng};
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
#[cfg(feature = "serde")] #[cfg(feature = "serde")]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@ -155,49 +155,52 @@ where
); );
} }
// Determine the number and size of layers.
let mut sizes = Vec::new();
let mut num = points.len();
loop {
let next = (num as f32 * ml) as usize;
if next < M {
break;
}
sizes.push((num - next, num));
num = next;
}
sizes.push((num, num));
sizes.reverse();
// Give all points a random layer and sort the list of nodes by descending order for // Give all points a random layer and sort the list of nodes by descending order for
// construction. This allows us to copy higher layers to lower layers as construction // construction. This allows us to copy higher layers to lower layers as construction
// progresses, while preserving randomness in each point's layer and insertion order. // progresses, while preserving randomness in each point's layer and insertion order.
assert!(points.len() < u32::MAX as usize); assert!(points.len() < u32::MAX as usize);
let mut nodes = (0..points.len()) let mut shuffled = (0..points.len())
.map(|i| (LayerId::random(ml, &mut rng), i)) .map(|i| (PointId(rng.gen_range(0..points.len() as u32)), i))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
nodes.sort_unstable_by_key(|&n| Reverse(n)); shuffled.sort_unstable();
// Find out how many layers are needed, so that we can discard empty layers in the next
// step. Since layer IDs are randomly generated, there might be big gaps.
let (mut num_layers, mut prev) = (1, nodes[0].0);
for (layer, _) in nodes.iter() {
if *layer != prev {
num_layers += 1;
prev = *layer;
}
}
// Sort the original `points` in layer order.
// TODO: maybe optimize this? https://crates.io/crates/permutation
let mut cur_layer = LayerId(num_layers - 1);
let mut prev_layer = nodes[0].0;
let mut new_points = Vec::with_capacity(points.len()); let mut new_points = Vec::with_capacity(points.len());
let mut new_nodes = Vec::with_capacity(points.len()); let mut new_nodes = Vec::with_capacity(points.len());
let mut out = vec![INVALID; points.len()]; let mut out = vec![INVALID; points.len()];
for (i, &(layer, idx)) in nodes.iter().enumerate() { for (_, idx) in shuffled {
if prev_layer != layer { let pid = PointId(new_nodes.len() as u32);
cur_layer = LayerId(cur_layer.0 - 1); let layer = sizes
prev_layer = layer; .iter()
} .enumerate()
.find_map(|(i, &size)| match (pid.0 as usize) < size.1 {
true => Some(i),
false => None,
})
.unwrap();
let pid = PointId(i as u32);
new_points.push(points[idx].clone()); new_points.push(points[idx].clone());
new_nodes.push((cur_layer, pid)); new_nodes.push((LayerId(sizes.len() - layer - 1), pid));
out[idx] = pid; out[idx] = pid;
} }
let (points, nodes) = (new_points, new_nodes); let (points, nodes) = (new_points, new_nodes);
debug_assert_eq!(nodes.last().unwrap().0, LayerId(0)); debug_assert_eq!(nodes.last().unwrap().0, LayerId(0));
debug_assert_eq!(nodes.first().unwrap().0, LayerId(num_layers - 1)); debug_assert_eq!(nodes.first().unwrap().0, LayerId(sizes.len() - 1));
// The layer from the first node is our top layer, or the zero layer if we have no nodes. // The layer from the first node is our top layer, or the zero layer if we have no nodes.
@ -209,17 +212,12 @@ where
// Figure out how many nodes will go on each layer. This helps us allocate memory capacity // Figure out how many nodes will go on each layer. This helps us allocate memory capacity
// for each layer in advance, and also helps enable batch insertion of points. // for each layer in advance, and also helps enable batch insertion of points.
let mut sizes = vec![0; top.0 + 1]; let num_layers = sizes.len();
for (layer, _) in nodes.iter().copied() {
sizes[layer.0] += 1;
}
let mut start = 0;
let mut ranges = Vec::with_capacity(top.0); let mut ranges = Vec::with_capacity(top.0);
for (i, size) in sizes.into_iter().enumerate().rev() { for (i, (size, cumulative)) in sizes.into_iter().enumerate() {
let start = cumulative - size;
// Skip the first point, since we insert the enter point separately // Skip the first point, since we insert the enter point separately
ranges.push((LayerId(i), max(start, 1)..start + size)); ranges.push((LayerId(num_layers - i - 1), max(start, 1)..cumulative));
start += size;
} }
// Insert the first point so that we have an enter point to start searches with. // Insert the first point so that we have an enter point to start searches with.

View File

@ -3,8 +3,6 @@ use std::ops::{Deref, Index};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard}; use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
use rand::rngs::SmallRng;
use rand::Rng;
#[cfg(feature = "serde")] #[cfg(feature = "serde")]
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
#[cfg(feature = "serde-big-array")] #[cfg(feature = "serde-big-array")]
@ -200,11 +198,6 @@ where
pub(crate) struct LayerId(pub usize); pub(crate) struct LayerId(pub usize);
impl LayerId { impl LayerId {
pub(crate) fn random(ml: f32, rng: &mut SmallRng) -> Self {
let layer = rng.gen::<f32>();
LayerId((-layer.ln() * ml).floor() as usize)
}
pub(crate) fn descend(&self) -> impl Iterator<Item = LayerId> { pub(crate) fn descend(&self) -> impl Iterator<Item = LayerId> {
DescendingLayerIter { next: Some(self.0) } DescendingLayerIter { next: Some(self.0) }
} }