From 98a673fea2ac8cbfb0cf947458604d1436b68974 Mon Sep 17 00:00:00 2001 From: Dirkjan Ochtman Date: Thu, 26 Nov 2020 16:47:49 +0100 Subject: [PATCH] Initial fully serial implementation --- .gitignore | 1 + Cargo.lock | 154 ++++++++++++++ Cargo.toml | 10 + src/lib.rs | 595 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 760 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/lib.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..1a3cabc --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,154 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +[[package]] +name = "ahash" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "865f8b0b3fced577b7df82e9b0eb7609595d7209c0b39e78d0646672e244b1b1" +dependencies = [ + "getrandom 0.2.0", + "lazy_static", + "version_check", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "getrandom" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee8025cf36f917e6a52cce185b7c7177689b838b7ec138364e50cc2277a56cf4" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hinasmawo" +version = "0.1.0" +dependencies = [ + "ahash", + "ordered-float", + "rand", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "ordered-float" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.15", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", + "rand_pcg", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.15", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core", +] + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..fd4d6d0 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "hinasmawo" +version = "0.1.0" +authors = ["Dirkjan Ochtman "] +edition = "2018" + +[dependencies] +ahash = "0.6.1" +rand = { version = "0.7.3", features = ["small_rng"] } +ordered-float = "2.0" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..2126955 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,595 @@ +use std::cmp::{max, Ordering}; +use std::ops::{Index, IndexMut}; + +use ahash::AHashSet as HashSet; +use ordered_float::OrderedFloat; +use rand::rngs::SmallRng; +use rand::{RngCore, SeedableRng}; + +pub struct Hnsw

{ + ef_construction: usize, + points: Vec

, + zero: Vec, + layers: Vec>, + rng: SmallRng, +} + +impl

Hnsw

+where + P: Point, +{ + pub fn new(ef_construction: usize) -> Self { + Self { + ef_construction, + points: Vec::new(), + zero: Vec::new(), + layers: Vec::new(), + rng: SmallRng::from_entropy(), + } + } + + /// Insert a point into the `Hnsw`, returning a `PointId` + /// + /// `PointId` implements `Hash`, `Eq` and friends, so it can be linked to some value. + pub fn insert(&mut self, point: P, search: &mut Search) -> PointId { + let layer = self.rng.next_u32() as f32 / u32::MAX as f32; + let layer = LayerId((-(layer.ln() * (M as f32).ln())).floor() as usize); + self.insert_at(point, layer, search) + } + + /// Deterministic implementation of insertion that takes the `layer` as an argument + /// + /// Implements the paper's algorithm 1, although there is a slight difference in that + /// new elements are always inserted from their selected layer, rather than delaying the + /// addition of new layers until after the selection of a particular layer. + fn insert_at(&mut self, point: P, layer: LayerId, search: &mut Search) -> PointId { + let empty = self.points.is_empty(); + let pid = PointId(self.points.len()); + self.points.push(point); + + let top = LayerId(self.layers.len()); + if layer > top { + self.layers.resize_with(layer.0, Default::default); + } + + search.reset(1, top); + for cur in max(top, layer).descend() { + search.num = if cur <= layer { + self.ef_construction + } else { + 1 + }; + + // If this layer already existed, search it for the 1 nearest neighbor + // (this roughly corresponds to the first loop in the paper's algorithm 1). + if cur <= top { + debug_assert_eq!(search.layer, cur); + + // At the first layer that already existed, insert the first element as an initial + // candidate. Because the zero-th layer always exists, also check if it was empty. + if cur == top && !empty { + search.push(NodeId(0), &self[pid], self); + } + + self.search_layer(cur, pid, search); + // If we're still above the layer to insert at, we're going to skip the + // insertion code below and continue to the next iteration. Before we do so, + // we update the `Search` so it's ready for the next layer coming up. + if cur > layer { + search.lower(self); + } + } + + // If we're above the layer to start inserting links at, skip the rest of this loop. + if cur > layer { + continue; + } + + if cur.is_zero() { + let nid = NodeId(self.zero.len()); + let mut node = ZeroNode { + nearest: Default::default(), + }; + self.link(cur, (nid, &mut node.nearest), &search.nearest); + self.zero.push(node); + } else { + let nid = NodeId(self.layers[cur.0 - 1].len()); + let lower = match cur.0 == 1 { + false => NodeId(self.layers[cur.0 - 2].len()), + true => NodeId(self.zero.len()), + }; + + let mut node = UpperNode { + pid, + lower, + nearest: Default::default(), + }; + + self.link(cur, (nid, &mut node.nearest), &search.nearest); + self.layers[cur.0 - 1].push(node); + } + + if search.layer == cur && !cur.is_zero() { + search.lower(self); + } + } + + pid + } + + /// Bidirectionally insert links between newly detected neighbors + /// + /// `layer` is the layer we're at; `new` contains the `NodeId` for the new `Node` (which has + /// not yet been added to the layer) and its still-empty list of nearest neighbors; `found` is + /// a slice containing the `Candidate`s found during searching (ordered from near to far). + /// + /// This just defers to the `Layer`'s `link()` implementation, which specializes on layer type. + fn link(&mut self, layer: LayerId, new: (NodeId, &mut [Option]), found: &[Candidate]) { + match layer.0 { + 0 => self.zero.link(new, found, &self.points), + l => self.layers[l - 1].link(new, found, &self.points), + } + } + + /// Search the given `layer` for neighbors closed to the point identified by `pid` + /// + /// This implements the outer loop of algorithm 2 from the paper, deferring the state mutation + /// in the inner loop to the `Search::push()` implementation. + fn search_layer(&self, layer: LayerId, pid: PointId, search: &mut Search) { + debug_assert_eq!(search.layer, layer); + let point = &self[pid]; + while let Some(candidate) = search.candidates.pop() { + if let Some(found) = search.nearest.last() { + if candidate.distance > found.distance { + break; + } + } + + let iter = match layer.0 { + 0 => self.zero[candidate.nid].nearest_iter(), + l => self.layers[l - 1][candidate.nid].nearest_iter(), + }; + + for nid in iter { + search.push(nid, point, self); + } + } + } +} + +/// Keeps mutable state for searching a point's nearest neighbors +/// +/// In particular, this contains most of the state used in algorithm 2. The structure is +/// initialized by using `push()` to add the initial enter points. +pub struct Search { + /// Nodes visited so far (`v` in the paper) + visited: HashSet, + /// Candidates for further inspection (`C` in the paper) + candidates: Vec, + /// Nearest neighbors found so far (`W` in the paper) + nearest: Vec, + /// Maximum number of nearest neighbors to retain (`ef` in the paper) + num: usize, + /// Current layer + layer: LayerId, +} + +impl Search { + /// Resets the state to be ready for a new search + fn reset(&mut self, num: usize, layer: LayerId) { + self.visited.clear(); + self.candidates.clear(); + self.nearest.clear(); + self.num = num; + self.layer = layer; + } + + /// Track node `nid` as a potential new neighbor for the given `point` + /// + /// Will immediately return if the node has been considered before. This implements + /// the inner loop from the paper's algorithm 2. + fn push(&mut self, nid: NodeId, point: &P, hnsw: &Hnsw

) { + if !self.visited.insert(nid) { + return; + } + + let pid = match self.layer.0 { + 0 => hnsw.zero.pid(nid), + l => hnsw.layers[l - 1].pid(nid), + }; + + let other = &hnsw[pid]; + let distance = OrderedFloat::from(point.distance(other)); + if self.nearest.len() >= self.num { + if let Some(found) = self.nearest.last() { + if distance > found.distance { + return; + } + } + } + + if self.nearest.len() > self.num { + self.nearest.pop(); + } + + let new = Candidate { distance, nid }; + let idx = self.candidates.binary_search(&new).unwrap_or_else(|e| e); + self.candidates.insert(idx, new); + + let idx = self.nearest.binary_search(&new).unwrap_or_else(|e| e); + self.nearest.insert(idx, new); + } + + /// Lower the search to the next lower level + /// + /// Resets `visited`, `candidates` to match `nearest`. + /// + /// Panics if called while the `Search` is at level 0. + fn lower(&mut self, hnsw: &Hnsw

) { + debug_assert!(!self.layer.is_zero()); + + self.nearest.truncate(self.num); // Limit size of the set of nearest neighbors + let old = hnsw.layers[self.layer.0 - 1].nodes(); + for cur in self.nearest.iter_mut() { + cur.nid = old[cur.nid].lower; + } + + // Re-initialize the `Search`: `nearest`, the output `W` from the last round, now becomes + // the set of enter points, which we use to initialize both `candidates` and `visited`. + self.layer = self.layer.lower(); + self.candidates.clear(); + self.candidates.extend(&self.nearest); + self.visited.clear(); + self.visited.extend(self.nearest.iter().map(|c| c.nid)); + } +} + +impl Default for Search { + fn default() -> Self { + Self { + visited: HashSet::new(), + candidates: Vec::new(), + nearest: Vec::new(), + layer: LayerId(0), + num: 1, + } + } +} + +impl

Index for Hnsw

{ + type Output = P; + + fn index(&self, index: PointId) -> &Self::Output { + &self.points[index.0] + } +} + +impl Index for [P] { + type Output = P; + + fn index(&self, index: PointId) -> &Self::Output { + &self[index.0] + } +} + +impl Index for Vec { + type Output = UpperNode; + + fn index(&self, index: NodeId) -> &Self::Output { + &self[index.0] + } +} + +impl IndexMut for Vec { + fn index_mut(&mut self, index: NodeId) -> &mut Self::Output { + &mut self[index.0] + } +} + +impl Index for [UpperNode] { + type Output = UpperNode; + + fn index(&self, index: NodeId) -> &Self::Output { + &self[index.0] + } +} + +impl Index for Vec { + type Output = ZeroNode; + + fn index(&self, index: NodeId) -> &Self::Output { + &self[index.0] + } +} + +impl IndexMut for Vec { + fn index_mut(&mut self, index: NodeId) -> &mut Self::Output { + &mut self[index.0] + } +} + +impl Index for [ZeroNode] { + type Output = ZeroNode; + + fn index(&self, index: NodeId) -> &Self::Output { + &self[index.0] + } +} + +impl Layer for Vec { + const LINKS: usize = M * 2; + + type Node = ZeroNode; + + fn pid(&self, nid: NodeId) -> PointId { + PointId(nid.0) + } + + fn nodes(&self) -> &[Self::Node] { + self + } + + fn nodes_mut(&mut self) -> &mut [Self::Node] { + self + } +} + +impl Layer for Vec { + const LINKS: usize = M; + + type Node = UpperNode; + + fn pid(&self, nid: NodeId) -> PointId { + self.nodes()[nid].pid + } + + fn nodes(&self) -> &[Self::Node] { + self + } + + fn nodes_mut(&mut self) -> &mut [Self::Node] { + self + } +} + +trait Layer { + const LINKS: usize; + + type Node: Node; + + fn pid(&self, nid: NodeId) -> PointId; + + fn nodes(&self) -> &[Self::Node]; + + fn nodes_mut(&mut self) -> &mut [Self::Node]; + + /// Bidirectionally insert links between newly detected neighbors + /// + /// `new` contains the `NodeId` for the new `Node` (which has not yet been added to the layer) + /// and its still-empty list of nearest neighbors; `found` is a slice containing all + /// `Candidate`s found during searching (ordered from near to far). + /// + /// Initializes both the new node's neighbors (in `new.1`) and updates the nearest neighbors + /// for the new node's neighbors if necessary. + fn link( + &mut self, + new: (NodeId, &mut [Option]), + found: &[Candidate], + points: &[P], + ) { + // Just make sure the candidates are all unique + debug_assert_eq!( + found.len(), + found.iter().map(|c| c.nid).collect::>().len() + ); + + // Only use the `Self::LINKS` nearest candidates found + for (i, candidate) in found.iter().take(Self::LINKS).enumerate() { + // `candidate` here is the new node's neighbor + let &Candidate { distance, nid } = candidate; + new.1[i] = Some(nid); // Update the new node's `nearest` + + let pid = self.pid(nid); + let old = &points[pid.0]; + let nearest = self.nodes()[nid.0].nearest(); + + // Find the correct index to insert at to keep the neighbor's neighbors sorted + let idx = nearest + .binary_search_by(|third| { + // `third` here is one of the neighbors of the new node's neighbor. + let third = match third { + Some(nid) => *nid, + // if `third` is `None`, our new `node` is always "closer" + None => return Ordering::Greater, + }; + + let pid = self.pid(third); + let third_distance = OrderedFloat::from(old.distance(&points[pid.0])); + distance.cmp(&third_distance) + }) + .unwrap_or_else(|e| e); + + // It might be possible for all the neighbor's current neighbors to be closer to our + // neighbor than to the new node, in which case we skip insertion of our new node's ID. + if idx >= nearest.len() { + continue; + } + + let nearest = self.nodes_mut()[nid.0].nearest_mut(); + if nearest[idx].is_none() { + nearest[idx] = Some(new.0); + continue; + } + + let end = Self::LINKS - 1; + nearest.copy_within(idx..end, idx + 1); + nearest[idx] = Some(new.0); + } + } +} + +#[derive(Debug)] +struct UpperNode { + /// This node's point + pid: PointId, + /// The point's node on the next level down + /// + /// This is only used when lowering the search. + lower: NodeId, + /// The nearest neighbors on this layer + /// + /// This is always kept in sorted order (near to far). + nearest: [Option; M], +} + +impl Node for UpperNode { + fn nearest(&self) -> &[Option] { + &self.nearest + } + + fn nearest_mut(&mut self) -> &mut [Option] { + &mut self.nearest + } + + fn nearest_iter(&self) -> NearestIter<'_> { + NearestIter { + nearest: &self.nearest, + } + } +} + +#[derive(Debug)] +struct ZeroNode { + /// The nearest neighbors on this layer + /// + /// This is always kept in sorted order (near to far). + nearest: [Option; M * 2], +} + +impl Node for ZeroNode { + fn nearest(&self) -> &[Option] { + &self.nearest + } + + fn nearest_mut(&mut self) -> &mut [Option] { + &mut self.nearest + } + + fn nearest_iter(&self) -> NearestIter<'_> { + NearestIter { + nearest: &self.nearest, + } + } +} + +trait Node { + fn nearest(&self) -> &[Option]; + fn nearest_mut(&mut self) -> &mut [Option]; + fn nearest_iter(&self) -> NearestIter<'_>; +} + +struct NearestIter<'a> { + nearest: &'a [Option], +} + +impl<'a> Iterator for NearestIter<'a> { + type Item = NodeId; + + fn next(&mut self) -> Option { + let (&first, rest) = self.nearest.split_first()?; + self.nearest = rest; + if first.is_none() { + self.nearest = &[]; + } + first + } +} + +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +struct LayerId(usize); + +impl LayerId { + /// Return a `LayerId` for the layer one lower + /// + /// Panics when called for `LayerId(0)`. + fn lower(&self) -> LayerId { + LayerId(self.0 - 1) + } + + fn descend(&self) -> DescendingLayerIter { + DescendingLayerIter { next: Some(self.0) } + } + + fn is_zero(&self) -> bool { + self.0 == 0 + } +} + +struct DescendingLayerIter { + next: Option, +} + +impl Iterator for DescendingLayerIter { + type Item = LayerId; + + fn next(&mut self) -> Option { + Some(LayerId(match self.next? { + 0 => { + self.next = None; + 0 + } + next => { + self.next = Some(next - 1); + next + } + })) + } +} + +pub trait Point { + fn distance(&self, other: &Self) -> f32; +} + +#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)] +struct Candidate { + distance: OrderedFloat, + nid: NodeId, +} + +/// References a node in a particular layer (usually the same layer) +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +struct NodeId(usize); + +/// References a `Point` in the `Hnsw` +/// +/// This can be used to index into the `Hnsw` to refer to the `Point` data. +#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] +pub struct PointId(usize); + +/// The parameter `M` from the paper +/// +/// This should become a generic argument to `Hnsw` when possible. +const M: usize = 6; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_insertion() { + let mut search = Search::default(); + let mut hnsw = Hnsw::new(100); + hnsw.insert(Point(0.1, 0.4), &mut search); + hnsw.insert(Point(-0.324, 0.543), &mut search); + hnsw.insert(Point(0.87, -0.33), &mut search); + hnsw.insert(Point(0.452, 0.932), &mut search); + } + + struct Point(f32, f32); + + impl super::Point for Point { + fn distance(&self, other: &Self) -> f32 { + ((self.0 - other.0).powi(2) + (self.1 - other.1).powi(2)).sqrt() + } + } +}