From 85f4f94b539ce38499e7a1173f7b3dd185f4d593 Mon Sep 17 00:00:00 2001 From: Dirkjan Ochtman Date: Thu, 27 May 2021 13:50:23 +0200 Subject: [PATCH] Use more efficient segmentation strategy Based on the triangular matrix approach as explained here: https://towardsdatascience.com/fast-word-segmentation-for-noisy-text-2c2c41f9e8da Use iteration rather than recursion to segment the input forwards rather than backwards and use a `Vec`-based memoization strategy instead of relying on a `HashMap` of words. This version is about 4.8x faster, 100 lines of code less and should use much less memory. --- README.md | 4 +- instant-segment/src/lib.rs | 211 +++++++++---------------------------- 2 files changed, 50 insertions(+), 165 deletions(-) diff --git a/README.md b/README.md index fecc172..8123731 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Corpus][corpus], as described by Thorsten Brants and Alex Franz, and data **"may only be used for linguistic education and research"**, so for any other usage you should acquire a different data set. -For the microbenchmark included in this repository, Instant Segment is ~17x +For the microbenchmark included in this repository, Instant Segment is ~100x faster than the Python implementation. Further optimizations are planned -- see the [issues][issues]. The API has been carefully constructed so that multiple segmentations can share the underlying state to allow parallel usage. @@ -110,4 +110,4 @@ make test-python [corpus]: http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html [distributed]: https://catalog.ldc.upenn.edu/LDC2006T13 -[issues]: https://github.com/InstantDomainSearch/instant-segment/issues \ No newline at end of file +[issues]: https://github.com/InstantDomainSearch/instant-segment/issues diff --git a/instant-segment/src/lib.rs b/instant-segment/src/lib.rs index 46ae92c..8ac45ad 100644 --- a/instant-segment/src/lib.rs +++ b/instant-segment/src/lib.rs @@ -1,4 +1,4 @@ -use std::ops::{BitOrAssign, Index, Range}; +use std::ops::{Index, Range}; use std::str; #[cfg(feature = "serde")] @@ -112,96 +112,70 @@ struct SegmentState<'a> { data: &'a Segmenter, text: Ascii<'a>, search: &'a mut Search, - offset: usize, } impl<'a> SegmentState<'a> { fn new(text: Ascii<'a>, data: &'a Segmenter, search: &'a mut Search) -> Self { search.clear(); - Self { - data, - text, - search, - offset: 0, - } + Self { data, text, search } } - /// Returns a list of words that is the best segmentation of `text` - fn run(mut self) { - let (mut start, mut end) = (0, 0); - while end < self.text.len() { - end = self.text.len().min(end + SEGMENT_SIZE); - self.offset = start; - self.search(0, start..end, None); + fn run(self) { + for end in 1..=self.text.len() { + let start = end.saturating_sub(self.data.limit); + for split in start..end { + let (prev, prev_score) = match split { + 0 => (None, 0.0), + _ => { + let prefix = self.search.candidates[split - 1]; + let word = &self.text[split - prefix.len as usize..split]; + (Some(word), prefix.score) + } + }; - let mut limit = usize::MAX; - if end < self.text.len() { - limit = 5; - } - - for split in self.search.best[0].decode(self.offset).take(limit) { - self.search.result.push(self.text[start..split].into()); - start = split; - } - } - } - - /// Score `word` in the context of `previous` word - fn search(&mut self, level: usize, range: Range, previous: Option>) -> f64 { - if range.is_empty() { - self.search.best[level].clear(); - return 0.0; - } - - let mut best = f64::MIN; - for split in 1..(range.len().min(self.data.limit) + 1) { - let (start, split, end) = (range.start, range.start + split, range.end); - let previous = previous.clone().map(|range| &self.text[range]); - let prefix_score = self.data.score(&self.text[start..split], previous); - - let key = ( - (start - self.offset) as u8, - (split - self.offset) as u8, - (end - self.offset) as u8, - ); - - let (suffix_score, suffix_splits) = match self.search.memo.get(&key) { - Some((score, suffix_splits)) => (*score, *suffix_splits), - None => { - let suffix_score = self.search(level + 1, split..end, Some(start..split)); - let suffix_splits = self.search.best[level + 1]; - self.search.memo.insert(key, (suffix_score, suffix_splits)); - (suffix_score, suffix_splits) + let word = &self.text[split..end]; + let score = self.data.score(word, prev) + prev_score; + match self.search.candidates.get_mut(end - 1) { + Some(cur) if cur.score < score => { + cur.len = end - split; + cur.score = score; + } + None => self.search.candidates.push(Candidate { + len: end - split, + score, + }), + _ => {} } - }; - - let score = prefix_score + suffix_score; - if score > best { - best = score; - let new_splits = &mut self.search.best[level]; - new_splits.clear(); - new_splits.set(split - self.offset); - *new_splits |= suffix_splits; } } - best + let mut end = self.text.len(); + let mut best = self.search.candidates[end - 1]; + loop { + let word = &self.text[end - best.len as usize..end]; + self.search.result.push(word.into()); + + end -= best.len as usize; + if end == 0 { + break; + } + + best = self.search.candidates[end - 1]; + } + + self.search.result.reverse(); } } #[derive(Clone)] pub struct Search { - memo: HashMap<(u8, u8, u8), (f64, BitVec)>, - best: Box<[BitVec; SEGMENT_SIZE]>, + candidates: Vec, result: Vec, } impl Search { fn clear(&mut self) { - self.memo.clear(); - for inner in self.best.iter_mut() { - inner.clear(); - } + self.candidates.clear(); self.result.clear(); } @@ -214,75 +188,16 @@ impl Search { impl Default for Search { fn default() -> Self { Self { - memo: HashMap::default(), - best: Box::new([BitVec::default(); SEGMENT_SIZE]), + candidates: Vec::new(), result: Vec::new(), } } } -#[derive(Clone, Copy, Default)] -struct BitVec([u64; 4]); - -impl BitVec { - fn set(&mut self, mut bit: usize) { - debug_assert!(bit < 256); - let mut idx = 3; - while bit > 63 { - idx -= 1; - bit -= 64; - } - self.0[idx] |= 1 << bit; - } - - fn clear(&mut self) { - self.0.iter_mut().for_each(|dst| { - *dst = 0; - }); - } - - fn decode(&self, offset: usize) -> Splits { - Splits { - vec: self.0, - offset, - idx: 3, - } - } -} - -impl BitOrAssign for BitVec { - fn bitor_assign(&mut self, rhs: Self) { - self.0 - .iter_mut() - .zip(rhs.0.iter()) - .for_each(|(dst, src)| *dst |= *src); - } -} - -struct Splits { - vec: [u64; 4], - offset: usize, - idx: usize, -} - -impl Iterator for Splits { - type Item = usize; - - fn next(&mut self) -> Option { - while self.idx > 0 && self.vec[self.idx] == 0 { - self.idx -= 1; - } - - let cur = self.vec[self.idx]; - if cur == 0 { - return None; - } - - let trailing = cur.trailing_zeros(); - let next = Some(self.offset + (3 - self.idx) * 64 + trailing as usize); - self.vec[self.idx] -= 1 << trailing; - next - } +#[derive(Clone, Copy, Debug, Default)] +struct Candidate { + len: usize, + score: f64, } #[derive(Debug)] @@ -326,7 +241,6 @@ impl std::fmt::Display for InvalidCharacter { type HashMap = std::collections::HashMap; const DEFAULT_LIMIT: usize = 24; -const SEGMENT_SIZE: usize = 250; #[cfg(test)] pub mod tests { @@ -338,33 +252,4 @@ pub mod tests { let text = Ascii::new("cantbuymelove").unwrap(); assert_eq!(&text[0..text.len()], "cantbuymelove"); } - - #[test] - fn bitvec() { - let mut splits = BitVec::default(); - assert_eq!(splits.decode(0).collect::>(), vec![]); - - splits.set(1); - assert_eq!(splits.decode(0).collect::>(), vec![1]); - - splits.set(5); - assert_eq!(splits.decode(10).collect::>(), vec![11, 15]); - - splits.set(64); - assert_eq!(splits.decode(0).collect::>(), vec![1, 5, 64]); - - splits.set(255); - assert_eq!(splits.decode(0).collect::>(), vec![1, 5, 64, 255]); - - let mut new = BitVec::default(); - new.set(3); - new.set(16); - new.set(128); - - splits |= new; - assert_eq!( - splits.decode(0).collect::>(), - vec![1, 3, 5, 16, 64, 128, 255] - ); - } }