From 85f4f94b539ce38499e7a1173f7b3dd185f4d593 Mon Sep 17 00:00:00 2001
From: Dirkjan Ochtman <dirkjan@ochtman.nl>
Date: Thu, 27 May 2021 13:50:23 +0200
Subject: [PATCH] Use more efficient segmentation strategy

Based on the triangular matrix approach as explained here:

https://towardsdatascience.com/fast-word-segmentation-for-noisy-text-2c2c41f9e8da

Use iteration rather than recursion, segmenting the input forwards
rather than backwards, and use a `Vec`-based memoization strategy
instead of relying on a `HashMap` of words. This version is about
4.8x faster, about 100 lines shorter, and should use much less memory.
---
 README.md                  |   4 +-
 instant-segment/src/lib.rs | 211 +++++++++----------------------------
 2 files changed, 50 insertions(+), 165 deletions(-)
diff --git a/README.md b/README.md
index fecc172..8123731 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Corpus][corpus], as described by Thorsten Brants and Alex Franz, and
 data **"may only be used for linguistic education and research"**, so for any
 other usage you should acquire a different data set.
 
-For the microbenchmark included in this repository, Instant Segment is ~17x
+For the microbenchmark included in this repository, Instant Segment is ~100x
 faster than the Python implementation. Further optimizations are planned -- see
 the [issues][issues]. The API has been carefully constructed so that multiple
 segmentations can share the underlying state to allow parallel usage.
@@ -110,4 +110,4 @@ make test-python
 [corpus]:
   http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
 [distributed]: https://catalog.ldc.upenn.edu/LDC2006T13
-[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
\ No newline at end of file
+[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
diff --git a/instant-segment/src/lib.rs b/instant-segment/src/lib.rs
index 46ae92c..8ac45ad 100644
--- a/instant-segment/src/lib.rs
+++ b/instant-segment/src/lib.rs
@@ -1,4 +1,4 @@
-use std::ops::{BitOrAssign, Index, Range};
+use std::ops::{Index, Range};
 use std::str;
 
 #[cfg(feature = "serde")]
@@ -112,96 +112,70 @@ struct SegmentState<'a> {
     data: &'a Segmenter,
     text: Ascii<'a>,
     search: &'a mut Search,
-    offset: usize,
 }
 
 impl<'a> SegmentState<'a> {
     fn new(text: Ascii<'a>, data: &'a Segmenter, search: &'a mut Search) -> Self {
         search.clear();
-        Self {
-            data,
-            text,
-            search,
-            offset: 0,
-        }
+        Self { data, text, search }
     }
 
-    /// Returns a list of words that is the best segmentation of `text`
-    fn run(mut self) {
-        let (mut start, mut end) = (0, 0);
-        while end < self.text.len() {
-            end = self.text.len().min(end + SEGMENT_SIZE);
-            self.offset = start;
-            self.search(0, start..end, None);
+    fn run(self) {
+        for end in 1..=self.text.len() {
+            let start = end.saturating_sub(self.data.limit);
+            for split in start..end {
+                let (prev, prev_score) = match split {
+                    0 => (None, 0.0),
+                    _ => {
+                        let prefix = self.search.candidates[split - 1];
+                        let word = &self.text[split - prefix.len as usize..split];
+                        (Some(word), prefix.score)
+                    }
+                };
 
-            let mut limit = usize::MAX;
-            if end < self.text.len() {
-                limit = 5;
-            }
-
-            for split in self.search.best[0].decode(self.offset).take(limit) {
-                self.search.result.push(self.text[start..split].into());
-                start = split;
-            }
-        }
-    }
-
-    /// Score `word` in the context of `previous` word
-    fn search(&mut self, level: usize, range: Range<usize>, previous: Option<Range<usize>>) -> f64 {
-        if range.is_empty() {
-            self.search.best[level].clear();
-            return 0.0;
-        }
-
-        let mut best = f64::MIN;
-        for split in 1..(range.len().min(self.data.limit) + 1) {
-            let (start, split, end) = (range.start, range.start + split, range.end);
-            let previous = previous.clone().map(|range| &self.text[range]);
-            let prefix_score = self.data.score(&self.text[start..split], previous);
-
-            let key = (
-                (start - self.offset) as u8,
-                (split - self.offset) as u8,
-                (end - self.offset) as u8,
-            );
-
-            let (suffix_score, suffix_splits) = match self.search.memo.get(&key) {
-                Some((score, suffix_splits)) => (*score, *suffix_splits),
-                None => {
-                    let suffix_score = self.search(level + 1, split..end, Some(start..split));
-                    let suffix_splits = self.search.best[level + 1];
-                    self.search.memo.insert(key, (suffix_score, suffix_splits));
-                    (suffix_score, suffix_splits)
+                let word = &self.text[split..end];
+                let score = self.data.score(word, prev) + prev_score;
+                match self.search.candidates.get_mut(end - 1) {
+                    Some(cur) if cur.score < score => {
+                        cur.len = end - split;
+                        cur.score = score;
+                    }
+                    None => self.search.candidates.push(Candidate {
+                        len: end - split,
+                        score,
+                    }),
+                    _ => {}
                 }
-            };
-
-            let score = prefix_score + suffix_score;
-            if score > best {
-                best = score;
-                let new_splits = &mut self.search.best[level];
-                new_splits.clear();
-                new_splits.set(split - self.offset);
-                *new_splits |= suffix_splits;
             }
         }
 
-        best
+        let mut end = self.text.len();
+        let mut best = self.search.candidates[end - 1];
+        loop {
+            let word = &self.text[end - best.len as usize..end];
+            self.search.result.push(word.into());
+
+            end -= best.len as usize;
+            if end == 0 {
+                break;
+            }
+
+            best = self.search.candidates[end - 1];
+        }
+
+        self.search.result.reverse();
     }
 }
 
 #[derive(Clone)]
 pub struct Search {
-    memo: HashMap<(u8, u8, u8), (f64, BitVec)>,
-    best: Box<[BitVec; SEGMENT_SIZE]>,
+    candidates: Vec<Candidate>,
     result: Vec<String>,
 }
 
 impl Search {
     fn clear(&mut self) {
-        self.memo.clear();
-        for inner in self.best.iter_mut() {
-            inner.clear();
-        }
+        self.candidates.clear();
         self.result.clear();
     }
 
@@ -214,75 +188,16 @@ impl Search {
 impl Default for Search {
     fn default() -> Self {
         Self {
-            memo: HashMap::default(),
-            best: Box::new([BitVec::default(); SEGMENT_SIZE]),
+            candidates: Vec::new(),
             result: Vec::new(),
         }
     }
 }
 
-#[derive(Clone, Copy, Default)]
-struct BitVec([u64; 4]);
-
-impl BitVec {
-    fn set(&mut self, mut bit: usize) {
-        debug_assert!(bit < 256);
-        let mut idx = 3;
-        while bit > 63 {
-            idx -= 1;
-            bit -= 64;
-        }
-        self.0[idx] |= 1 << bit;
-    }
-
-    fn clear(&mut self) {
-        self.0.iter_mut().for_each(|dst| {
-            *dst = 0;
-        });
-    }
-
-    fn decode(&self, offset: usize) -> Splits {
-        Splits {
-            vec: self.0,
-            offset,
-            idx: 3,
-        }
-    }
-}
-
-impl BitOrAssign for BitVec {
-    fn bitor_assign(&mut self, rhs: Self) {
-        self.0
-            .iter_mut()
-            .zip(rhs.0.iter())
-            .for_each(|(dst, src)| *dst |= *src);
-    }
-}
-
-struct Splits {
-    vec: [u64; 4],
-    offset: usize,
-    idx: usize,
-}
-
-impl Iterator for Splits {
-    type Item = usize;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        while self.idx > 0 && self.vec[self.idx] == 0 {
-            self.idx -= 1;
-        }
-
-        let cur = self.vec[self.idx];
-        if cur == 0 {
-            return None;
-        }
-
-        let trailing = cur.trailing_zeros();
-        let next = Some(self.offset + (3 - self.idx) * 64 + trailing as usize);
-        self.vec[self.idx] -= 1 << trailing;
-        next
-    }
+#[derive(Clone, Copy, Debug, Default)]
+struct Candidate {
+    len: usize,
+    score: f64,
 }
 
 #[derive(Debug)]
@@ -326,7 +241,6 @@ impl std::fmt::Display for InvalidCharacter {
 type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
 
 const DEFAULT_LIMIT: usize = 24;
-const SEGMENT_SIZE: usize = 250;
 
 #[cfg(test)]
 pub mod tests {
@@ -338,33 +252,4 @@ pub mod tests {
         let text = Ascii::new("cantbuymelove").unwrap();
         assert_eq!(&text[0..text.len()], "cantbuymelove");
     }
-
-    #[test]
-    fn bitvec() {
-        let mut splits = BitVec::default();
-        assert_eq!(splits.decode(0).collect::<Vec<_>>(), vec![]);
-
-        splits.set(1);
-        assert_eq!(splits.decode(0).collect::<Vec<_>>(), vec![1]);
-
-        splits.set(5);
-        assert_eq!(splits.decode(10).collect::<Vec<_>>(), vec![11, 15]);
-
-        splits.set(64);
-        assert_eq!(splits.decode(0).collect::<Vec<_>>(), vec![1, 5, 64]);
-
-        splits.set(255);
-        assert_eq!(splits.decode(0).collect::<Vec<_>>(), vec![1, 5, 64, 255]);
-
-        let mut new = BitVec::default();
-        new.set(3);
-        new.set(16);
-        new.set(128);
-
-        splits |= new;
-        assert_eq!(
-            splits.decode(0).collect::<Vec<_>>(),
-            vec![1, 3, 5, 16, 64, 128, 255]
-        );
-    }
 }