tweak the non-word penalty

2025-02-21 07:22:05 +00:00 · 2024-02-10 12:30:24 +00:00 · 2024-02-10 12:30:24 +00:00 · 29cafd0db1
commit 29cafd0db1
parent a63a57d94d
1 changed files with 15 additions and 1 deletions
--- a/instant-segment/src/lib.rs
+++ b/instant-segment/src/lib.rs
@ -105,7 +105,21 @@ impl Segmenter {
            Some((uni, bi_scores)) => (uni, bi_scores),
            // Penalize words not found in the unigrams according
            // to their length, a crucial heuristic.
-            None => return 1.0 - self.uni_total_log10 - word.len() as f64,
+            //
+            // In the original presentation non-words are scored as
+            //
+            //    (1.0 - self.uni_total_log10 - word_len)
+            //
+            // However, in practice this seems to under-penalize long non-words. The intuition
+            // behind the variation used here is that it applies this penalty once for each word
+            // there "should" have been in the non-word's place (estimated as `word_len / 5.0`).
+            //
+            // See <https://github.com/instant-labs/instant-segment/issues/53>.
+            None => {
+                let word_len = word.len() as f64;
+                let word_count = word_len / 5.0;
+                return (1.0 - self.uni_total_log10 - word_len) * word_count;
+            }
        };

        if let Some(prev) = previous {