From 29cafd0db1a3485f48d2f9365282eab954f6df0d Mon Sep 17 00:00:00 2001 From: David Hotham Date: Sat, 10 Feb 2024 12:30:24 +0000 Subject: [PATCH] tweak the non-word penalty --- instant-segment/src/lib.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/instant-segment/src/lib.rs b/instant-segment/src/lib.rs index 56fc6ca..74ba049 100644 --- a/instant-segment/src/lib.rs +++ b/instant-segment/src/lib.rs @@ -105,7 +105,21 @@ impl Segmenter { Some((uni, bi_scores)) => (uni, bi_scores), // Penalize words not found in the unigrams according // to their length, a crucial heuristic. - None => return 1.0 - self.uni_total_log10 - word.len() as f64, + // + // In the original presentation non-words are scored as + // + // (1.0 - self.uni_total_log10 - word_len) + // + // However in practice this seems to under-penalize long non-words. The intuition + // behind the variation used here is that it applies this penalty once for each word + // there "should" have been in the non-word's place. + // + // See [reference link missing from this copy of the patch]. + None => { + let word_len = word.len() as f64; + let word_count = word_len / 5.0; + return (1.0 - self.uni_total_log10 - word_len) * word_count; + } }; if let Some(prev) = previous {