Make Segmenter::score() slightly more efficient

This commit is contained in:
Dirkjan Ochtman 2020-11-25 13:10:13 +01:00
parent 540348f703
commit ea4438f2e8
1 changed file with 17 additions and 19 deletions

View File

@@ -42,25 +42,23 @@ impl Segmenter {
     }
 
     fn score(&self, word: &str, previous: Option<&str>) -> f64 {
-        match previous {
-            None => match self.unigrams.get(word) {
-                // Probabibility of the given word
-                Some(p) => p / self.total,
-                // Penalize words not found in the unigrams according
-                // to their length, a crucial heuristic.
-                None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
-            },
-            Some(prev) => match (
-                self.bigrams.get(&(prev.into(), word.into())),
-                self.unigrams.get(prev),
-            ) {
-                // Conditional probability of the word given the previous
-                // word. The technical name is "stupid backoff" and it's
-                // not a probability distribution but it works well in practice.
-                (Some(pb), Some(_)) => pb / self.total / self.score(prev, None),
-                // Fall back to using the unigram probability
-                _ => self.score(word, None),
-            },
-        }
+        if let Some(prev) = previous {
+            if let Some(pb) = self.bigrams.get(&(prev.into(), word.into())) {
+                if self.unigrams.get(prev).is_some() {
+                    // Conditional probability of the word given the previous
+                    // word. The technical name is "stupid backoff" and it's
+                    // not a probability distribution but it works well in practice.
+                    return pb / self.total / self.score(prev, None);
+                }
+            }
+        }
+
+        match self.unigrams.get(word) {
+            // Probability of the given word
+            Some(p) => p / self.total,
+            // Penalize words not found in the unigrams according
+            // to their length, a crucial heuristic.
+            None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
+        }
     }
 