Make Segmenter::score() slightly more efficient

This commit is contained in:
Dirkjan Ochtman 2020-11-25 13:10:13 +01:00
parent 540348f703
commit ea4438f2e8
1 changed file with 17 additions and 19 deletions

View File

@ -42,25 +42,23 @@ impl Segmenter {
}
/// Scores `word`, optionally conditioned on the word that precedes it.
///
/// When `previous` is given and both the `(previous, word)` entry in
/// `self.bigrams` and the `previous` entry in `self.unigrams` exist, the
/// result is the bigram value divided by `self.total` and then by the
/// unigram score of `previous`. In every other case the result is the
/// unigram score of `word`: its value in `self.unigrams` divided by
/// `self.total`, or a length-based penalty when `word` is not present.
fn score(&self, word: &str, previous: Option<&str>) -> f64 {
// NOTE(review): this listing looks like a diff with its +/- markers
// stripped -- the `match previous` form directly below appears to be the
// version being removed, and the `if let` form after it the replacement.
// Confirm against the repository before treating both as live code.
match previous {
None => match self.unigrams.get(word) {
// Probability of the given word
Some(p) => p / self.total,
// Penalize words not found in the unigrams according
// to their length, a crucial heuristic. Note that `word.len()`
// is the UTF-8 byte length, not the number of characters.
None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
},
Some(prev) => match (
self.bigrams.get(&(prev.into(), word.into())),
self.unigrams.get(prev),
) {
// Conditional probability of the word given the previous
// word. The technical name is "stupid backoff" and it's
// not a probability distribution but it works well in practice.
// `self.score(prev, None)` takes the unigram path, so the
// recursion is exactly one level deep.
(Some(pb), Some(_)) => pb / self.total / self.score(prev, None),
// Fall back to using the unigram probability
_ => self.score(word, None),
},
// Bigram path: only taken when a previous word was supplied, the bigram is
// known, and the previous word itself is a known unigram.
if let Some(prev) = previous {
if let Some(pb) = self.bigrams.get(&(prev.into(), word.into())) {
if self.unigrams.get(prev).is_some() {
// Conditional probability of the word given the previous
// word. The technical name is "stupid backoff" and it's
// not a probability distribution but it works well in practice.
// `self.score(prev, None)` skips this `if let` and evaluates
// only the unigram `match` below, so there is no deeper recursion.
return pb / self.total / self.score(prev, None);
}
}
}
// Unigram path: reached when there is no previous word or the bigram
// lookup above did not produce a result.
match self.unigrams.get(word) {
// Probability of the given word
Some(p) => p / self.total,
// Penalize words not found in the unigrams according
// to their length, a crucial heuristic. Note that `word.len()`
// is the UTF-8 byte length, not the number of characters.
None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
}
}