Make Segmenter::score() slightly more efficient
parent 540348f703
commit ea4438f2e8
src/lib.rs: 28 changed lines (+13, −15)
```diff
@@ -42,25 +42,23 @@ impl Segmenter {
     }
 
     fn score(&self, word: &str, previous: Option<&str>) -> f64 {
-        match previous {
-            None => match self.unigrams.get(word) {
-            // Probabibility of the given word
+        if let Some(prev) = previous {
+            if let Some(pb) = self.bigrams.get(&(prev.into(), word.into())) {
+                if self.unigrams.get(prev).is_some() {
+                    // Conditional probability of the word given the previous
+                    // word. The technical name is "stupid backoff" and it's
+                    // not a probability distribution but it works well in practice.
+                    return pb / self.total / self.score(prev, None);
+                }
+            }
+        }
+
+        match self.unigrams.get(word) {
+            // Probability of the given word
             Some(p) => p / self.total,
             // Penalize words not found in the unigrams according
             // to their length, a crucial heuristic.
             None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
-            },
-            Some(prev) => match (
-                self.bigrams.get(&(prev.into(), word.into())),
-                self.unigrams.get(prev),
-            ) {
-                // Conditional probability of the word given the previous
-                // word. The technical name is "stupid backoff" and it's
-                // not a probability distribution but it works well in practice.
-                (Some(pb), Some(_)) => pb / self.total / self.score(prev, None),
-                // Fall back to using the unigram probability
-                _ => self.score(word, None),
-            },
         }
     }
 }
```
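The efficiency gain: the old `Some(prev)` arm eagerly performed both map lookups, `self.bigrams.get(..)` and `self.unigrams.get(prev)`, on every call that had a previous word, just to build a tuple to match on. The rewritten early-return version only consults `unigrams` after the bigram lookup has already succeeded, and otherwise falls through to the plain unigram scoring. Below is a minimal, runnable sketch of the new logic under stated assumptions: the real `Segmenter`'s field types, construction, and counts are not part of this hunk, so the `String`-keyed `HashMap` fields and the toy numbers here are made up purely for illustration.

```rust
use std::collections::HashMap;

// Hypothetical stand-in for the real Segmenter; the actual field types
// are not visible in this hunk, so plain String-keyed maps are assumed.
struct Segmenter {
    unigrams: HashMap<String, f64>,
    bigrams: HashMap<(String, String), f64>,
    total: f64,
}

impl Segmenter {
    fn score(&self, word: &str, previous: Option<&str>) -> f64 {
        if let Some(prev) = previous {
            // The previous word's unigram count is only looked up once the
            // bigram lookup has succeeded; the old code always did both.
            if let Some(pb) = self.bigrams.get(&(prev.into(), word.into())) {
                if self.unigrams.get(prev).is_some() {
                    // "Stupid backoff": divide the bigram score by the
                    // previous word's unigram score.
                    return pb / self.total / self.score(prev, None);
                }
            }
        }

        match self.unigrams.get(word) {
            // Probability of the given word
            Some(p) => p / self.total,
            // Penalize words not found in the unigrams according
            // to their length, a crucial heuristic.
            None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
        }
    }
}

fn main() {
    let mut unigrams = HashMap::new();
    unigrams.insert("new".to_string(), 200.0);
    unigrams.insert("york".to_string(), 100.0);
    let mut bigrams = HashMap::new();
    bigrams.insert(("new".to_string(), "york".to_string()), 80.0);
    let seg = Segmenter { unigrams, bigrams, total: 1000.0 };

    // Bigram path: (80 / 1000) / (200 / 1000) = 0.4
    println!("{}", seg.score("york", Some("new")));
    // Unigram path: 100 / 1000 = 0.1
    println!("{}", seg.score("york", None));
    // Unknown-word path: 10 / (1000 * 10^3) = 0.00001
    println!("{}", seg.score("zzz", None));
}
```

With these made-up counts the sketch prints 0.4, 0.1, and 0.00001: the bigram path divides the pair score by the previous word's unigram score, and unknown words are penalized exponentially in their length, exactly as the comments in the diff describe.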