Use separate total value for bigrams

This commit is contained in:
Dirkjan Ochtman 2021-02-11 12:08:19 +01:00
parent 9addd3810b
commit 8f7959eeed
1 changed file with 7 additions and 11 deletions

View File

@ -15,7 +15,8 @@ pub mod test_data;
pub struct Segmenter { pub struct Segmenter {
unigrams: HashMap<String, f64>, unigrams: HashMap<String, f64>,
bigrams: HashMap<(String, String), f64>, bigrams: HashMap<(String, String), f64>,
total: f64, uni_total: f64,
bi_total: f64,
limit: usize, limit: usize,
} }
@ -42,12 +43,12 @@ impl Segmenter {
unigrams: HashMap<String, f64>, unigrams: HashMap<String, f64>,
bigrams: HashMap<(String, String), f64>, bigrams: HashMap<(String, String), f64>,
) -> Self { ) -> Self {
let total = unigrams.values().sum();
Self { Self {
uni_total: unigrams.values().sum(),
bi_total: bigrams.values().sum(),
unigrams, unigrams,
bigrams, bigrams,
limit: DEFAULT_LIMIT, limit: DEFAULT_LIMIT,
total,
} }
} }
@ -72,17 +73,17 @@ impl Segmenter {
// Conditional probability of the word given the previous // Conditional probability of the word given the previous
// word. The technical name is "stupid backoff" and it's // word. The technical name is "stupid backoff" and it's
// not a probability distribution but it works well in practice. // not a probability distribution but it works well in practice.
return (bi / self.total) / (uni / self.total); return (bi / self.bi_total) / (uni / self.uni_total);
} }
} }
} }
match self.unigrams.get(word) { match self.unigrams.get(word) {
// Probability of the given word // Probability of the given word
Some(p) => p / self.total, Some(p) => p / self.uni_total,
// Penalize words not found in the unigrams according // Penalize words not found in the unigrams according
// to their length, a crucial heuristic. // to their length, a crucial heuristic.
None => 10.0 / (self.total * 10.0f64.powi(word.len() as i32)), None => 10.0 / (self.uni_total * 10.0f64.powi(word.len() as i32)),
} }
} }
@ -90,11 +91,6 @@ impl Segmenter {
pub fn set_limit(&mut self, limit: usize) { pub fn set_limit(&mut self, limit: usize) {
self.limit = limit; self.limit = limit;
} }
/// Customize the relative score by setting the `total`
pub fn set_total(&mut self, total: f64) {
self.total = total;
}
} }
struct SegmentState<'a> { struct SegmentState<'a> {