Use separate total value for bigrams
This commit is contained in:
parent
9addd3810b
commit
8f7959eeed
18
src/lib.rs
18
src/lib.rs
|
@ -15,7 +15,8 @@ pub mod test_data;
|
|||
/// Holds the unigram/bigram value tables together with the totals used to
/// normalize them and a `limit` setting.
///
/// NOTE(review): this listing looks like a rendered diff in which removed and
/// added lines are interleaved; `total` is presumably the pre-change field
/// that `uni_total` / `bi_total` replace — confirm against the real file.
pub struct Segmenter {
|
||||
// Value per word, keyed by the word itself; looked up by word when scoring.
unigrams: HashMap<String, f64>,
|
||||
// Value per pair of words.
// NOTE(review): presumably keyed as (previous word, word) — confirm.
bigrams: HashMap<(String, String), f64>,
|
||||
// Initialized in the constructor to the sum of all unigram values and
// overwritable through `set_total`.
total: f64,
|
||||
// Sum of all values in `unigrams`, computed once in the constructor.
uni_total: f64,
|
||||
// Sum of all values in `bigrams`, computed once in the constructor.
bi_total: f64,
|
||||
// Starts as `DEFAULT_LIMIT` and can be changed with `set_limit`.
// NOTE(review): what it bounds is not visible in this excerpt.
limit: usize,
|
||||
}
|
||||
|
||||
|
@ -42,12 +43,12 @@ impl Segmenter {
|
|||
unigrams: HashMap<String, f64>,
|
||||
bigrams: HashMap<(String, String), f64>,
|
||||
) -> Self {
|
||||
let total = unigrams.values().sum();
|
||||
Self {
|
||||
uni_total: unigrams.values().sum(),
|
||||
bi_total: bigrams.values().sum(),
|
||||
unigrams,
|
||||
bigrams,
|
||||
limit: DEFAULT_LIMIT,
|
||||
total,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -72,17 +73,17 @@ impl Segmenter {
|
|||
// Conditional probability of the word given the previous
|
||||
// word. The technical name is "stupid backoff" and it's
|
||||
// not a probability distribution but it works well in practice.
|
||||
return (bi / self.total) / (uni / self.total);
|
||||
return (bi / self.bi_total) / (uni / self.uni_total);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match self.unigrams.get(word) {
|
||||
// Probability of the given word
|
||||
Some(p) => p / self.total,
|
||||
Some(p) => p / self.uni_total,
|
||||
// Penalize words not found in the unigrams according
|
||||
// to their length, a crucial heuristic.
|
||||
None => 10.0 / (self.total * 10.0f64.powi(word.len() as i32)),
|
||||
None => 10.0 / (self.uni_total * 10.0f64.powi(word.len() as i32)),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -90,11 +91,6 @@ impl Segmenter {
|
|||
/// Sets `limit` to the given value, replacing whatever was stored before.
///
/// NOTE(review): what `limit` bounds is not visible in this excerpt —
/// presumably a cap applied during segmentation; confirm.
pub fn set_limit(&mut self, limit: usize) {
|
||||
self.limit = limit;
|
||||
}
|
||||
|
||||
/// Customize the relative score by setting the `total`
///
/// Overwrites the stored `total` with the given value; the value is stored
/// as-is, without any validation.
///
/// NOTE(review): `total` appears as a divisor in the scoring code, so a value
/// of `0.0` or NaN would propagate into the scores — confirm callers never
/// pass one. The surrounding hunk header (`-90,11 +91,6`) also suggests this
/// setter is removed by the commit shown here — confirm before relying on it.
|
||||
pub fn set_total(&mut self, total: f64) {
|
||||
self.total = total;
|
||||
}
|
||||
}
|
||||
|
||||
struct SegmentState<'a> {
|
||||
|
|
Loading…
Reference in New Issue